Message ID | 80c66f55ac7f04b3ecd4ebf12d69d86c89480fa7.1603448113.git.viresh.kumar@linaro.org
---|---
State | New, archived
Delegated to | Daniel Lezcano
Series | cpufreq_cooling: Get effective CPU utilization from scheduler
On Fri, Oct 23, 2020 at 03:50:20PM +0530, Viresh Kumar wrote:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d2003a7d5ab5..369ff54d11d4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5117,6 +5117,119 @@ struct task_struct *idle_task(int cpu)
>  	return cpu_rq(cpu)->idle;
>  }
>
> +/*
> + * This function computes an effective utilization for the given CPU, to be
> + * used for frequency selection given the linear relation: f = u * f_max.
> + *
> + * The scheduler tracks the following metrics:
> + *
> + *   cpu_util_{cfs,rt,dl,irq}()
> + *   cpu_bw_dl()
> + *
> + * Where the cfs,rt and dl util numbers are tracked with the same metric and
> + * synchronized windows and are thus directly comparable.
> + *
> + * The cfs,rt,dl utilization are the running times measured with rq->clock_task
> + * which excludes things like IRQ and steal-time. These latter are then accrued
> + * in the irq utilization.
> + *
> + * The DL bandwidth number otoh is not a measured metric but a value computed
> + * based on the task model parameters and gives the minimal utilization
> + * required to meet deadlines.
> + */
> +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
> +				 unsigned long max, enum cpu_util_type type,
> +				 struct task_struct *p)
> +{

...

> +}
> +
> +unsigned long sched_cpu_util(int cpu, enum cpu_util_type type,
> +			     unsigned long max)
> +{
> +	return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, type,
> +				  NULL);
> +}

Shouldn't all that be: #ifdef CONFIG_SMP ?
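To make the suggestion concrete, here is a minimal sketch (not from the posted series) of what such a guard could look like in include/linux/sched.h; the !SMP stub and its return value are illustrative assumptions, not anything Peter or the patch proposed:

/* Hypothetical shape of the CONFIG_SMP guard; not the posted patch. */
#ifdef CONFIG_SMP
/* Returns effective CPU utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu, enum cpu_util_type type,
			     unsigned long max);
#else
static inline unsigned long sched_cpu_util(int cpu, enum cpu_util_type type,
					   unsigned long max)
{
	/* PELT metrics do not exist on !SMP; 0 is one possible fallback */
	return 0;
}
#endif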
On Fri, 23 Oct 2020 at 12:20, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> There is nothing schedutil specific in schedutil_cpu_util(), move it to
> core.c and rename it to sched_cpu_util(), so it can be used from other

I wonder if pelt.c would be a better place for this than core.c ?

> parts of the kernel as well.
>
> The cpufreq_cooling stuff will make use of this in a later commit.
>
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  include/linux/sched.h            |  19 +++++
>  kernel/sched/core.c              | 113 ++++++++++++++++++++++++++++++
>  kernel/sched/cpufreq_schedutil.c | 116 +------------------------------
>  kernel/sched/fair.c              |   6 +-
>  kernel/sched/sched.h             |  29 +-------
>  5 files changed, 140 insertions(+), 143 deletions(-)
>
> [...]
On 23-10-20, 12:34, Peter Zijlstra wrote:
> On Fri, Oct 23, 2020 at 03:50:20PM +0530, Viresh Kumar wrote:
> > [...]
>
> Shouldn't all that be: #ifdef CONFIG_SMP ?

I didn't realize that these metrics are only available in case of SMP
and that's why schedutil isn't available for !SMP. I wonder what we
should be doing in cpufreq_cooling now? Make it depend on SMP? Or
calculate load the traditional way (the stuff I just removed) for the
!SMP case?

:)
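For reference, the "traditional way" mentioned here is an idle-time based estimate; below is a rough sketch of that approach in the spirit of the pre-series cpufreq_cooling code. The function name and caller-side bookkeeping are illustrative, not the code this series removed:

/*
 * Illustrative only: percentage load derived from how much idle time
 * grew versus wall time over a sampling window.
 */
static u32 estimate_cpu_load(int cpu, u64 *prev_idle, u64 *prev_wall)
{
	u64 wall, idle, delta_idle, delta_wall;

	idle = get_cpu_idle_time(cpu, &wall, 0);
	delta_idle = idle - *prev_idle;
	delta_wall = wall - *prev_wall;
	*prev_idle = idle;
	*prev_wall = wall;

	/* Fully idle (or accounting skew): report no load */
	if (!delta_wall || delta_idle >= delta_wall)
		return 0;

	return div64_u64(100 * (delta_wall - delta_idle), delta_wall);
}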
On 23-10-20, 12:43, Vincent Guittot wrote:
> On Fri, 23 Oct 2020 at 12:20, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >
> > There is nothing schedutil specific in schedutil_cpu_util(), move it to
> > core.c and rename it to sched_cpu_util(), so it can be used from other
>
> I wonder if pelt.c would be a better place for this than core.c ?

Wherever you guys ask me to move it :)
On 10/23/20 11:54 AM, Viresh Kumar wrote:
> On 23-10-20, 12:34, Peter Zijlstra wrote:
>> On Fri, Oct 23, 2020 at 03:50:20PM +0530, Viresh Kumar wrote:
>> > [...]
>>
>> Shouldn't all that be: #ifdef CONFIG_SMP ?
>
> I didn't realize that these metrics are only available in case of SMP
> and that's why schedutil isn't available for !SMP. I wonder what we
> should be doing in cpufreq_cooling now? Make it depend on SMP? Or
> calculate load the traditional way (the stuff I just removed) for the
> !SMP case?

IMO the !SMP case can live with the old design, so keeping two
implementations under #ifdef CONFIG_SMP would be fair here, I'd say.
There are popular !SMP platforms (BeagleBone, RPi1, RPiZero), but I
haven't heard of anyone using IPA on them.

Regards,
Lukasz
On Fri, 23 Oct 2020 at 12:54, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> On 23-10-20, 12:34, Peter Zijlstra wrote:
> > [...]
> >
> > Shouldn't all that be: #ifdef CONFIG_SMP ?
>
> I didn't realize that these metrics are only available in case of SMP
> and that's why schedutil isn't available for !SMP. I wonder what we

Maybe it's time to make sched_util and pelt available for !SMP too.

With util_est and uclamp, I can see some benefits for !SMP compared to
ondemand.

> should be doing in cpufreq_cooling now? Make it depend on SMP? Or
> calculate load the traditional way (the stuff I just removed) for the
> !SMP case?
On 23-10-20, 14:34, Vincent Guittot wrote:
> Maybe it's time to make sched_util and pelt available for !SMP too.
>
> With util_est and uclamp, I can see some benefits for !SMP compared to
> ondemand.

That's a decision you guys (sched maintainers) need to make :)
There is nothing schedutil specific in schedutil_cpu_util(), move it to
core.c and rename it to sched_cpu_util(), so it can be used from other
parts of the kernel as well.

The cpufreq_cooling stuff will make use of this in a later commit.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/sched.h            |  19 +++++
 kernel/sched/core.c              | 113 ++++++++++++++++++++++++++++++
 kernel/sched/cpufreq_schedutil.c | 116 +------------------------------
 kernel/sched/fair.c              |   6 +-
 kernel/sched/sched.h             |  29 +-------
 5 files changed, 140 insertions(+), 143 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 393db0690101..3c27c10141cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1930,6 +1930,25 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+/**
+ * enum cpu_util_type - CPU utilization type
+ * @FREQUENCY_UTIL:	Utilization used to select frequency
+ * @ENERGY_UTIL:	Utilization used during energy calculation
+ *
+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
+ * need to be aggregated differently depending on the usage made of them. This
+ * enum is used within sched_cpu_util() to differentiate the types of
+ * utilization expected by the callers, and adjust the aggregation accordingly.
+ */
+enum cpu_util_type {
+	FREQUENCY_UTIL,
+	ENERGY_UTIL,
+};
+
+/* Returns effective CPU utilization, as seen by the scheduler */
+unsigned long sched_cpu_util(int cpu, enum cpu_util_type type,
+			     unsigned long max);
+
 #ifdef CONFIG_RSEQ
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2003a7d5ab5..369ff54d11d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5117,6 +5117,119 @@ struct task_struct *idle_task(int cpu)
 	return cpu_rq(cpu)->idle;
 }
 
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ *   cpu_util_{cfs,rt,dl,irq}()
+ *   cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+				 unsigned long max, enum cpu_util_type type,
+				 struct task_struct *p)
+{
+	unsigned long dl_util, util, irq;
+	struct rq *rq = cpu_rq(cpu);
+
+	if (!uclamp_is_used() &&
+	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+		return max;
+	}
+
+	/*
+	 * Early check to see if IRQ/steal time saturates the CPU, can be
+	 * because of inaccuracies in how we track these -- see
+	 * update_irq_load_avg().
+	 */
+	irq = cpu_util_irq(rq);
+	if (unlikely(irq >= max))
+		return max;
+
+	/*
+	 * Because the time spend on RT/DL tasks is visible as 'lost' time to
+	 * CFS tasks and we use the same metric to track the effective
+	 * utilization (PELT windows are synchronized) we can directly add them
+	 * to obtain the CPU's actual utilization.
+	 *
+	 * CFS and RT utilization can be boosted or capped, depending on
+	 * utilization clamp constraints requested by currently RUNNABLE
+	 * tasks.
+	 * When there are no CFS RUNNABLE tasks, clamps are released and
+	 * frequency will be gracefully reduced with the utilization decay.
+	 */
+	util = util_cfs + cpu_util_rt(rq);
+	if (type == FREQUENCY_UTIL)
+		util = uclamp_rq_util_with(rq, util, p);
+
+	dl_util = cpu_util_dl(rq);
+
+	/*
+	 * For frequency selection we do not make cpu_util_dl() a permanent part
+	 * of this sum because we want to use cpu_bw_dl() later on, but we need
+	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+	 * that we select f_max when there is no idle time.
+	 *
+	 * NOTE: numerical errors or stop class might cause us to not quite hit
+	 * saturation when we should -- something for later.
+	 */
+	if (util + dl_util >= max)
+		return max;
+
+	/*
+	 * OTOH, for energy computation we need the estimated running time, so
+	 * include util_dl and ignore dl_bw.
+	 */
+	if (type == ENERGY_UTIL)
+		util += dl_util;
+
+	/*
+	 * There is still idle time; further improve the number by using the
+	 * irq metric. Because IRQ/steal time is hidden from the task clock we
+	 * need to scale the task numbers:
+	 *
+	 *              max - irq
+	 *   U' = irq + --------- * U
+	 *                 max
+	 */
+	util = scale_irq_capacity(util, irq, max);
+	util += irq;
+
+	/*
+	 * Bandwidth required by DEADLINE must always be granted while, for
+	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+	 * to gracefully reduce the frequency when no tasks show up for longer
+	 * periods of time.
+	 *
+	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
+	 * an interface. So, we only do the latter for now.
+	 */
+	if (type == FREQUENCY_UTIL)
+		util += cpu_bw_dl(rq);
+
+	return min(max, util);
+}
+
+unsigned long sched_cpu_util(int cpu, enum cpu_util_type type,
+			     unsigned long max)
+{
+	return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, type,
+				  NULL);
+}
+
 /**
  * find_process_by_pid - find a process with a matching PID value.
  * @pid: the pid in question.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 5ae7b4e6e8d6..0c5c61a095f6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -169,122 +169,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- *   cpu_util_{cfs,rt,dl,irq}()
- *   cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the irq utilization.
- *
- * The DL bandwidth number otoh is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long max, enum schedutil_type type,
-				 struct task_struct *p)
-{
-	unsigned long dl_util, util, irq;
-	struct rq *rq = cpu_rq(cpu);
-
-	if (!uclamp_is_used() &&
-	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
-		return max;
-	}
-
-	/*
-	 * Early check to see if IRQ/steal time saturates the CPU, can be
-	 * because of inaccuracies in how we track these -- see
-	 * update_irq_load_avg().
-	 */
-	irq = cpu_util_irq(rq);
-	if (unlikely(irq >= max))
-		return max;
-
-	/*
-	 * Because the time spend on RT/DL tasks is visible as 'lost' time to
-	 * CFS tasks and we use the same metric to track the effective
-	 * utilization (PELT windows are synchronized) we can directly add them
-	 * to obtain the CPU's actual utilization.
-	 *
-	 * CFS and RT utilization can be boosted or capped, depending on
-	 * utilization clamp constraints requested by currently RUNNABLE
-	 * tasks.
-	 * When there are no CFS RUNNABLE tasks, clamps are released and
-	 * frequency will be gracefully reduced with the utilization decay.
-	 */
-	util = util_cfs + cpu_util_rt(rq);
-	if (type == FREQUENCY_UTIL)
-		util = uclamp_rq_util_with(rq, util, p);
-
-	dl_util = cpu_util_dl(rq);
-
-	/*
-	 * For frequency selection we do not make cpu_util_dl() a permanent part
-	 * of this sum because we want to use cpu_bw_dl() later on, but we need
-	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
-	 * that we select f_max when there is no idle time.
-	 *
-	 * NOTE: numerical errors or stop class might cause us to not quite hit
-	 * saturation when we should -- something for later.
-	 */
-	if (util + dl_util >= max)
-		return max;
-
-	/*
-	 * OTOH, for energy computation we need the estimated running time, so
-	 * include util_dl and ignore dl_bw.
-	 */
-	if (type == ENERGY_UTIL)
-		util += dl_util;
-
-	/*
-	 * There is still idle time; further improve the number by using the
-	 * irq metric. Because IRQ/steal time is hidden from the task clock we
-	 * need to scale the task numbers:
-	 *
-	 *              max - irq
-	 *   U' = irq + --------- * U
-	 *                 max
-	 */
-	util = scale_irq_capacity(util, irq, max);
-	util += irq;
-
-	/*
-	 * Bandwidth required by DEADLINE must always be granted while, for
-	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
-	 * to gracefully reduce the frequency when no tasks show up for longer
-	 * periods of time.
-	 *
-	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
-	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
-	 * an interface. So, we only do the latter for now.
-	 */
-	if (type == FREQUENCY_UTIL)
-		util += cpu_bw_dl(rq);
-
-	return min(max, util);
-}
-
 static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
 {
-	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util = cpu_util_cfs(rq);
-	unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
-
-	sg_cpu->max = max;
-	sg_cpu->bw_dl = cpu_bw_dl(rq);
+	sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
+	sg_cpu->bw_dl = cpu_bw_dl(cpu_rq(sg_cpu->cpu));
 
-	return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
+	return sched_cpu_util(sg_cpu->cpu, FREQUENCY_UTIL, sg_cpu->max);
 }
 
 /**
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aa4c6227cd6d..52e2d866e875 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6499,7 +6499,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		 * is already enough to scale the EM reported power
 		 * consumption at the (eventually clamped) cpu_capacity.
 		 */
-		sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+		sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
 					       ENERGY_UTIL, NULL);
 
 		/*
@@ -6509,7 +6509,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		 * NOTE: in case RT tasks are running, by default the
 		 * FREQUENCY_UTIL's utilization can be max OPP.
 		 */
-		cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+		cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
 					      FREQUENCY_UTIL, tsk);
 		max_util = max(max_util, cpu_util);
 	}
@@ -6607,7 +6607,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			 * IOW, placing the task there would make the CPU
 			 * overutilized. Take uclamp into account to see how
 			 * much capacity we can get out of the CPU; this is
-			 * aligned with schedutil_cpu_util().
+			 * aligned with sched_cpu_util().
 			 */
 			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
 			if (!fits_capacity(util, cpu_cap))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..0f0439344eec 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2486,25 +2486,8 @@ static inline unsigned long capacity_orig_of(int cpu)
 }
 #endif
 
-/**
- * enum schedutil_type - CPU utilization type
- * @FREQUENCY_UTIL:	Utilization used to select frequency
- * @ENERGY_UTIL:	Utilization used during energy calculation
- *
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within schedutil_freq_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
- */
-enum schedutil_type {
-	FREQUENCY_UTIL,
-	ENERGY_UTIL,
-};
-
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
-
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long max, enum schedutil_type type,
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+				 unsigned long max, enum cpu_util_type type,
 				 struct task_struct *p);
 
 static inline unsigned long cpu_bw_dl(struct rq *rq)
@@ -2533,14 +2516,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
 {
 	return READ_ONCE(rq->avg_rt.util_avg);
 }
-#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
-					       unsigned long max, enum schedutil_type type,
-					       struct task_struct *p)
-{
-	return 0;
-}
-#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
 
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 static inline unsigned long cpu_util_irq(struct rq *rq)
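For completeness, here is a sketch of how a consumer such as the later cpufreq_cooling change might use the new interface, applying the linear relation f = u * f_max from the comment above effective_cpu_util(). cpu_util_to_freq() is a hypothetical helper for illustration, not code from this series:

/*
 * Hypothetical consumer of sched_cpu_util(): map the effective
 * utilization of a CPU onto its frequency range via f = u / max * f_max.
 */
static unsigned int cpu_util_to_freq(int cpu, unsigned int freq_max)
{
	unsigned long max = arch_scale_cpu_capacity(cpu);
	unsigned long util = sched_cpu_util(cpu, FREQUENCY_UTIL, max);

	/* util is clamped to max by sched_cpu_util(), so f <= freq_max */
	return div_u64((u64)freq_max * util, max);
}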