diff mbox series

[v3,1/1] PM: EM: Add min/max available performance state limits

Message ID 20241030164126.1263793-2-lukasz.luba@arm.com (mailing list archive)
State In Next
Delegated to: Rafael Wysocki
Headers show
Series Update Energy Model with performance limits | expand

Commit Message

Lukasz Luba Oct. 30, 2024, 4:39 p.m. UTC
On some devices there are HW dependencies for shared frequency and voltage
between devices. It will impact Energy Aware Scheduler (EAS) decision,
where CPUs share the voltage & frequency domain with other CPUs or devices
e.g.
- Mid CPUs + Big CPU
- Little CPU + L3 cache in DSU
- some other device + Little CPUs

Detailed explanation of one example:
When the L3 cache frequency is increased, the affected Little CPUs might
run at higher voltage and frequency. That higher voltage causes higher CPU
power and thus more energy is used for running the tasks. This is
important for background running tasks, which try to run on energy
efficient CPUs.

Therefore, add performance state limits which are applied for the device
(in this case CPU). This is important on SoCs with HW dependencies
mentioned above so that the Energy Aware Scheduler (EAS) does not use
performance states outside the valid min-max range for energy calculation.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
---
 include/linux/energy_model.h | 29 ++++++++++++++------
 kernel/power/energy_model.c  | 52 ++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 8 deletions(-)

Comments

Rafael J. Wysocki Nov. 4, 2024, 10:02 p.m. UTC | #1
On Wed, Oct 30, 2024 at 5:40 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>
> On some devices there are HW dependencies for shared frequency and voltage
> between devices. It will impact Energy Aware Scheduler (EAS) decision,
> where CPUs share the voltage & frequency domain with other CPUs or devices
> e.g.
> - Mid CPUs + Big CPU
> - Little CPU + L3 cache in DSU
> - some other device + Little CPUs
>
> Detailed explanation of one example:
> When the L3 cache frequency is increased, the affected Little CPUs might
> run at higher voltage and frequency. That higher voltage causes higher CPU
> power and thus more energy is used for running the tasks. This is
> important for background running tasks, which try to run on energy
> efficient CPUs.
>
> Therefore, add performance state limits which are applied for the device
> (in this case CPU). This is important on SoCs with HW dependencies
> mentioned above so that the Energy Aware Scheduler (EAS) does not use
> performance states outside the valid min-max range for energy calculation.
>
> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
> ---
>  include/linux/energy_model.h | 29 ++++++++++++++------
>  kernel/power/energy_model.c  | 52 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 73 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
> index 1ff52020cf757..752e0b2975820 100644
> --- a/include/linux/energy_model.h
> +++ b/include/linux/energy_model.h
> @@ -55,6 +55,8 @@ struct em_perf_table {
>   * struct em_perf_domain - Performance domain
>   * @em_table:          Pointer to the runtime modifiable em_perf_table
>   * @nr_perf_states:    Number of performance states
> + * @min_perf_state:    Minimum allowed Performance State index
> + * @max_perf_state:    Maximum allowed Performance State index
>   * @flags:             See "em_perf_domain flags"
>   * @cpus:              Cpumask covering the CPUs of the domain. It's here
>   *                     for performance reasons to avoid potential cache
> @@ -70,6 +72,8 @@ struct em_perf_table {
>  struct em_perf_domain {
>         struct em_perf_table __rcu *em_table;
>         int nr_perf_states;
> +       int min_perf_state;
> +       int max_perf_state;
>         unsigned long flags;
>         unsigned long cpus[];
>  };
> @@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
>  int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
>                          int nr_states);
>  int em_dev_update_chip_binning(struct device *dev);
> +int em_update_performance_limits(struct em_perf_domain *pd,
> +               unsigned long freq_min_khz, unsigned long freq_max_khz);
>
>  /**
>   * em_pd_get_efficient_state() - Get an efficient performance state from the EM
>   * @table:             List of performance states, in ascending order
> - * @nr_perf_states:    Number of performance states
> + * @pd:                        performance domain for which this must be done
>   * @max_util:          Max utilization to map with the EM
> - * @pd_flags:          Performance Domain flags
>   *
>   * It is called from the scheduler code quite frequently and as a consequence
>   * doesn't implement any check.
> @@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
>   * requirement.
>   */
>  static inline int
> -em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
> -                         unsigned long max_util, unsigned long pd_flags)
> +em_pd_get_efficient_state(struct em_perf_state *table,
> +                         struct em_perf_domain *pd, unsigned long max_util)
>  {
> +       unsigned long pd_flags = pd->flags;
> +       int min_ps = pd->min_perf_state;
> +       int max_ps = pd->max_perf_state;
>         struct em_perf_state *ps;
>         int i;
>
> -       for (i = 0; i < nr_perf_states; i++) {
> +       for (i = min_ps; i <= max_ps; i++) {
>                 ps = &table[i];
>                 if (ps->performance >= max_util) {
>                         if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
> @@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
>                 }
>         }
>
> -       return nr_perf_states - 1;
> +       return max_ps;
>  }
>
>  /**
> @@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>          * requested performance.
>          */
>         em_table = rcu_dereference(pd->em_table);
> -       i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
> -                                     max_util, pd->flags);
> +       i = em_pd_get_efficient_state(em_table->state, pd, max_util);
>         ps = &em_table->state[i];
>
>         /*
> @@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
>  {
>         return -EINVAL;
>  }
> +static inline
> +int em_update_performance_limits(struct em_perf_domain *pd,
> +               unsigned long freq_min_khz, unsigned long freq_max_khz)
> +{
> +       return -EINVAL;
> +}
>  #endif
>
>  #endif
> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
> index 927cc55ba0b3d..d07faf42eace6 100644
> --- a/kernel/power/energy_model.c
> +++ b/kernel/power/energy_model.c
> @@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>                 goto unlock;
>
>         dev->em_pd->flags |= flags;
> +       dev->em_pd->min_perf_state = 0;
> +       dev->em_pd->max_perf_state = nr_states - 1;
>
>         em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
>
> @@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
>         return em_recalc_and_update(dev, pd, em_table);
>  }
>  EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
> +
> +
> +/**
> + * em_update_performance_limits() - Update Energy Model with performance
> + *                             limits information.
> + * @pd                 : Performance Domain with EM that has to be updated.
> + * @freq_min_khz       : New minimum allowed frequency for this device.
> + * @freq_max_khz       : New maximum allowed frequency for this device.
> + *
> + * This function allows to update the EM with information about available
> + * performance levels. It takes the minimum and maximum frequency in kHz
> + * and does internal translation to performance levels.
> + * Returns 0 on success or -EINVAL when failed.
> + */
> +int em_update_performance_limits(struct em_perf_domain *pd,
> +               unsigned long freq_min_khz, unsigned long freq_max_khz)
> +{
> +       struct em_perf_state *table;
> +       int min_ps = -1;
> +       int max_ps = -1;
> +       int i;
> +
> +       if (!pd)
> +               return -EINVAL;
> +
> +       rcu_read_lock();
> +       table = em_perf_state_from_pd(pd);
> +
> +       for (i = 0; i < pd->nr_perf_states; i++) {
> +               if (freq_min_khz == table[i].frequency)
> +                       min_ps = i;
> +               if (freq_max_khz == table[i].frequency)
> +                       max_ps = i;
> +       }
> +       rcu_read_unlock();
> +
> +       /* Only update when both are found and sane */
> +       if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
> +               return -EINVAL;
> +
> +
> +       /* Guard simultaneous updates and make them atomic */
> +       mutex_lock(&em_pd_mutex);
> +       pd->min_perf_state = min_ps;
> +       pd->max_perf_state = max_ps;
> +       mutex_unlock(&em_pd_mutex);
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(em_update_performance_limits);
> --

Applied as 6.13 material, thanks!
Lukasz Luba Nov. 5, 2024, 9:08 a.m. UTC | #2
On 11/4/24 22:02, Rafael J. Wysocki wrote:
> On Wed, Oct 30, 2024 at 5:40 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>>
>> On some devices there are HW dependencies for shared frequency and voltage
>> between devices. It will impact Energy Aware Scheduler (EAS) decision,
>> where CPUs share the voltage & frequency domain with other CPUs or devices
>> e.g.
>> - Mid CPUs + Big CPU
>> - Little CPU + L3 cache in DSU
>> - some other device + Little CPUs
>>
>> Detailed explanation of one example:
>> When the L3 cache frequency is increased, the affected Little CPUs might
>> run at higher voltage and frequency. That higher voltage causes higher CPU
>> power and thus more energy is used for running the tasks. This is
>> important for background running tasks, which try to run on energy
>> efficient CPUs.
>>
>> Therefore, add performance state limits which are applied for the device
>> (in this case CPU). This is important on SoCs with HW dependencies
>> mentioned above so that the Energy Aware Scheduler (EAS) does not use
>> performance states outside the valid min-max range for energy calculation.
>>
>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
>> ---
>>   include/linux/energy_model.h | 29 ++++++++++++++------
>>   kernel/power/energy_model.c  | 52 ++++++++++++++++++++++++++++++++++++
>>   2 files changed, 73 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
>> index 1ff52020cf757..752e0b2975820 100644
>> --- a/include/linux/energy_model.h
>> +++ b/include/linux/energy_model.h
>> @@ -55,6 +55,8 @@ struct em_perf_table {
>>    * struct em_perf_domain - Performance domain
>>    * @em_table:          Pointer to the runtime modifiable em_perf_table
>>    * @nr_perf_states:    Number of performance states
>> + * @min_perf_state:    Minimum allowed Performance State index
>> + * @max_perf_state:    Maximum allowed Performance State index
>>    * @flags:             See "em_perf_domain flags"
>>    * @cpus:              Cpumask covering the CPUs of the domain. It's here
>>    *                     for performance reasons to avoid potential cache
>> @@ -70,6 +72,8 @@ struct em_perf_table {
>>   struct em_perf_domain {
>>          struct em_perf_table __rcu *em_table;
>>          int nr_perf_states;
>> +       int min_perf_state;
>> +       int max_perf_state;
>>          unsigned long flags;
>>          unsigned long cpus[];
>>   };
>> @@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
>>   int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
>>                           int nr_states);
>>   int em_dev_update_chip_binning(struct device *dev);
>> +int em_update_performance_limits(struct em_perf_domain *pd,
>> +               unsigned long freq_min_khz, unsigned long freq_max_khz);
>>
>>   /**
>>    * em_pd_get_efficient_state() - Get an efficient performance state from the EM
>>    * @table:             List of performance states, in ascending order
>> - * @nr_perf_states:    Number of performance states
>> + * @pd:                        performance domain for which this must be done
>>    * @max_util:          Max utilization to map with the EM
>> - * @pd_flags:          Performance Domain flags
>>    *
>>    * It is called from the scheduler code quite frequently and as a consequence
>>    * doesn't implement any check.
>> @@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
>>    * requirement.
>>    */
>>   static inline int
>> -em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
>> -                         unsigned long max_util, unsigned long pd_flags)
>> +em_pd_get_efficient_state(struct em_perf_state *table,
>> +                         struct em_perf_domain *pd, unsigned long max_util)
>>   {
>> +       unsigned long pd_flags = pd->flags;
>> +       int min_ps = pd->min_perf_state;
>> +       int max_ps = pd->max_perf_state;
>>          struct em_perf_state *ps;
>>          int i;
>>
>> -       for (i = 0; i < nr_perf_states; i++) {
>> +       for (i = min_ps; i <= max_ps; i++) {
>>                  ps = &table[i];
>>                  if (ps->performance >= max_util) {
>>                          if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
>> @@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
>>                  }
>>          }
>>
>> -       return nr_perf_states - 1;
>> +       return max_ps;
>>   }
>>
>>   /**
>> @@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>>           * requested performance.
>>           */
>>          em_table = rcu_dereference(pd->em_table);
>> -       i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
>> -                                     max_util, pd->flags);
>> +       i = em_pd_get_efficient_state(em_table->state, pd, max_util);
>>          ps = &em_table->state[i];
>>
>>          /*
>> @@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
>>   {
>>          return -EINVAL;
>>   }
>> +static inline
>> +int em_update_performance_limits(struct em_perf_domain *pd,
>> +               unsigned long freq_min_khz, unsigned long freq_max_khz)
>> +{
>> +       return -EINVAL;
>> +}
>>   #endif
>>
>>   #endif
>> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
>> index 927cc55ba0b3d..d07faf42eace6 100644
>> --- a/kernel/power/energy_model.c
>> +++ b/kernel/power/energy_model.c
>> @@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>>                  goto unlock;
>>
>>          dev->em_pd->flags |= flags;
>> +       dev->em_pd->min_perf_state = 0;
>> +       dev->em_pd->max_perf_state = nr_states - 1;
>>
>>          em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
>>
>> @@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
>>          return em_recalc_and_update(dev, pd, em_table);
>>   }
>>   EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
>> +
>> +
>> +/**
>> + * em_update_performance_limits() - Update Energy Model with performance
>> + *                             limits information.
>> + * @pd                 : Performance Domain with EM that has to be updated.
>> + * @freq_min_khz       : New minimum allowed frequency for this device.
>> + * @freq_max_khz       : New maximum allowed frequency for this device.
>> + *
>> + * This function allows to update the EM with information about available
>> + * performance levels. It takes the minimum and maximum frequency in kHz
>> + * and does internal translation to performance levels.
>> + * Returns 0 on success or -EINVAL when failed.
>> + */
>> +int em_update_performance_limits(struct em_perf_domain *pd,
>> +               unsigned long freq_min_khz, unsigned long freq_max_khz)
>> +{
>> +       struct em_perf_state *table;
>> +       int min_ps = -1;
>> +       int max_ps = -1;
>> +       int i;
>> +
>> +       if (!pd)
>> +               return -EINVAL;
>> +
>> +       rcu_read_lock();
>> +       table = em_perf_state_from_pd(pd);
>> +
>> +       for (i = 0; i < pd->nr_perf_states; i++) {
>> +               if (freq_min_khz == table[i].frequency)
>> +                       min_ps = i;
>> +               if (freq_max_khz == table[i].frequency)
>> +                       max_ps = i;
>> +       }
>> +       rcu_read_unlock();
>> +
>> +       /* Only update when both are found and sane */
>> +       if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
>> +               return -EINVAL;
>> +
>> +
>> +       /* Guard simultaneous updates and make them atomic */
>> +       mutex_lock(&em_pd_mutex);
>> +       pd->min_perf_state = min_ps;
>> +       pd->max_perf_state = max_ps;
>> +       mutex_unlock(&em_pd_mutex);
>> +
>> +       return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(em_update_performance_limits);
>> --
> 
> Applied as 6.13 material, thanks!

Thank you Rafael!
diff mbox series

Patch

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 1ff52020cf757..752e0b2975820 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -55,6 +55,8 @@  struct em_perf_table {
  * struct em_perf_domain - Performance domain
  * @em_table:		Pointer to the runtime modifiable em_perf_table
  * @nr_perf_states:	Number of performance states
+ * @min_perf_state:	Minimum allowed Performance State index
+ * @max_perf_state:	Maximum allowed Performance State index
  * @flags:		See "em_perf_domain flags"
  * @cpus:		Cpumask covering the CPUs of the domain. It's here
  *			for performance reasons to avoid potential cache
@@ -70,6 +72,8 @@  struct em_perf_table {
 struct em_perf_domain {
 	struct em_perf_table __rcu *em_table;
 	int nr_perf_states;
+	int min_perf_state;
+	int max_perf_state;
 	unsigned long flags;
 	unsigned long cpus[];
 };
@@ -173,13 +177,14 @@  void em_table_free(struct em_perf_table __rcu *table);
 int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
 			 int nr_states);
 int em_dev_update_chip_binning(struct device *dev);
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz);
 
 /**
  * em_pd_get_efficient_state() - Get an efficient performance state from the EM
  * @table:		List of performance states, in ascending order
- * @nr_perf_states:	Number of performance states
+ * @pd:			performance domain for which this must be done
  * @max_util:		Max utilization to map with the EM
- * @pd_flags:		Performance Domain flags
  *
  * It is called from the scheduler code quite frequently and as a consequence
  * doesn't implement any check.
@@ -188,13 +193,16 @@  int em_dev_update_chip_binning(struct device *dev);
  * requirement.
  */
 static inline int
-em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
-			  unsigned long max_util, unsigned long pd_flags)
+em_pd_get_efficient_state(struct em_perf_state *table,
+			  struct em_perf_domain *pd, unsigned long max_util)
 {
+	unsigned long pd_flags = pd->flags;
+	int min_ps = pd->min_perf_state;
+	int max_ps = pd->max_perf_state;
 	struct em_perf_state *ps;
 	int i;
 
-	for (i = 0; i < nr_perf_states; i++) {
+	for (i = min_ps; i <= max_ps; i++) {
 		ps = &table[i];
 		if (ps->performance >= max_util) {
 			if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
@@ -204,7 +212,7 @@  em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
 		}
 	}
 
-	return nr_perf_states - 1;
+	return max_ps;
 }
 
 /**
@@ -253,8 +261,7 @@  static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * requested performance.
 	 */
 	em_table = rcu_dereference(pd->em_table);
-	i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
-				      max_util, pd->flags);
+	i = em_pd_get_efficient_state(em_table->state, pd, max_util);
 	ps = &em_table->state[i];
 
 	/*
@@ -391,6 +398,12 @@  static inline int em_dev_update_chip_binning(struct device *dev)
 {
 	return -EINVAL;
 }
+static inline
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 927cc55ba0b3d..d07faf42eace6 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -628,6 +628,8 @@  int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 		goto unlock;
 
 	dev->em_pd->flags |= flags;
+	dev->em_pd->min_perf_state = 0;
+	dev->em_pd->max_perf_state = nr_states - 1;
 
 	em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
 
@@ -856,3 +858,53 @@  int em_dev_update_chip_binning(struct device *dev)
 	return em_recalc_and_update(dev, pd, em_table);
 }
 EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ *				limits information.
+ * @pd			: Performance Domain with EM that has to be updated.
+ * @freq_min_khz	: New minimum allowed frequency for this device.
+ * @freq_max_khz	: New maximum allowed frequency for this device.
+ *
+ * This function allows to update the EM with information about available
+ * performance levels. It takes the minimum and maximum frequency in kHz
+ * and does internal translation to performance levels.
+ * Returns 0 on success or -EINVAL when failed.
+ */
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+	struct em_perf_state *table;
+	int min_ps = -1;
+	int max_ps = -1;
+	int i;
+
+	if (!pd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	table = em_perf_state_from_pd(pd);
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		if (freq_min_khz == table[i].frequency)
+			min_ps = i;
+		if (freq_max_khz == table[i].frequency)
+			max_ps = i;
+	}
+	rcu_read_unlock();
+
+	/* Only update when both are found and sane */
+	if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
+		return -EINVAL;
+
+
+	/* Guard simultaneous updates and make them atomic */
+	mutex_lock(&em_pd_mutex);
+	pd->min_perf_state = min_ps;
+	pd->max_perf_state = max_ps;
+	mutex_unlock(&em_pd_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(em_update_performance_limits);