diff mbox series

[3/3] thermal: cpu_cooling: Migrate to using the EM framework

Message ID 20190328101352.25657-4-quentin.perret@arm.com (mailing list archive)
State Not Applicable, archived
Headers show
Series cpu_cooling: Make IPA use PM_EM | expand

Commit Message

Quentin Perret March 28, 2019, 10:13 a.m. UTC
The newly introduced Energy Model framework manages power cost tables in
a generic way. Moreover, it supports several types of models since the
tables can come from DT or firmware (through SCMI) for example. On the
other hand, the cpu_cooling subsystem manages its own power cost tables
using only DT data.

In order to avoid the duplication of data in the kernel, and in order to
enable IPA with EMs coming from more than just DT, remove the private
tables from cpu_cooling.c and migrate it to using the centralized EM
framework.

The case where the thermal subsystem is used without an Energy Model
(cpufreq_cooling_ops) is handled by looking directly at CPUFreq's
frequency table which is already a dependency for cpu_cooling.c anyway.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
---
 drivers/thermal/cpu_cooling.c | 231 +++++++++++-----------------------
 1 file changed, 75 insertions(+), 156 deletions(-)

Comments

Daniel Lezcano March 28, 2019, 8:23 p.m. UTC | #1
On 28/03/2019 11:13, Quentin Perret wrote:
> The newly introduced Energy Model framework manages power cost tables in
> a generic way. Moreover, it supports a several types of models since the
> tables can come from DT or firmware (through SCMI) for example. On the
> other hand, the cpu_cooling subsystem manages its own power cost tables
> using only DT data.
> 
> In order to avoid the duplication of data in the kernel, and in order to
> enable IPA with EMs coming from more than just DT, remove the private
> tables from cpu_cooling.c and migrate it to using the centralized EM
> framework.
> 
> The case where the thermal subsystem is used without an Energy Model
> (cpufreq_cooling_ops) is handled by looking directly at CPUFreq's
> frequency table which is already a dependency for cpu_cooling.c anyway.
> 
> Signed-off-by: Quentin Perret <quentin.perret@arm.com>
> ---
>  drivers/thermal/cpu_cooling.c | 231 +++++++++++-----------------------
>  1 file changed, 75 insertions(+), 156 deletions(-)
> 
> diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
> index f7c1f49ec87f..a74ec8269b7b 100644
> --- a/drivers/thermal/cpu_cooling.c
> +++ b/drivers/thermal/cpu_cooling.c
> @@ -31,6 +31,7 @@
>  #include <linux/slab.h>
>  #include <linux/cpu.h>
>  #include <linux/cpu_cooling.h>
> +#include <linux/energy_model.h>
>  
>  #include <trace/events/thermal.h>
>  
> @@ -48,19 +49,6 @@
>   *	...
>   */
>  
> -/**
> - * struct freq_table - frequency table along with power entries
> - * @frequency:	frequency in KHz
> - * @power:	power in mW
> - *
> - * This structure is built when the cooling device registers and helps
> - * in translating frequency to power and vice versa.
> - */
> -struct freq_table {
> -	u32 frequency;
> -	u32 power;
> -};
> -
>  /**
>   * struct time_in_idle - Idle time stats
>   * @time: previous reading of the absolute time that this cpu was idle
> @@ -82,7 +70,7 @@ struct time_in_idle {
>   *	frequency.
>   * @max_level: maximum cooling level. One less than total number of valid
>   *	cpufreq frequencies.
> - * @freq_table: Freq table in descending order of frequencies
> + * @em: Reference on the Energy Model of the device
>   * @cdev: thermal_cooling_device pointer to keep track of the
>   *	registered cooling device.
>   * @policy: cpufreq policy.
> @@ -98,7 +86,7 @@ struct cpufreq_cooling_device {
>  	unsigned int cpufreq_state;
>  	unsigned int clipped_freq;
>  	unsigned int max_level;
> -	struct freq_table *freq_table;	/* In descending order */
> +	struct em_perf_domain *em;

Why do you need to add this field? It will be accessible via policy->em, no?

>  	struct thermal_cooling_device *cdev;
>  	struct cpufreq_policy *policy;
>  	struct list_head node;
> @@ -121,14 +109,14 @@ static LIST_HEAD(cpufreq_cdev_list);
>  static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
[ ... ]
Quentin Perret March 29, 2019, 9:16 a.m. UTC | #2
Hi Daniel,

On Thursday 28 Mar 2019 at 21:23:35 (+0100), Daniel Lezcano wrote:
> >  /**
> >   * struct time_in_idle - Idle time stats
> >   * @time: previous reading of the absolute time that this cpu was idle
> > @@ -82,7 +70,7 @@ struct time_in_idle {
> >   *	frequency.
> >   * @max_level: maximum cooling level. One less than total number of valid
> >   *	cpufreq frequencies.
> > - * @freq_table: Freq table in descending order of frequencies
> > + * @em: Reference on the Energy Model of the device
> >   * @cdev: thermal_cooling_device pointer to keep track of the
> >   *	registered cooling device.
> >   * @policy: cpufreq policy.
> > @@ -98,7 +86,7 @@ struct cpufreq_cooling_device {
> >  	unsigned int cpufreq_state;
> >  	unsigned int clipped_freq;
> >  	unsigned int max_level;
> > -	struct freq_table *freq_table;	/* In descending order */
> > +	struct em_perf_domain *em;
> 
> Why do you need to add this field? it will be accessible via policy->em, no?

You mean via the CPUFreq policy ? Then no, the EM isn't attached to the
CPUFreq policy. And we can't attach it directly to the CPUFreq policy
since in *theory* it is not required to map 1:1 to CPUFreq policies
(even though that _is_ true for all existing platforms). That's one of
the things this patch checks in that em_is_sane() function below.

FWIW, the idea of the design is, the EM framework is 'independent' and
it's up to the client subsystems (scheduler, IPA) to check if it actually
works for them. In the case of the scheduler, for example, we can't use
an EM that's too complex because that would cause too much overhead, so
we don't start EAS if that's not the case. See:

  https://elixir.bootlin.com/linux/latest/source/kernel/sched/topology.c#L367

In the case of IPA, we need to do something similar. We can't use an EM
that doesn't map 1:1 to CPUFreq policies, so we bail out if that's not
true, etc, ... This isn't supposed to trigger any time soon, but it's
good to have a check just to be on the safe side I think.

> 
> >  	struct thermal_cooling_device *cdev;
> >  	struct cpufreq_policy *policy;
> >  	struct list_head node;
> > @@ -121,14 +109,14 @@ static LIST_HEAD(cpufreq_cdev_list);
> >  static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,

Thanks,
Quentin
Daniel Lezcano March 29, 2019, 5:17 p.m. UTC | #3
On 29/03/2019 10:16, Quentin Perret wrote:
> Hi Daniel,
> 
> On Thursday 28 Mar 2019 at 21:23:35 (+0100), Daniel Lezcano wrote:
>>>  /**
>>>   * struct time_in_idle - Idle time stats
>>>   * @time: previous reading of the absolute time that this cpu was idle
>>> @@ -82,7 +70,7 @@ struct time_in_idle {
>>>   *	frequency.
>>>   * @max_level: maximum cooling level. One less than total number of valid
>>>   *	cpufreq frequencies.
>>> - * @freq_table: Freq table in descending order of frequencies
>>> + * @em: Reference on the Energy Model of the device
>>>   * @cdev: thermal_cooling_device pointer to keep track of the
>>>   *	registered cooling device.
>>>   * @policy: cpufreq policy.
>>> @@ -98,7 +86,7 @@ struct cpufreq_cooling_device {
>>>  	unsigned int cpufreq_state;
>>>  	unsigned int clipped_freq;
>>>  	unsigned int max_level;
>>> -	struct freq_table *freq_table;	/* In descending order */
>>> +	struct em_perf_domain *em;
>>
>> Why do you need to add this field? it will be accessible via policy->em, no?
> 
> You mean via the CPUFreq policy ? Then no, the EM isn't attached to the
> CPUFreq policy. And we can't attach it directly to the CPUFreq policy
> since in *theory* it is not required to map 1:1 to CPUFreq policies
> (even though that _is_ true for all existing platforms). That's one of
> the things this patch checks in that em_is_sane() function below.
> 
> FWIW, the idea of the design is, the EM framework is 'independent' and
> it's up to the client subsystems (scheduler, IPA) to check if it actually
> works for them. In the case of the scheduler, for example, we can't use
> an EM that's too complex because that would cause too much overhead, so
> we don't start EAS if that's not the case. See:
> 
>   https://elixir.bootlin.com/linux/latest/source/kernel/sched/topology.c#L367
> 
> In the case of IPA, we need to do something similar. We can't use an EM
> that doesn't map 1:1 to CPUFreq policies, so we bail out if that's not
> true, etc, ... This isn't supposed to trigger any time soon, but it's
> good to have a check just to be on the safe side I think.

Ok, makes sense. Thanks for the clarification.
Viresh Kumar April 10, 2019, 5:44 a.m. UTC | #4
On 28-03-19, 10:13, Quentin Perret wrote:
> +static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
> +			      unsigned long state)
> +{
> +	struct cpufreq_policy *policy;
> +	unsigned long idx;
> +
> +	/* Use the Energy Model table if available */
> +	if (cpufreq_cdev->em) {
> +		idx = cpufreq_cdev->max_level - state;
> +		return cpufreq_cdev->em->table[idx].frequency;
> +	}
> +
> +	/* Otherwise, fallback on the CPUFreq table */
> +	policy = cpufreq_cdev->policy;
> +	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)

It is not guaranteed that the frequency table is sorted in any order, is it?

> +		idx = cpufreq_cdev->max_level - state;
> +	else
> +		idx = state;
Quentin Perret April 10, 2019, 8:57 a.m. UTC | #5
On Wednesday 10 Apr 2019 at 11:14:49 (+0530), Viresh Kumar wrote:
> On 28-03-19, 10:13, Quentin Perret wrote:
> > +static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
> > +			      unsigned long state)
> > +{
> > +	struct cpufreq_policy *policy;
> > +	unsigned long idx;
> > +
> > +	/* Use the Energy Model table if available */
> > +	if (cpufreq_cdev->em) {
> > +		idx = cpufreq_cdev->max_level - state;
> > +		return cpufreq_cdev->em->table[idx].frequency;
> > +	}
> > +
> > +	/* Otherwise, fallback on the CPUFreq table */
> > +	policy = cpufreq_cdev->policy;
> > +	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
> 
> It is not guaranteed that the frequency table is sorted in any order, isn't it ?

Hmm, indeed... I thought cpufreq_table_validate_and_sort() was actively
sorting the table but it seems I was wrong.

But I _think_ in practice the freq table actually happens to be sorted
for the upstream cpufreq drivers with the CPUFREQ_IS_COOLING_DEV flag
set. Most of them use dev_pm_opp_init_cpufreq_table() which guarantees
the table is sorted and qoriq-cpufreq explicitly sorts the table. But
I'm not sure about qcom-cpufreq-hw ...

So, if the above is true, perhaps I could simply add a check to mandate
that policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_UNSORTED for
cpu_cooling ? That shouldn't harm the existing users.

Do you happen to know a board where the table is unsorted ? Is it a
common use-case ?

If yes, then I'll probably need to drop the dependency on cpufreq's
freq_table and use something else to convert indexes into frequencies
(PM_OPP ?). Unless we can force-sort the table in the cpufreq core, but
that might require lots of changes to lots of drivers too.

> 
> > +		idx = cpufreq_cdev->max_level - state;
> > +	else
> > +		idx = state;
> 
> -- 
> viresh

Thanks,
Quentin
Viresh Kumar April 10, 2019, 10:14 a.m. UTC | #6
On 10-04-19, 09:57, Quentin Perret wrote:
> Hmm, indeed... I thought cpufreq_table_validate_and_sort() was actively
> sorting the table but it seems I was wrong.
> 
> But I _think_ in practice the freq table actually happens to be sorted
> for the upstream cpufreq drivers with the CPUFREQ_IS_COOLING_DEV flag
> set. Most of them use dev_pm_opp_init_cpufreq_table() which guarantees
> the table is sorted and qoriq-cpufreq explicitly sorts the table. But
> I'm not sure about qcom-cpufreq-hw ...
> 
> So, if the above is true, perhaps I could simply add a check to mandate
> that policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_UNSORTED for
> cpu_cooling ? That shouldn't harm the existing users.

Right, I think most of the platforms will have it sorted anyway right now, but
you never know if one or two of them don't. Maybe just add the above conditional
and put out an error or WARN or something, so people know that something broke.
Quentin Perret April 10, 2019, 10:36 a.m. UTC | #7
On Wednesday 10 Apr 2019 at 15:44:23 (+0530), Viresh Kumar wrote:
> On 10-04-19, 09:57, Quentin Perret wrote:
> > Hmm, indeed... I thought cpufreq_table_validate_and_sort() was actively
> > sorting the table but it seems I was wrong.
> > 
> > But I _think_ in practice the freq table actually happens to be sorted
> > for the upstream cpufreq drivers with the CPUFREQ_IS_COOLING_DEV flag
> > set. Most of them use dev_pm_opp_init_cpufreq_table() which guarantees
> > the table is sorted and qoriq-cpufreq explicitly sorts the table. But
> > I'm not sure about qcom-cpufreq-hw ...
> > 
> > So, if the above is true, perhaps I could simply add a check to mandate
> > that policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_UNSORTED for
> > cpu_cooling ? That shouldn't harm the existing users.
> 
> Right, I think most of the platforms will have it sorted anyway right now, but
> you never know if one or two of them don't. Maybe just add the above conditional
> and put out an error or WARN or something, so people know that something broke.

Right, WARN + bail out should do it. I'll do the change in v2.

Thanks !
Quentin
diff mbox series

Patch

diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index f7c1f49ec87f..a74ec8269b7b 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -31,6 +31,7 @@ 
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/cpu_cooling.h>
+#include <linux/energy_model.h>
 
 #include <trace/events/thermal.h>
 
@@ -48,19 +49,6 @@ 
  *	...
  */
 
-/**
- * struct freq_table - frequency table along with power entries
- * @frequency:	frequency in KHz
- * @power:	power in mW
- *
- * This structure is built when the cooling device registers and helps
- * in translating frequency to power and vice versa.
- */
-struct freq_table {
-	u32 frequency;
-	u32 power;
-};
-
 /**
  * struct time_in_idle - Idle time stats
  * @time: previous reading of the absolute time that this cpu was idle
@@ -82,7 +70,7 @@  struct time_in_idle {
  *	frequency.
  * @max_level: maximum cooling level. One less than total number of valid
  *	cpufreq frequencies.
- * @freq_table: Freq table in descending order of frequencies
+ * @em: Reference on the Energy Model of the device
  * @cdev: thermal_cooling_device pointer to keep track of the
  *	registered cooling device.
  * @policy: cpufreq policy.
@@ -98,7 +86,7 @@  struct cpufreq_cooling_device {
 	unsigned int cpufreq_state;
 	unsigned int clipped_freq;
 	unsigned int max_level;
-	struct freq_table *freq_table;	/* In descending order */
+	struct em_perf_domain *em;
 	struct thermal_cooling_device *cdev;
 	struct cpufreq_policy *policy;
 	struct list_head node;
@@ -121,14 +109,14 @@  static LIST_HEAD(cpufreq_cdev_list);
 static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
 			       unsigned int freq)
 {
-	struct freq_table *freq_table = cpufreq_cdev->freq_table;
-	unsigned long level;
+	int i;
 
-	for (level = 1; level <= cpufreq_cdev->max_level; level++)
-		if (freq > freq_table[level].frequency)
+	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
+		if (freq > cpufreq_cdev->em->table[i].frequency)
 			break;
+	}
 
-	return level - 1;
+	return cpufreq_cdev->max_level - i - 1;
 }
 
 /**
@@ -184,105 +172,30 @@  static int cpufreq_thermal_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-/**
- * update_freq_table() - Update the freq table with power numbers
- * @cpufreq_cdev:	the cpufreq cooling device in which to update the table
- * @capacitance: dynamic power coefficient for these cpus
- *
- * Update the freq table with power numbers.  This table will be used in
- * cpu_power_to_freq() and cpu_freq_to_power() to convert between power and
- * frequency efficiently.  Power is stored in mW, frequency in KHz.  The
- * resulting table is in descending order.
- *
- * Return: 0 on success, -EINVAL if there are no OPPs for any CPUs,
- * or -ENOMEM if we run out of memory.
- */
-static int update_freq_table(struct cpufreq_cooling_device *cpufreq_cdev,
-			     u32 capacitance)
-{
-	struct freq_table *freq_table = cpufreq_cdev->freq_table;
-	struct dev_pm_opp *opp;
-	struct device *dev = NULL;
-	int num_opps = 0, cpu = cpufreq_cdev->policy->cpu, i;
-
-	dev = get_cpu_device(cpu);
-	if (unlikely(!dev)) {
-		dev_warn(&cpufreq_cdev->cdev->device,
-			 "No cpu device for cpu %d\n", cpu);
-		return -ENODEV;
-	}
-
-	num_opps = dev_pm_opp_get_opp_count(dev);
-	if (num_opps < 0)
-		return num_opps;
-
-	/*
-	 * The cpufreq table is also built from the OPP table and so the count
-	 * should match.
-	 */
-	if (num_opps != cpufreq_cdev->max_level + 1) {
-		dev_warn(dev, "Number of OPPs not matching with max_levels\n");
-		return -EINVAL;
-	}
-
-	for (i = 0; i <= cpufreq_cdev->max_level; i++) {
-		unsigned long freq = freq_table[i].frequency * 1000;
-		u32 freq_mhz = freq_table[i].frequency / 1000;
-		u64 power;
-		u32 voltage_mv;
-
-		/*
-		 * Find ceil frequency as 'freq' may be slightly lower than OPP
-		 * freq due to truncation while converting to kHz.
-		 */
-		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
-		if (IS_ERR(opp)) {
-			dev_err(dev, "failed to get opp for %lu frequency\n",
-				freq);
-			return -EINVAL;
-		}
-
-		voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
-		dev_pm_opp_put(opp);
-
-		/*
-		 * Do the multiplication with MHz and millivolt so as
-		 * to not overflow.
-		 */
-		power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
-		do_div(power, 1000000000);
-
-		/* power is stored in mW */
-		freq_table[i].power = power;
-	}
-
-	return 0;
-}
-
 static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 freq)
 {
 	int i;
-	struct freq_table *freq_table = cpufreq_cdev->freq_table;
 
-	for (i = 1; i <= cpufreq_cdev->max_level; i++)
-		if (freq > freq_table[i].frequency)
+	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
+		if (freq > cpufreq_cdev->em->table[i].frequency)
 			break;
+	}
 
-	return freq_table[i - 1].power;
+	return cpufreq_cdev->em->table[i + 1].power;
 }
 
 static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 power)
 {
 	int i;
-	struct freq_table *freq_table = cpufreq_cdev->freq_table;
 
-	for (i = 1; i <= cpufreq_cdev->max_level; i++)
-		if (power > freq_table[i].power)
+	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
+		if (power > cpufreq_cdev->em->table[i].power)
 			break;
+	}
 
-	return freq_table[i - 1].frequency;
+	return cpufreq_cdev->em->table[i + 1].frequency;
 }
 
 /**
@@ -374,6 +287,28 @@  static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
 	return 0;
 }
 
+static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
+			      unsigned long state)
+{
+	struct cpufreq_policy *policy;
+	unsigned long idx;
+
+	/* Use the Energy Model table if available */
+	if (cpufreq_cdev->em) {
+		idx = cpufreq_cdev->max_level - state;
+		return cpufreq_cdev->em->table[idx].frequency;
+	}
+
+	/* Otherwise, fallback on the CPUFreq table */
+	policy = cpufreq_cdev->policy;
+	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
+		idx = cpufreq_cdev->max_level - state;
+	else
+		idx = state;
+
+	return policy->freq_table[idx].frequency;
+}
+
 /**
  * cpufreq_set_cur_state - callback function to set the current cooling state.
  * @cdev: thermal cooling device pointer.
@@ -398,7 +333,7 @@  static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 	if (cpufreq_cdev->cpufreq_state == state)
 		return 0;
 
-	clip_freq = cpufreq_cdev->freq_table[state].frequency;
+	clip_freq = get_state_freq(cpufreq_cdev, state);
 	cpufreq_cdev->cpufreq_state = state;
 	cpufreq_cdev->clipped_freq = clip_freq;
 
@@ -497,7 +432,7 @@  static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 			       struct thermal_zone_device *tz,
 			       unsigned long state, u32 *power)
 {
-	unsigned int freq, num_cpus;
+	unsigned int freq, num_cpus, idx;
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 
 	/* Request state should be less than max_level */
@@ -506,7 +441,8 @@  static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 
 	num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);
 
-	freq = cpufreq_cdev->freq_table[state].frequency;
+	idx = cpufreq_cdev->max_level - state;
+	freq = cpufreq_cdev->em->table[idx].frequency;
 	*power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus;
 
 	return 0;
@@ -559,7 +495,6 @@  static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
 	.get_cur_state = cpufreq_get_cur_state,
 	.set_cur_state = cpufreq_set_cur_state,
 };
-
 static struct thermal_cooling_device_ops cpufreq_power_cooling_ops = {
 	.get_max_state		= cpufreq_get_max_state,
 	.get_cur_state		= cpufreq_get_cur_state,
@@ -574,18 +509,31 @@  static struct notifier_block thermal_cpufreq_notifier_block = {
 	.notifier_call = cpufreq_thermal_notifier,
 };
 
-static unsigned int find_next_max(struct cpufreq_frequency_table *table,
-				  unsigned int prev_max)
-{
-	struct cpufreq_frequency_table *pos;
-	unsigned int max = 0;
+static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
+			      struct em_perf_domain *em) {
+	struct cpufreq_policy *policy;
+	unsigned int nr_levels;
+
+	if (!em)
+		return false;
+
+	policy = cpufreq_cdev->policy;
+	if (!cpumask_equal(policy->related_cpus, to_cpumask(em->cpus))) {
+		pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n",
+			cpumask_pr_args(to_cpumask(em->cpus)),
+			cpumask_pr_args(policy->related_cpus));
+		return false;
+	}
 
-	cpufreq_for_each_valid_entry(pos, table) {
-		if (pos->frequency > max && pos->frequency < prev_max)
-			max = pos->frequency;
+	nr_levels = cpufreq_cdev->max_level + 1;
+	if (em->nr_cap_states != nr_levels) {
+		pr_err("The number of cap states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n",
+			cpumask_pr_args(to_cpumask(em->cpus)),
+			em->nr_cap_states, nr_levels);
+		return false;
 	}
 
-	return max;
+	return true;
 }
 
 /**
@@ -593,7 +541,7 @@  static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  * @np: a valid struct device_node to the cooling device device tree node
  * @policy: cpufreq policy
  * Normally this should be same as cpufreq policy->related_cpus.
- * @capacitance: dynamic power coefficient for these cpus
+ * @em: Energy Model of the cpufreq policy
  *
  * This interface function registers the cpufreq cooling device with the name
  * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -605,12 +553,13 @@  static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  */
 static struct thermal_cooling_device *
 __cpufreq_cooling_register(struct device_node *np,
-			struct cpufreq_policy *policy, u32 capacitance)
+			struct cpufreq_policy *policy,
+			struct em_perf_domain *em)
 {
 	struct thermal_cooling_device *cdev;
 	struct cpufreq_cooling_device *cpufreq_cdev;
 	char dev_name[THERMAL_NAME_LENGTH];
-	unsigned int freq, i, num_cpus;
+	unsigned int i, num_cpus;
 	int ret;
 	struct thermal_cooling_device_ops *cooling_ops;
 	bool first;
@@ -644,43 +593,18 @@  __cpufreq_cooling_register(struct device_node *np,
 	/* max_level is an index, not a counter */
 	cpufreq_cdev->max_level = i - 1;
 
-	cpufreq_cdev->freq_table = kmalloc_array(i,
-					sizeof(*cpufreq_cdev->freq_table),
-					GFP_KERNEL);
-	if (!cpufreq_cdev->freq_table) {
-		cdev = ERR_PTR(-ENOMEM);
-		goto free_idle_time;
-	}
-
 	ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0) {
 		cdev = ERR_PTR(ret);
-		goto free_table;
+		goto free_idle_time;
 	}
 	cpufreq_cdev->id = ret;
 
 	snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
 		 cpufreq_cdev->id);
 
-	/* Fill freq-table in descending order of frequencies */
-	for (i = 0, freq = -1; i <= cpufreq_cdev->max_level; i++) {
-		freq = find_next_max(policy->freq_table, freq);
-		cpufreq_cdev->freq_table[i].frequency = freq;
-
-		/* Warn for duplicate entries */
-		if (!freq)
-			pr_warn("%s: table has duplicate entries\n", __func__);
-		else
-			pr_debug("%s: freq:%u KHz\n", __func__, freq);
-	}
-
-	if (capacitance) {
-		ret = update_freq_table(cpufreq_cdev, capacitance);
-		if (ret) {
-			cdev = ERR_PTR(ret);
-			goto remove_ida;
-		}
-
+	if (em_is_sane(cpufreq_cdev, em)) {
+		cpufreq_cdev->em = em;
 		cooling_ops = &cpufreq_power_cooling_ops;
 	} else {
 		cooling_ops = &cpufreq_cooling_ops;
@@ -691,7 +615,7 @@  __cpufreq_cooling_register(struct device_node *np,
 	if (IS_ERR(cdev))
 		goto remove_ida;
 
-	cpufreq_cdev->clipped_freq = cpufreq_cdev->freq_table[0].frequency;
+	cpufreq_cdev->clipped_freq = get_state_freq(cpufreq_cdev, 0);
 	cpufreq_cdev->cdev = cdev;
 
 	mutex_lock(&cooling_list_lock);
@@ -708,8 +632,6 @@  __cpufreq_cooling_register(struct device_node *np,
 
 remove_ida:
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
-free_table:
-	kfree(cpufreq_cdev->freq_table);
 free_idle_time:
 	kfree(cpufreq_cdev->idle_time);
 free_cdev:
@@ -731,7 +653,7 @@  __cpufreq_cooling_register(struct device_node *np,
 struct thermal_cooling_device *
 cpufreq_cooling_register(struct cpufreq_policy *policy)
 {
-	return __cpufreq_cooling_register(NULL, policy, 0);
+	return __cpufreq_cooling_register(NULL, policy, NULL);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 
@@ -759,7 +681,6 @@  of_cpufreq_cooling_register(struct cpufreq_policy *policy)
 {
 	struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
 	struct thermal_cooling_device *cdev = NULL;
-	u32 capacitance = 0;
 
 	if (!np) {
 		pr_err("cpu_cooling: OF node not available for cpu%d\n",
@@ -768,10 +689,9 @@  of_cpufreq_cooling_register(struct cpufreq_policy *policy)
 	}
 
 	if (of_find_property(np, "#cooling-cells", NULL)) {
-		of_property_read_u32(np, "dynamic-power-coefficient",
-				     &capacitance);
+		struct em_perf_domain *em = em_cpu_get(policy->cpu);
 
-		cdev = __cpufreq_cooling_register(np, policy, capacitance);
+		cdev = __cpufreq_cooling_register(np, policy, em);
 		if (IS_ERR(cdev)) {
 			pr_err("cpu_cooling: cpu%d failed to register as cooling device: %ld\n",
 			       policy->cpu, PTR_ERR(cdev));
@@ -813,7 +733,6 @@  void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 	thermal_cooling_device_unregister(cpufreq_cdev->cdev);
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
 	kfree(cpufreq_cdev->idle_time);
-	kfree(cpufreq_cdev->freq_table);
 	kfree(cpufreq_cdev);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);