
[v4,10/10] cpufreq: intel_pstate: Use CPPC to get max performance

Message ID 1474485552-141429-11-git-send-email-srinivas.pandruvada@linux.intel.com (mailing list archive)
State Changes Requested, archived

Commit Message

srinivas pandruvada Sept. 21, 2016, 7:19 p.m. UTC
This change uses the ACPI CPPC_LIB interface to get the CPPC performance limits.
Once the CPPC limits of all online cores have been read, check whether there is
a difference in max performance. If there is a difference, then the
scheduler interface is called to update the per-CPU priorities and enable
the ITMT feature.

Here sched_set_itmt_core_prio() is called to set the priorities and
sched_set_itmt_support() is called to enable the ITMT feature.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 drivers/cpufreq/Kconfig.x86    |   1 +
 drivers/cpufreq/intel_pstate.c | 103 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 103 insertions(+), 1 deletion(-)

Comments

Rafael J. Wysocki Sept. 21, 2016, 8:30 p.m. UTC | #1
On Wed, Sep 21, 2016 at 9:19 PM, Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
> This change uses the ACPI CPPC_LIB interface to get the CPPC performance limits.
> Once the CPPC limits of all online cores have been read, check whether there is
> a difference in max performance. If there is a difference, then the
> scheduler interface is called to update the per-CPU priorities and enable
> the ITMT feature.
>
> Here sched_set_itmt_core_prio() is called to set the priorities and
> sched_set_itmt_support() is called to enable the ITMT feature.
>
> Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> ---
>  drivers/cpufreq/Kconfig.x86    |   1 +
>  drivers/cpufreq/intel_pstate.c | 103 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 103 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
> index adbd1de..3328c6b 100644
> --- a/drivers/cpufreq/Kconfig.x86
> +++ b/drivers/cpufreq/Kconfig.x86
> @@ -6,6 +6,7 @@ config X86_INTEL_PSTATE
>         bool "Intel P state control"
>         depends on X86
>         select ACPI_PROCESSOR if ACPI
> +       select ACPI_CPPC_LIB if X86_64 && ACPI

Do we need to select CPPC here if SCHED_ITMT is unset?
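For illustration, one possible way to express that (a sketch only, assuming the ITMT config symbol is SCHED_ITMT and that a Kconfig-level dependency is the right answer rather than an #ifdef in the code):

config X86_INTEL_PSTATE
	bool "Intel P state control"
	depends on X86
	select ACPI_PROCESSOR if ACPI
	select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_ITMT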

>         help
>            This driver provides a P state for Intel core processors.
>           The driver implements an internal governor and will become
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index c877e70..d226a64 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -44,6 +44,7 @@
>
>  #ifdef CONFIG_ACPI
>  #include <acpi/processor.h>
> +#include <acpi/cppc_acpi.h>
>  #endif
>
>  #define FRAC_BITS 8
> @@ -195,6 +196,7 @@ struct _pid {
>   * @sample:            Storage for storing last Sample data
>   * @acpi_perf_data:    Stores ACPI perf information read from _PSS
>   * @valid_pss_table:   Set to true for valid ACPI _PSS entries found
> + * @cppc_perf:         Stores CPPC performance information
>   *
>   * This structure stores per CPU instance data for all CPUs.
>   */
> @@ -218,6 +220,7 @@ struct cpudata {
>  #ifdef CONFIG_ACPI
>         struct acpi_processor_performance acpi_perf_data;
>         bool valid_pss_table;
> +       struct cppc_perf_caps *cppc_perf;
>  #endif
>         unsigned int iowait_boost;
>  };
> @@ -377,14 +380,105 @@ static bool intel_pstate_get_ppc_enable_status(void)
>         return acpi_ppc;
>  }
>

The new code below is only useful if CONFIG_SCHED_ITMT is set, so
maybe it's better to put it into a #ifdef block?
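For illustration, the kind of structure being suggested (a rough sketch, not the posted code):

#ifdef CONFIG_SCHED_ITMT
/* CPPC-based ITMT setup, as added by this patch */
static void intel_pstate_process_acpi_cppc(struct cpufreq_policy *policy)
{
	/* ... read CPPC caps, set core priorities, enable ITMT ... */
}
#else
/* Without SCHED_ITMT the CPPC data has no consumer, so compile it out. */
static inline void intel_pstate_process_acpi_cppc(struct cpufreq_policy *policy)
{
}
#endif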

> +/* Mask of CPUs for which CPPC data has been read */
> +static cpumask_t cppc_read_cpu_mask;
> +
> +/*
> + * Can't call sched_set_itmt_support() in hotcpu notifier callback path
> + * as this function uses hotplug locks in its path. So call from
> + * a work function.
> + */
> +static void intel_pstate_sched_itmt_work_fn(struct work_struct *work)
> +{
> +       sched_set_itmt_support(true);
> +}
> +
> +static DECLARE_WORK(sched_itmt_work, intel_pstate_sched_itmt_work_fn);
> +
> +static void intel_pstate_check_and_enable_itmt(int cpu)
> +{
> +       /*
> +        * For checking whether there is any difference in the maximum
> +        * performance for each CPU, need to wait till we have CPPC
> +        * data from all CPUs called from the cpufreq core. If there is a
> +        * difference in the maximum performance, then we have ITMT support.
> +        * If ITMT is supported, update the scheduler core priority for each
> +        * CPU and call to enable the ITMT feature.
> +        */
> +       if (cpumask_subset(topology_core_cpumask(cpu), &cppc_read_cpu_mask)) {
> +               int cpu_index;
> +               int max_prio;
> +               struct cpudata *cpu;
> +               bool itmt_support = false;
> +
> +               cpu = all_cpu_data[cpumask_first(&cppc_read_cpu_mask)];
> +               max_prio = cpu->cppc_perf->highest_perf;
> +               for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
> +                       cpu = all_cpu_data[cpu_index];
> +                       if (max_prio != cpu->cppc_perf->highest_perf) {
> +                               itmt_support = true;
> +                               break;
> +                       }
> +               }
> +
> +               if (!itmt_support)
> +                       return;
> +
> +               for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
> +                       cpu = all_cpu_data[cpu_index];
> +                       sched_set_itmt_core_prio(cpu->cppc_perf->highest_perf,
> +                                                cpu_index);
> +               }

My current understanding is that we need to rebuild sched domains
after setting the priorities, so what if there are two CPU packages
and there are highest_perf differences in both, and we first enumerate
the first package entirely before getting to the second one?

In that case we'll schedule the work item after enumerating the first
package and it may rebuild the sched domains before all priorities are
set for the second package, may it not?

This seems to require some more consideration.

> +               /*
> +                * Since this function is in the hotcpu notifier callback
> +                * path, submit a task to workqueue to call
> +                * sched_set_itmt_support().
> +                */
> +               schedule_work(&sched_itmt_work);

It doesn't make sense to do this more than once IMO and what if we
attempt to schedule the work item again when it has been scheduled
once already?  Don't we need any protection here?
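For illustration, one way to make sure the work item is only queued once (a sketch using a hypothetical flag; as the replies below note, repeated calls are safe, just redundant):

static atomic_t itmt_work_scheduled = ATOMIC_INIT(0);	/* hypothetical guard */

	/* inside intel_pstate_check_and_enable_itmt() */
	if (!atomic_xchg(&itmt_work_scheduled, 1))
		schedule_work(&sched_itmt_work);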

> +       }
> +}
> +
> +/*
> + * Process ACPI CPPC information. Currently it is only used for enabling the
> + * ITMT feature. This driver still uses MSRs to manage HWP, not CPPC.
> + */
> +static void intel_pstate_process_acpi_cppc(struct cpufreq_policy *policy)
> +{
> +       struct cpudata *cpu;
> +       int ret;
> +
> +       cpu = all_cpu_data[policy->cpu];
> +       cpu->cppc_perf = kzalloc(sizeof(struct cppc_perf_caps), GFP_KERNEL);
> +       if (!cpu->cppc_perf)
> +               return;
> +
> +       ret = cppc_get_perf_caps(policy->cpu, cpu->cppc_perf);
> +       if (ret) {
> +               kfree(cpu->cppc_perf);
> +               cpu->cppc_perf = NULL;
> +               return;
> +       }
> +
> +       pr_debug("cpu:%d H:0x%x N:0x%x L:0x%x\n", policy->cpu,
> +                cpu->cppc_perf->highest_perf, cpu->cppc_perf->nominal_perf,
> +                cpu->cppc_perf->lowest_perf);
> +
> +       /* Mark that the CPPC data for the policy->cpu is read */
> +       cpumask_set_cpu(policy->cpu, &cppc_read_cpu_mask);
> +
> +       intel_pstate_check_and_enable_itmt(policy->cpu);
> +}
> +
>  static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
>  {
>         struct cpudata *cpu;
>         int ret;
>         int i;
>
> -       if (hwp_active)
> +       if (hwp_active) {
> +               intel_pstate_process_acpi_cppc(policy);
>                 return;
> +       }
>
>         if (!intel_pstate_get_ppc_enable_status())
>                 return;
> @@ -450,6 +544,13 @@ static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
>         struct cpudata *cpu;
>
>         cpu = all_cpu_data[policy->cpu];
> +
> +       if (cpu->cppc_perf) {
> +               cpumask_clear_cpu(policy->cpu, &cppc_read_cpu_mask);
> +               kfree(cpu->cppc_perf);
> +               cpu->cppc_perf = NULL;
> +       }
> +
>         if (!cpu->valid_pss_table)
>                 return;
>
> --

Thanks,
Rafael
Tim Chen Sept. 22, 2016, 6:50 p.m. UTC | #2
On Wed, 2016-09-21 at 22:30 +0200, Rafael J. Wysocki wrote:
> On Wed, Sep 21, 2016 at 9:19 PM, Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> > 
> > 
> > +
> > +static void intel_pstate_check_and_enable_itmt(int cpu)
> > +{
> > +       /*
> > +        * For checking whether there is any difference in the maximum
> > +        * performance for each CPU, need to wait till we have CPPC
> > +        * data from all CPUs called from the cpufreq core. If there is a
> > +        * difference in the maximum performance, then we have ITMT support.
> > +        * If ITMT is supported, update the scheduler core priority for each
> > +        * CPU and call to enable the ITMT feature.
> > +        */
> > +       if (cpumask_subset(topology_core_cpumask(cpu), &cppc_read_cpu_mask)) {
> > +               int cpu_index;
> > +               int max_prio;
> > +               struct cpudata *cpu;
> > +               bool itmt_support = false;
> > +
> > +               cpu = all_cpu_data[cpumask_first(&cppc_read_cpu_mask)];
> > +               max_prio = cpu->cppc_perf->highest_perf;
> > +               for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
> > +                       cpu = all_cpu_data[cpu_index];
> > +                       if (max_prio != cpu->cppc_perf->highest_perf) {
> > +                               itmt_support = true;
> > +                               break;
> > +                       }
> > +               }
> > +
> > +               if (!itmt_support)
> > +                       return;
> > +
> > +               for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
> > +                       cpu = all_cpu_data[cpu_index];
> > +                       sched_set_itmt_core_prio(cpu->cppc_perf->highest_perf,
> > +                                                cpu_index);
> > +               }
> My current understanding is that we need to rebuild sched domains
> after setting the priorities, 

No, that's not true.  We need to rebuild the sched domains only
when the sched domain flags are changed, not when we are changing
the priorities.  Only the sched domain flags are a property of
the sched domain; CPU priority values are not part of it.

Morten had a similar question about whether we need to rebuild the sched
domains when changing cpu priorities, back when we first posted the patches.
Peter explained that it wasn't necessary.
http://lkml.iu.edu/hypermail/linux/kernel/1608.3/01753.html
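To illustrate the distinction (a conceptual sketch, not the actual scheduler code):

/*
 * Per-CPU priority is plain data the scheduler consults when comparing two
 * CPUs during asym_packing balancing, so updating it needs no domain rebuild.
 */
static int itmt_core_prio[NR_CPUS];	/* conceptual; the real code uses per-CPU data */

void sched_set_itmt_core_prio(int prio, int cpu)
{
	itmt_core_prio[cpu] = prio;
}

/*
 * sched_set_itmt_support(), by contrast, changes the SD_ASYM_PACKING sched
 * domain flag, and changing domain flags is what requires a rebuild.
 */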



> so what if there are two CPU packages
> and there are highest_perf differences in both, and we first enumerate
> the first package entirely before getting to the second one?
> 
> In that case we'll schedule the work item after enumerating the first
> package and it may rebuild the sched domains before all priorities are
> set for the second package, may it not?

That is not a problem.  For the second package, all the cpu priorities
are initialized to the same value.  So even if we start to do 
asym_packing in the scheduler for the whole system, 
on the second package, all the cpus are treated equally by the scheduler.
We will operate as if there is no favored core till we update the
priorities of the cpus on the second package.

That said, we don't enable ITMT automatically for a 2-package system.
So the explicit sysctl command to enable ITMT and cause the sched domain
rebuild for a 2-package system is most likely to come after
we have discovered and set all the cpu priorities.

> 
> This seems to require some more consideration.
> 
> > 
> > +               /*
> > +                * Since this function is in the hotcpu notifier callback
> > +                * path, submit a task to workqueue to call
> > +                * sched_set_itmt_support().
> > +                */
> > +               schedule_work(&sched_itmt_work);
> It doesn't make sense to do this more than once IMO and what if we
> attempt to schedule the work item again when it has been scheduled
> once already?  Don't we need any protection here?

It is not a problem for sched_set_itmt_support to be called more than
once.

First, we will ignore the second call if sched_itmt_capable has already
been set to the same value in the previous sched_set_itmt_support call.
Secondly, the call to update sched_itmt_capable
is protected by the itmt_update_mutex.
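For illustration, the shape of the guard being described (a sketch of the scheduler-side setter, not the exact code in the ITMT patches):

static DEFINE_MUTEX(itmt_update_mutex);
static bool sched_itmt_capable;

void sched_set_itmt_support(bool itmt_supported)
{
	mutex_lock(&itmt_update_mutex);

	/* Repeated calls with the same value are simply ignored */
	if (itmt_supported != sched_itmt_capable) {
		sched_itmt_capable = itmt_supported;
		/* ... update sched domain flags and rebuild domains ... */
	}

	mutex_unlock(&itmt_update_mutex);
}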

Thanks.

Tim

Thomas Gleixner Sept. 22, 2016, 6:56 p.m. UTC | #3
On Thu, 22 Sep 2016, Tim Chen wrote:
> On Wed, 2016-09-21 at 22:30 +0200, Rafael J. Wysocki wrote:
> > My current understanding is that we need to rebuild sched domains
> > after setting the priorities, 
> 
> No, that's not true.  We need to rebuild the sched domains only
> when the sched domain flags are changed, not when we are changing
> the priorities.  Only the sched domain flags are a property of
> the sched domain; CPU priority values are not part of it.
> 
> Morten had a similar question about whether we need to rebuild the sched
> domains when changing cpu priorities, back when we first posted the patches.
> Peter explained that it wasn't necessary.
> http://lkml.iu.edu/hypermail/linux/kernel/1608.3/01753.html

And why is there no explanation in the form of a comment in the code?

Thanks,

	tglx
Tim Chen Sept. 22, 2016, 7:01 p.m. UTC | #4
On Thu, 2016-09-22 at 20:56 +0200, Thomas Gleixner wrote:
> On Thu, 22 Sep 2016, Tim Chen wrote:
> > 
> > On Wed, 2016-09-21 at 22:30 +0200, Rafael J. Wysocki wrote:
> > > 
> > > My current understanding is that we need to rebuild sched domains
> > > after setting the priorities, 
> > No, that's not true.  We need to rebuild the sched domains only
> > when the sched domain flags are changed, not when we are changing
> > the priorities.  Only the sched domain flags are a property of
> > the sched domain; CPU priority values are not part of it.
> > 
> > Morten had a similar question about whether we need to rebuild the sched
> > domains when changing cpu priorities, back when we first posted the patches.
> > Peter explained that it wasn't necessary.
> > http://lkml.iu.edu/hypermail/linux/kernel/1608.3/01753.html
> And why is there no explanation in the form of a comment in the code?

Sure, I'll add a comment.
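For example, something along these lines at the priority-setting call site (the wording is only a sketch):

		/*
		 * CPU priorities are per-CPU data, not a property of the sched
		 * domains, so they can be updated here without rebuilding the
		 * domains; a rebuild is only needed when ITMT support itself
		 * is enabled or disabled.
		 */
		sched_set_itmt_core_prio(cpu->cppc_perf->highest_perf, cpu_index);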

Thanks.

Tim


Rafael J. Wysocki Sept. 22, 2016, 8:58 p.m. UTC | #5
On Thu, Sep 22, 2016 at 8:50 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
> On Wed, 2016-09-21 at 22:30 +0200, Rafael J. Wysocki wrote:
>> On Wed, Sep 21, 2016 at 9:19 PM, Srinivas Pandruvada
>> <srinivas.pandruvada@linux.intel.com> wrote:
>> >
>> >
>> > +
>> > +static void intel_pstate_check_and_enable_itmt(int cpu)
>> > +{
>> > +       /*
>> > +        * For checking whether there is any difference in the maximum
>> > +        * performance for each CPU, need to wait till we have CPPC
>> > +        * data from all CPUs called from the cpufreq core. If there is a
>> > +        * difference in the maximum performance, then we have ITMT support.
>> > +        * If ITMT is supported, update the scheduler core priority for each
>> > +        * CPU and call to enable the ITMT feature.
>> > +        */
>> > +       if (cpumask_subset(topology_core_cpumask(cpu), &cppc_read_cpu_mask)) {
>> > +               int cpu_index;
>> > +               int max_prio;
>> > +               struct cpudata *cpu;
>> > +               bool itmt_support = false;
>> > +
>> > +               cpu = all_cpu_data[cpumask_first(&cppc_read_cpu_mask)];
>> > +               max_prio = cpu->cppc_perf->highest_perf;
>> > +               for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
>> > +                       cpu = all_cpu_data[cpu_index];
>> > +                       if (max_prio != cpu->cppc_perf->highest_perf) {
>> > +                               itmt_support = true;
>> > +                               break;
>> > +                       }
>> > +               }
>> > +
>> > +               if (!itmt_support)
>> > +                       return;
>> > +
>> > +               for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
>> > +                       cpu = all_cpu_data[cpu_index];
>> > +                       sched_set_itmt_core_prio(cpu->cppc_perf->highest_perf,
>> > +                                                cpu_index);
>> > +               }
>> My current understanding is that we need to rebuild sched domains
>> after setting the priorities,
>
> No, that's not true.  We need to rebuild the sched domains only
> when the sched domain flags are changed, not when we are changing
> the priorities.  Only the sched domain flags are a property of
> the sched domain; CPU priority values are not part of it.
>
> Morten had a similar question about whether we need to rebuild the sched
> domains when changing cpu priorities, back when we first posted the patches.
> Peter explained that it wasn't necessary.
> http://lkml.iu.edu/hypermail/linux/kernel/1608.3/01753.html

So to me this means that sched domains need to be rebuilt in two cases
by the ITMT code:
(1) When the "ITMT capable" flag changes.
(2) When the sysctl setting changes.

In which case I'm not sure why intel_pstate_check_and_enable_itmt()
has to be so complicated.

It seems to only need to (a) set the priority for the current CPU and
(b) invoke sched_set_itmt_support() (via the work item) to set the
"ITMT capable" flag if it finds out that ITMT should be enabled.

And it may be better to enable ITMT at the _OSC exchange time (if the
platform acknowledges support).
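In other words, something with roughly this shape (a sketch of the suggestion; itmt_perf_differs() is a hypothetical helper for the "difference in highest_perf seen" check):

static void intel_pstate_check_and_enable_itmt(int cpu)
{
	struct cpudata *cpudata = all_cpu_data[cpu];

	/* (a) publish this CPU's priority as soon as its CPPC data is read */
	sched_set_itmt_core_prio(cpudata->cppc_perf->highest_perf, cpu);

	/* (b) once a highest_perf difference has been seen, flag ITMT support */
	if (itmt_perf_differs())	/* hypothetical helper */
		schedule_work(&sched_itmt_work);
}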

>> so what if there are two CPU packages
>> and there are highest_perf differences in both, and we first enumerate
>> the first package entirely before getting to the second one?
>>
>> In that case we'll schedule the work item after enumerating the first
>> package and it may rebuild the sched domains before all priorities are
>> set for the second package, may it not?
>
> That is not a problem.  For the second package, all the cpu priorities
> are initialized to the same value.  So even if we start to do
> asym_packing in the scheduler for the whole system,
> on the second package, all the cpus are treated equally by the scheduler.
> We will operate as if there is no favored core till we update the
> priorities of the cpus on the second package.

OK

But updating those priorities after we have set the "ITMT capable"
flag is not a problem?  Nobody is going to be confused and so on?

> That said, we don't enable ITMT automatically for a 2-package system.
> So the explicit sysctl command to enable ITMT and cause the sched domain
> rebuild for a 2-package system is most likely to come after
> we have discovered and set all the cpu priorities.

Right, but if that behavior is relied on, there should be a comment
about that in the code (and relying on it would be kind of fragile for
that matter).

>>
>> This seems to require some more consideration.
>>
>> >
>> > +               /*
>> > +                * Since this function is in the hotcpu notifier callback
>> > +                * path, submit a task to workqueue to call
>> > +                * sched_set_itmt_support().
>> > +                */
>> > +               schedule_work(&sched_itmt_work);
>> It doesn't make sense to do this more than once IMO and what if we
>> attempt to schedule the work item again when it has been scheduled
>> once already?  Don't we need any protection here?
>
> It is not a problem for sched_set_itmt_support to be called more than
> once.

While it is not incorrect, it also is not particularly useful to
schedule a work item just to find out later that it had nothing to do
to begin with.

Thanks,
Rafael
Tim Chen Sept. 22, 2016, 9:41 p.m. UTC | #6
On Thu, 2016-09-22 at 22:58 +0200, Rafael J. Wysocki wrote:

> > > so what if there are two CPU packages
> > > and there are highest_perf differences in both, and we first enumerate
> > > the first package entirely before getting to the second one?
> > > 
> > > In that case we'll schedule the work item after enumerating the first
> > > package and it may rebuild the sched domains before all priorities are
> > > set for the second package, may it not?
> > That is not a problem.  For the second package, all the cpu priorities
> > are initialized to the same value.  So even if we start to do
> > asym_packing in the scheduler for the whole system,
> > on the second package, all the cpus are treated equally by the scheduler.
> > We will operate as if there is no favored core till we update the
> > priorities of the cpus on the second package.
> OK
> 
> But updating those priorities after we have set the "ITMT capable"
> flag is not a problem?  Nobody is going to be confused and so on?
> 

Not a problem.  The worst thing that could happen is we schedule a job
to a cpu with a lesser max turbo freq first while the priority updates are in
progress.

> > 
> > That said, we don't enable ITMT automatically for a 2-package system.
> > So the explicit sysctl command to enable ITMT and cause the sched domain
> > rebuild for a 2-package system is most likely to come after
> > we have discovered and set all the cpu priorities.
> Right, but if that behavior is relied on, there should be a comment
> about that in the code (and relying on it would be kind of fragile for
> that matter).

No, we don't rely on this behavior of not enabling ITMT automatically
for a 2-package system.  We could enable ITMT for a 2-package
system by default if we wanted to.  Then asym_packing will just
consider the second package's cpus to have equal priorities if they haven't
been set.

> 
> > 
> > > 
> > > 
> > > This seems to require some more consideration.
> > > 
> > > > 
> > > > 
> > > > +               /*
> > > > +                * Since this function is in the hotcpu notifier callback
> > > > +                * path, submit a task to workqueue to call
> > > > +                * sched_set_itmt_support().
> > > > +                */
> > > > +               schedule_work(&sched_itmt_work);
> > > It doesn't make sense to do this more than once IMO and what if we
> > > attempt to schedule the work item again when it has been scheduled
> > > once already?  Don't we need any protection here?
> > It is not a problem for sched_set_itmt_support to be called more than
> > once.
> While it is not incorrect, it also is not particularly useful to
> schedule a work item just to find out later that it had nothing to do
> to begin with.

Setting the ITMT capability is done per socket during system boot, so there is
no performance impact at all and it should not be an issue.

Tim

Patch

diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index adbd1de..3328c6b 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -6,6 +6,7 @@  config X86_INTEL_PSTATE
        bool "Intel P state control"
        depends on X86
        select ACPI_PROCESSOR if ACPI
+       select ACPI_CPPC_LIB if X86_64 && ACPI
        help
           This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c877e70..d226a64 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -44,6 +44,7 @@ 
 
 #ifdef CONFIG_ACPI
 #include <acpi/processor.h>
+#include <acpi/cppc_acpi.h>
 #endif
 
 #define FRAC_BITS 8
@@ -195,6 +196,7 @@  struct _pid {
  * @sample:		Storage for storing last Sample data
  * @acpi_perf_data:	Stores ACPI perf information read from _PSS
  * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
+ * @cppc_perf:		Stores CPPC performance information
  *
  * This structure stores per CPU instance data for all CPUs.
  */
@@ -218,6 +220,7 @@  struct cpudata {
 #ifdef CONFIG_ACPI
 	struct acpi_processor_performance acpi_perf_data;
 	bool valid_pss_table;
+	struct cppc_perf_caps *cppc_perf;
 #endif
 	unsigned int iowait_boost;
 };
@@ -377,14 +380,105 @@  static bool intel_pstate_get_ppc_enable_status(void)
 	return acpi_ppc;
 }
 
+/* Mask of CPUs for which CPPC data has been read */
+static cpumask_t cppc_read_cpu_mask;
+
+/*
+ * Can't call sched_set_itmt_support() in hotcpu notifier callback path
+ * as this function uses hotplug locks in its path. So call from
+ * a work function.
+ */
+static void intel_pstate_sched_itmt_work_fn(struct work_struct *work)
+{
+	sched_set_itmt_support(true);
+}
+
+static DECLARE_WORK(sched_itmt_work, intel_pstate_sched_itmt_work_fn);
+
+static void intel_pstate_check_and_enable_itmt(int cpu)
+{
+	/*
+	 * For checking whether there is any difference in the maximum
+	 * performance for each CPU, need to wait till we have CPPC
+	 * data from all CPUs called from the cpufreq core. If there is a
+	 * difference in the maximum performance, then we have ITMT support.
+	 * If ITMT is supported, update the scheduler core priority for each
+	 * CPU and call to enable the ITMT feature.
+	 */
+	if (cpumask_subset(topology_core_cpumask(cpu), &cppc_read_cpu_mask)) {
+		int cpu_index;
+		int max_prio;
+		struct cpudata *cpu;
+		bool itmt_support = false;
+
+		cpu = all_cpu_data[cpumask_first(&cppc_read_cpu_mask)];
+		max_prio = cpu->cppc_perf->highest_perf;
+		for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
+			cpu = all_cpu_data[cpu_index];
+			if (max_prio != cpu->cppc_perf->highest_perf) {
+				itmt_support = true;
+				break;
+			}
+		}
+
+		if (!itmt_support)
+			return;
+
+		for_each_cpu(cpu_index, &cppc_read_cpu_mask) {
+			cpu = all_cpu_data[cpu_index];
+			sched_set_itmt_core_prio(cpu->cppc_perf->highest_perf,
+						 cpu_index);
+		}
+		/*
+		 * Since this function is in the hotcpu notifier callback
+		 * path, submit a task to workqueue to call
+		 * sched_set_itmt_support().
+		 */
+		schedule_work(&sched_itmt_work);
+	}
+}
+
+/*
+ * Process ACPI CPPC information. Currently it is only used for enabling the
+ * ITMT feature. This driver still uses MSRs to manage HWP, not CPPC.
+ */
+static void intel_pstate_process_acpi_cppc(struct cpufreq_policy *policy)
+{
+	struct cpudata *cpu;
+	int ret;
+
+	cpu = all_cpu_data[policy->cpu];
+	cpu->cppc_perf = kzalloc(sizeof(struct cppc_perf_caps), GFP_KERNEL);
+	if (!cpu->cppc_perf)
+		return;
+
+	ret = cppc_get_perf_caps(policy->cpu, cpu->cppc_perf);
+	if (ret) {
+		kfree(cpu->cppc_perf);
+		cpu->cppc_perf = NULL;
+		return;
+	}
+
+	pr_debug("cpu:%d H:0x%x N:0x%x L:0x%x\n", policy->cpu,
+		 cpu->cppc_perf->highest_perf, cpu->cppc_perf->nominal_perf,
+		 cpu->cppc_perf->lowest_perf);
+
+	/* Mark that the CPPC data for the policy->cpu is read */
+	cpumask_set_cpu(policy->cpu, &cppc_read_cpu_mask);
+
+	intel_pstate_check_and_enable_itmt(policy->cpu);
+}
+
 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 {
 	struct cpudata *cpu;
 	int ret;
 	int i;
 
-	if (hwp_active)
+	if (hwp_active) {
+		intel_pstate_process_acpi_cppc(policy);
 		return;
+	}
 
 	if (!intel_pstate_get_ppc_enable_status())
 		return;
@@ -450,6 +544,13 @@  static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 	struct cpudata *cpu;
 
 	cpu = all_cpu_data[policy->cpu];
+
+	if (cpu->cppc_perf) {
+		cpumask_clear_cpu(policy->cpu, &cppc_read_cpu_mask);
+		kfree(cpu->cppc_perf);
+		cpu->cppc_perf = NULL;
+	}
+
 	if (!cpu->valid_pss_table)
 		return;