diff mbox

[v4] cpufreq: Add SFI based cpufreq driver support

Message ID 20141104170031.GA25991@intel-desktop (mailing list archive)
State Changes Requested, archived
Headers show

Commit Message

Kasagar, Srinidhi Nov. 4, 2014, 5 p.m. UTC
This adds the SFI based cpu freq driver for some of the Intel's
Silvermont based Atom architectures like Z34xx and Z35xx.

Signed-off-by: Rudramuni, Vishwesh M <vishwesh.m.rudramuni@intel.com>
Signed-off-by: Srinidhi Kasagar <srinidhi.kasagar@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
v2:
* rephrase the Kconfig text
* merge the header file as its not being shared with anyone
* remove the unnecessary checks in sfi_cpufreq_target
* style changes in kzalloc
* fix the potential memory leak sfi_cpufreq_cpu_init
* move the cpufreq driver flag to the structure definition
* remove the redundant init of policy->cpu and policy->cur
* remove the unnecessary check while init the freq table
* merge the for loops in cpu_init
* use cpufreq_generic_attr rather than the custom one

v3:
* Add this change log
* remove the left over not used cached_freq var
* merge yet another for loop in cpu_init

v4:
* remove unnecessary pr_debug
* rephrase Kconfig text
* remove unused macro SFI_CPU_MAX
* Add Viresh's Acked-by

 drivers/cpufreq/Kconfig.x86   |   10 ++
 drivers/cpufreq/Makefile      |    1 +
 drivers/cpufreq/sfi-cpufreq.c |  317 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 328 insertions(+)
 create mode 100644 drivers/cpufreq/sfi-cpufreq.c

Comments

Len Brown Nov. 7, 2014, 6:18 a.m. UTC | #1
On Tue, Nov 4, 2014 at 12:00 PM, Srinidhi Kasagar
<srinidhi.kasagar@intel.com> wrote:
> This adds the SFI based cpu freq driver for some of the Intel's
> Silvermont based Atom architectures like Z34xx and Z35xx.
>
> Signed-off-by: Rudramuni, Vishwesh M <vishwesh.m.rudramuni@intel.com>
> Signed-off-by: Srinidhi Kasagar <srinidhi.kasagar@intel.com>
> Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
...
>
>  drivers/cpufreq/Kconfig.x86   |   10 ++
>  drivers/cpufreq/Makefile      |    1 +

Please update MAINTAINERS to indicate who will maintain this driver.

>  drivers/cpufreq/sfi-cpufreq.c |  317 +++++++++++++++++++++++++++++++++++++++++

As this driver is called sfi-cpufreq.c
Its private functions and data structures below should best be called
sfi_cpufreq_* instead of
sfi_processor_* -- which looks like they are calling some other sub-system...

>  3 files changed, 328 insertions(+)
>  create mode 100644 drivers/cpufreq/sfi-cpufreq.c
>
> diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
> index 89ae88f91895..8860e8062b53 100644
> --- a/drivers/cpufreq/Kconfig.x86
> +++ b/drivers/cpufreq/Kconfig.x86
> @@ -57,6 +57,16 @@ config X86_ACPI_CPUFREQ_CPB
>           By enabling this option the acpi_cpufreq driver provides the old
>           entry in addition to the new boost ones, for compatibility reasons.
>
> +config X86_SFI_CPUFREQ
> +       tristate "SFI Processor P-States driver"
> +       depends on X86_INTEL_MID && SFI
> +       help
> +         This adds a CPUFreq driver for some Silvermont based Intel Atom
> +         platforms (like INTEL_MID) which enumerates processor performance
> +         states through SFI.

s/enumerates/enumerate/

Also, perhaps add the real HW names that are present in the commit message?

> +
> +         If in doubt, say N.
> +
>  config ELAN_CPUFREQ
>         tristate "AMD Elan SC400 and SC410"
>         depends on MELAN
...
> diff --git a/drivers/cpufreq/sfi-cpufreq.c b/drivers/cpufreq/sfi-cpufreq.c
> new file mode 100644
> index 000000000000..d5e11dca739d
> --- /dev/null
> +++ b/drivers/cpufreq/sfi-cpufreq.c
> @@ -0,0 +1,317 @@
> +/*
> + *  SFI Processor P-States Driver
> + *  Based on ACPI Processor P-States Driver
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License.
> + *
> + *  This program is distributed in the hope that it will be useful, but
> + *  WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + *  General Public License for more details.
> + *
> + *  Author: Vishwesh M Rudramuni <vishwesh.m.rudramuni@intel.com>
> + *  Author: Srinidhi Kasagar <srinidhi.kasagar@intel.com>
> + */
> +
> +#include <linux/cpufreq.h>
> +#include <linux/cpuidle.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/sfi.h>
> +#include <linux/slab.h>
> +#include <linux/smp.h>
> +
> +#include <asm/msr.h>
> +#include <asm/processor.h>
> +
> +#define SFI_FREQ_MAX           32

Delete this #define, and the array where it is used.
Neither are necessary.

> +#define INTEL_MSR_RANGE                0xffff

I see this was copied INTEL_MSR_RANGE from acpi-cpufreq.c, but
two wrongs don't make a right.  Instead it should be something like

msr-index.h
#define MSR_IA32_PERF_CTL               0x00000199
+#define INTEL_PERF_CTL_MASK      0xffff

as it is (Intel) architecture generic (and it is a mask, not a range:-)

> +#define INTEL_MSR_BUSRATIO_MASK        0xff00

Please delete this #define, and the code that references it.

If this definition were architectural, we'd also see it in msr-index.h
under the definition of PERF_STATUS msr.
But it isn't architectural.  I don't like seeing it manufactured for this code,
and I don't like seeing the unreliable PERF_STATUS MSR accessed.

For the driver.get current frequency, it would be better to either
simply return the last value written to PERF_CTRL, or if you really
want to get fance, use APERF/MPERF to actually calcuate the frequency,
like intel_pstate.c does.
In either case, this #define and the code it references should go.

> +
> +static DEFINE_PER_CPU(struct sfi_cpufreq_data *, drv_data);
> +static DEFINE_MUTEX(performance_mutex);
> +static struct sfi_freq_table_entry sfi_cpufreq_array[SFI_FREQ_MAX];
> +
> +/* sfi_perf_data is a pointer to percpu data. */
> +static struct sfi_processor_performance *sfi_perf_data;
> +static struct cpufreq_driver sfi_cpufreq_driver;
> +static int sfi_cpufreq_num;
> +
> +/* Performance management */
> +struct sfi_processor_px {
> +       u32 core_frequency;     /* megahertz */
> +       u32 transition_latency; /* microseconds */
> +       u32 control;    /* control value */
> +};

delete the definition of sfi_processor_px.
it is already defined in sfi.h as sfi_freq_table_entry.

> +
> +struct sfi_processor_performance {
> +       unsigned int state;
> +       unsigned int state_count;
> +       struct sfi_processor_px *states;
> +};
> +
> +struct sfi_cpufreq_data {
> +       struct sfi_processor_performance *sfi_data;
> +       struct cpufreq_frequency_table *freq_table;
> +};
> +
> +static int parse_freq(struct sfi_table_header *table)
> +{
> +       struct sfi_table_simple *sb;
> +       struct sfi_freq_table_entry *pentry;
> +       int total_len;
> +
> +       sb = (struct sfi_table_simple *)table;
> +       if (!sb) {
> +               pr_warn("SFI: Unable to map frequency table\n");
> +               return -ENODEV;

Delete this check, it is impossible for it to execute.
The calling function, sfi_table_parse()
already checked for !table.

> +       }
> +
> +       if (!sfi_cpufreq_num) {

Why is there a check for sfi_cpufreq_num?
Do you expect to find more than one SFI_SIG_FREQ,
and here you are implementing a policy of using the 1st
and ignoring the 2nd?

> +               sfi_cpufreq_num = SFI_GET_NUM_ENTRIES(sb,
> +                        struct sfi_freq_table_entry);
> +               pentry = (struct sfi_freq_table_entry *)sb->pentry;
> +               total_len = sfi_cpufreq_num * sizeof(*pentry);
> +               memcpy(sfi_cpufreq_array, pentry, total_len);

you went to the trouble of defining SFI_FREQ_MAX,
but I don't see a check here to make sure that you are
not copying over the end of that array...

Why are you copying the entries to sfi_cpu_freq_array[]
in the first place?  Wouldn't it be a better delete
SFI_FREQ_MAX, delete this copy, and simply read the table
entries from where they already are?

> +       }
> +
> +       return 0;
> +}
> +
> +static int sfi_processor_get_performance_states(
> +                       struct sfi_processor_performance *performance)
> +{
> +       int result = 0;
> +       int i;
> +
> +       performance->state_count = sfi_cpufreq_num;
> +       performance->states =
> +           kmalloc(sizeof(struct sfi_processor_px) * sfi_cpufreq_num,
> +                   GFP_KERNEL);
> +       if (!performance->states)
> +               result = -ENOMEM;

don't you mean *return* -ENOMEM?
If you continue here, below you're going to try to write to the structure
you just failed to allocate....

But again, unclear you need to allocate this memory in the first place...

> +
> +       /* Populate the P-states info from the SFI table */
> +       for (i = 0; i < sfi_cpufreq_num; i++) {
> +               performance->states[i].core_frequency =
> +                       sfi_cpufreq_array[i].freq_mhz;
> +               performance->states[i].transition_latency =
> +                       sfi_cpufreq_array[i].latency;
> +               performance->states[i].control =
> +                       sfi_cpufreq_array[i].ctrl_val;
> +               pr_debug("State [%d]: core_frequency[%d] transition_latency[%d] control[0x%x]\n",
> +                       i,
> +                       (u32) performance->states[i].core_frequency,
> +                       (u32) performance->states[i].transition_latency,
> +                       (u32) performance->states[i].control);
> +       }
> +
> +       return result;

It really does seem that sfi_processor_get_performance_states()
can be replaced with 1 line in your parse routine to point
performance->states to the address of the 0th entry in sfi_freq table.

> +}
> +
> +static int sfi_processor_register_performance(
> +                       struct sfi_processor_performance *performance)
> +{
> +       mutex_lock(&performance_mutex);
> +
> +       /* parse the freq table from SFI */
> +       sfi_cpufreq_num = 0;
> +       sfi_table_parse(SFI_SIG_FREQ, NULL, NULL, parse_freq);

why do you ignore sfi_table_parse()'s possible error return value?

> +
> +       sfi_processor_get_performance_states(performance);

again, should look at error return values,
but even better to delete the called routine entirely.

> +
> +       mutex_unlock(&performance_mutex);
> +
> +       return 0;
> +}
> +
> +void sfi_processor_unregister_performance(
> +                       struct sfi_processor_performance *performance)
> +{
> +       mutex_lock(&performance_mutex);
> +       kfree(performance->states);
> +       mutex_unlock(&performance_mutex);
> +}
> +
> +static unsigned extract_freq(u32 msr, unsigned int cpu)

delete this routine.

> +{
> +       struct cpufreq_frequency_table *pos;
> +       struct sfi_processor_performance *perf;
> +       struct sfi_cpufreq_data *data = per_cpu(drv_data, cpu);
> +       u32 sfi_ctrl;
> +
> +       msr &= INTEL_MSR_BUSRATIO_MASK;
> +       perf = data->sfi_data;
> +
> +       cpufreq_for_each_entry(pos, data->freq_table) {
> +               sfi_ctrl = perf->states[pos->driver_data].control
> +                       & INTEL_MSR_BUSRATIO_MASK;
> +               if (sfi_ctrl == msr)
> +                       return pos->frequency;
> +       }
> +       return data->freq_table[0].frequency;
> +}
> +
> +static u32 get_cur_val(const struct cpumask *mask)
> +{

delete this routine

> +       u32 val, dummy;
> +
> +       if (unlikely(cpumask_empty(mask)))
> +               return 0;
> +
> +       rdmsr_on_cpu(cpumask_any(mask), MSR_IA32_PERF_STATUS, &val, &dummy);
> +
> +       return val;
> +}
> +
> +static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
> +{

replace this routine with one that
simply returns the latest requested freq on that cpu.
or if you want to get fancy, calculate actual average frequency
like intel_pstate.c does.

> +       return extract_freq(get_cur_val(cpumask_of(cpu)), cpu);
> +}
> +
> +static int sfi_cpufreq_target(struct cpufreq_policy *policy,
> +                               unsigned int index)
> +{
> +       struct sfi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
> +       struct sfi_processor_performance *perf;
> +       unsigned int next_perf_state = 0; /* Index into perf table */
> +       u32 lo, hi;
> +
> +       perf = data->sfi_data;
> +       next_perf_state = data->freq_table[index].driver_data;
> +
> +       rdmsr_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, &lo, &hi);
> +       lo = (lo & ~INTEL_MSR_RANGE) |
> +               ((u32) perf->states[next_perf_state].control & INTEL_MSR_RANGE);
> +       wrmsr_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, lo, hi);
> +
> +       perf->state = next_perf_state;
> +
> +       return 0;
> +}
> +
> +static int sfi_cpufreq_cpu_init(struct cpufreq_policy *policy)
> +{
> +       unsigned int i, result = 0, valid_states = 0;
> +       unsigned int cpu = policy->cpu;
> +       struct sfi_cpufreq_data *data;
> +       struct sfi_processor_performance *perf;
> +
> +       data = kzalloc(sizeof(*data), GFP_KERNEL);
> +       if (!data)
> +               return -ENOMEM;
> +
> +       data->sfi_data = per_cpu_ptr(sfi_perf_data, cpu);
> +       per_cpu(drv_data, cpu) = data;
> +
> +       result = sfi_processor_register_performance(data->sfi_data);
> +       if (result)
> +               goto err_free;
> +
> +       perf = data->sfi_data;
> +       policy->shared_type = CPUFREQ_SHARED_TYPE_HW;
> +
> +       /* capability check */
> +       if (perf->state_count <= 1) {

yes, I know you copied this from acpi-cpufreq.c, but
wouldn't it make more sense for register_performance(), which discovers
the number of states and makes the allocation, to handle this internally
and simply return success/fail?

> +               pr_err("No P-States\n");
> +               result = -ENODEV;
> +               goto err_unreg;
> +       }
> +
...

thanks,
Len Brown, Intel Open Source Technology Center
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kasagar, Srinidhi Nov. 10, 2014, 10:56 a.m. UTC | #2
On Fri, Nov 07, 2014 at 01:18:47AM -0500, Len Brown wrote:
> On Tue, Nov 4, 2014 at 12:00 PM, Srinidhi Kasagar
> <srinidhi.kasagar@intel.com> wrote:
> > This adds the SFI based cpu freq driver for some of the Intel's
> > Silvermont based Atom architectures like Z34xx and Z35xx.
> >
> > Signed-off-by: Rudramuni, Vishwesh M <vishwesh.m.rudramuni@intel.com>
> > Signed-off-by: Srinidhi Kasagar <srinidhi.kasagar@intel.com>
> > Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
> ...
> >
> >  drivers/cpufreq/Kconfig.x86   |   10 ++
> >  drivers/cpufreq/Makefile      |    1 +
> 
> Please update MAINTAINERS to indicate who will maintain this driver.

OK..

[...]

> > +config X86_SFI_CPUFREQ
> > +       tristate "SFI Processor P-States driver"
> > +       depends on X86_INTEL_MID && SFI
> > +       help
> > +         This adds a CPUFreq driver for some Silvermont based Intel Atom
> > +         platforms (like INTEL_MID) which enumerates processor performance
> > +         states through SFI.
> 
> s/enumerates/enumerate/
> 
> Also, perhaps add the real HW names that are present in the commit message?

Ok, will fix.

[..]

> > +#include <asm/msr.h>
> > +#include <asm/processor.h>
> > +
> > +#define SFI_FREQ_MAX           32
> 
> Delete this #define, and the array where it is used.
> Neither are necessary.
> 
> > +#define INTEL_MSR_RANGE                0xffff
> 
> I see this was copied INTEL_MSR_RANGE from acpi-cpufreq.c, but
> two wrongs don't make a right.  Instead it should be something like
> 
> msr-index.h
> #define MSR_IA32_PERF_CTL               0x00000199
> +#define INTEL_PERF_CTL_MASK      0xffff
> 
> as it is (Intel) architecture generic (and it is a mask, not a range:-)

Yes, make sense..Will fix.

> 
> > +#define INTEL_MSR_BUSRATIO_MASK        0xff00
> 
> Please delete this #define, and the code that references it.
> 
> If this definition were architectural, we'd also see it in msr-index.h
> under the definition of PERF_STATUS msr.
> But it isn't architectural.  I don't like seeing it manufactured for this code,
> and I don't like seeing the unreliable PERF_STATUS MSR accessed.
> 
> For the driver.get current frequency, it would be better to either
> simply return the last value written to PERF_CTRL, or if you really
> want to get fance, use APERF/MPERF to actually calcuate the frequency,
> like intel_pstate.c does.
> In either case, this #define and the code it references should go.

Yes, can be done. Will fix the code by reading PERF_CTRL MSR rather.

> 
> > +
> > +static DEFINE_PER_CPU(struct sfi_cpufreq_data *, drv_data);
> > +static DEFINE_MUTEX(performance_mutex);
> > +static struct sfi_freq_table_entry sfi_cpufreq_array[SFI_FREQ_MAX];
> > +
> > +/* sfi_perf_data is a pointer to percpu data. */
> > +static struct sfi_processor_performance *sfi_perf_data;
> > +static struct cpufreq_driver sfi_cpufreq_driver;
> > +static int sfi_cpufreq_num;
> > +
> > +/* Performance management */
> > +struct sfi_processor_px {
> > +       u32 core_frequency;     /* megahertz */
> > +       u32 transition_latency; /* microseconds */
> > +       u32 control;    /* control value */
> > +};
> 
> delete the definition of sfi_processor_px.
> it is already defined in sfi.h as sfi_freq_table_entry.

Yes, my bad. Not noticed. Thanks. Will reuse it.

[...]

> > +static int parse_freq(struct sfi_table_header *table)
> > +{
> > +       struct sfi_table_simple *sb;
> > +       struct sfi_freq_table_entry *pentry;
> > +       int total_len;
> > +
> > +       sb = (struct sfi_table_simple *)table;
> > +       if (!sb) {
> > +               pr_warn("SFI: Unable to map frequency table\n");
> > +               return -ENODEV;
> 
> Delete this check, it is impossible for it to execute.
> The calling function, sfi_table_parse()
> already checked for !table.

Yes, the check does not make sense. Will remove.

> 
> > +       }
> > +
> > +       if (!sfi_cpufreq_num) {
> 
> Why is there a check for sfi_cpufreq_num?
> Do you expect to find more than one SFI_SIG_FREQ,
> and here you are implementing a policy of using the 1st
> and ignoring the 2nd?
No, will remove it.

> 
> > +               sfi_cpufreq_num = SFI_GET_NUM_ENTRIES(sb,
> > +                        struct sfi_freq_table_entry);
> > +               pentry = (struct sfi_freq_table_entry *)sb->pentry;
> > +               total_len = sfi_cpufreq_num * sizeof(*pentry);
> > +               memcpy(sfi_cpufreq_array, pentry, total_len);
> 
> you went to the trouble of defining SFI_FREQ_MAX,
> but I don't see a check here to make sure that you are
> not copying over the end of that array...
> 
> Why are you copying the entries to sfi_cpu_freq_array[]
> in the first place?  Wouldn't it be a better delete
> SFI_FREQ_MAX, delete this copy, and simply read the table
> entries from where they already are?

OK. Will fix.

[...]

> > +       performance->state_count = sfi_cpufreq_num;
> > +       performance->states =
> > +           kmalloc(sizeof(struct sfi_processor_px) * sfi_cpufreq_num,
> > +                   GFP_KERNEL);
> > +       if (!performance->states)
> > +               result = -ENOMEM;
> 
> don't you mean *return* -ENOMEM?
> If you continue here, below you're going to try to write to the structure
> you just failed to allocate....
> 
> But again, unclear you need to allocate this memory in the first place...

The function itself is unnecessary. Will remove it entirely.

> 
> > +
> > +       /* Populate the P-states info from the SFI table */
> > +       for (i = 0; i < sfi_cpufreq_num; i++) {
> > +               performance->states[i].core_frequency =
> > +                       sfi_cpufreq_array[i].freq_mhz;
> > +               performance->states[i].transition_latency =
> > +                       sfi_cpufreq_array[i].latency;
> > +               performance->states[i].control =
> > +                       sfi_cpufreq_array[i].ctrl_val;
> > +               pr_debug("State [%d]: core_frequency[%d] transition_latency[%d] control[0x%x]\n",
> > +                       i,
> > +                       (u32) performance->states[i].core_frequency,
> > +                       (u32) performance->states[i].transition_latency,
> > +                       (u32) performance->states[i].control);
> > +       }
> > +
> > +       return result;
> 
> It really does seem that sfi_processor_get_performance_states()
> can be replaced with 1 line in your parse routine to point
> performance->states to the address of the 0th entry in sfi_freq table.

Right. will fix it too.

> 
> > +}
> > +
> > +static int sfi_processor_register_performance(
> > +                       struct sfi_processor_performance *performance)
> > +{
> > +       mutex_lock(&performance_mutex);
> > +
> > +       /* parse the freq table from SFI */
> > +       sfi_cpufreq_num = 0;
> > +       sfi_table_parse(SFI_SIG_FREQ, NULL, NULL, parse_freq);
> 
> why do you ignore sfi_table_parse()'s possible error return value?

Let me check. In parse_freq, if i dont check for sb, then it may
not make sense.

> 
> > +
> > +       sfi_processor_get_performance_states(performance);
> 
> again, should look at error return values,
> but even better to delete the called routine entirely.

Yes, will remove the entire routine.

[...]

> > +
> > +static unsigned extract_freq(u32 msr, unsigned int cpu)
> 
> delete this routine.
> 
> > +{
> > +       struct cpufreq_frequency_table *pos;
> > +       struct sfi_processor_performance *perf;
> > +       struct sfi_cpufreq_data *data = per_cpu(drv_data, cpu);
> > +       u32 sfi_ctrl;
> > +
> > +       msr &= INTEL_MSR_BUSRATIO_MASK;
> > +       perf = data->sfi_data;
> > +
> > +       cpufreq_for_each_entry(pos, data->freq_table) {
> > +               sfi_ctrl = perf->states[pos->driver_data].control
> > +                       & INTEL_MSR_BUSRATIO_MASK;
> > +               if (sfi_ctrl == msr)
> > +                       return pos->frequency;
> > +       }
> > +       return data->freq_table[0].frequency;
> > +}
> > +
> > +static u32 get_cur_val(const struct cpumask *mask)
> > +{
> 
> delete this routine
> 
> > +       u32 val, dummy;
> > +
> > +       if (unlikely(cpumask_empty(mask)))
> > +               return 0;
> > +
> > +       rdmsr_on_cpu(cpumask_any(mask), MSR_IA32_PERF_STATUS, &val, &dummy);
> > +
> > +       return val;
> > +}
> > +
> > +static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
> > +{
> 
> replace this routine with one that
> simply returns the latest requested freq on that cpu.
> or if you want to get fancy, calculate actual average frequency
> like intel_pstate.c does.

Not sure if in understood correctly. Do you mean I should not
even read from PERF_CTL in get_cur_freq_on_cpu, rather just
return the cached last requested frequency? I'm not sure
of the behavior in case if we offline and online a cpu.

[...]

> > +       /* capability check */
> > +       if (perf->state_count <= 1) {
> 
> yes, I know you copied this from acpi-cpufreq.c, but
> wouldn't it make more sense for register_performance(), which discovers
> the number of states and makes the allocation, to handle this internally
> and simply return success/fail?
Yes, will fix.

srinidhi
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Len Brown Nov. 10, 2014, 7:10 p.m. UTC | #3
> Not sure if in understood correctly. Do you mean I should not
> even read from PERF_CTL in get_cur_freq_on_cpu, rather just
> return the cached last requested frequency? I'm not sure
> of the behavior in case if we offline and online a cpu.

Best to simply cache the last written value in software,
that way it can be read quickly w/o even switching to
a processor to do the local MSR read.

The cpufreq get-current-frequency interface is somewhat bogus.
It generally doesn't make sense on a lot of modern hardware for the
driver to actually know the current frequency -- as the driver is only making
a request and does not have total control over the frequency...
But we need to answer something b/c there are people still
using the legacy cpufreq sysfs interface, and they expect a number.

cheers,
Len Brown, Intel Open Source Technology Center
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kasagar, Srinidhi Nov. 18, 2014, 5:39 a.m. UTC | #4
On Mon, Nov 10, 2014 at 02:10:44PM -0500, Len Brown wrote:
> > Not sure if in understood correctly. Do you mean I should not
> > even read from PERF_CTL in get_cur_freq_on_cpu, rather just
> > return the cached last requested frequency? I'm not sure
> > of the behavior in case if we offline and online a cpu.
> 
> Best to simply cache the last written value in software,
> that way it can be read quickly w/o even switching to
> a processor to do the local MSR read.
But it will be wrong after offline-online sequence. I have
sent v5 version of the patch. Please check.

> 
> The cpufreq get-current-frequency interface is somewhat bogus.
> It generally doesn't make sense on a lot of modern hardware for the
> driver to actually know the current frequency -- as the driver is only making
> a request and does not have total control over the frequency...
> But we need to answer something b/c there are people still
> using the legacy cpufreq sysfs interface, and they expect a number.

Agree.

Srinidhi
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 89ae88f91895..8860e8062b53 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -57,6 +57,16 @@  config X86_ACPI_CPUFREQ_CPB
 	  By enabling this option the acpi_cpufreq driver provides the old
 	  entry in addition to the new boost ones, for compatibility reasons.
 
+config X86_SFI_CPUFREQ
+	tristate "SFI Processor P-States driver"
+	depends on X86_INTEL_MID && SFI
+	help
+	  This adds a CPUFreq driver for some Silvermont based Intel Atom
+	  platforms (like INTEL_MID) which enumerates processor performance
+	  states through SFI.
+
+	  If in doubt, say N.
+
 config ELAN_CPUFREQ
 	tristate "AMD Elan SC400 and SC410"
 	depends on MELAN
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index db6d9a2fea4d..c3b51efd4a85 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -41,6 +41,7 @@  obj-$(CONFIG_X86_P4_CLOCKMOD)		+= p4-clockmod.o
 obj-$(CONFIG_X86_CPUFREQ_NFORCE2)	+= cpufreq-nforce2.o
 obj-$(CONFIG_X86_INTEL_PSTATE)		+= intel_pstate.o
 obj-$(CONFIG_X86_AMD_FREQ_SENSITIVITY)	+= amd_freq_sensitivity.o
+obj-$(CONFIG_X86_SFI_CPUFREQ)		+= sfi-cpufreq.o
 
 ##################################################################################
 # ARM SoC drivers
diff --git a/drivers/cpufreq/sfi-cpufreq.c b/drivers/cpufreq/sfi-cpufreq.c
new file mode 100644
index 000000000000..d5e11dca739d
--- /dev/null
+++ b/drivers/cpufreq/sfi-cpufreq.c
@@ -0,0 +1,317 @@ 
+/*
+ *  SFI Processor P-States Driver
+ *  Based on ACPI Processor P-States Driver
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Author: Vishwesh M Rudramuni <vishwesh.m.rudramuni@intel.com>
+ *  Author: Srinidhi Kasagar <srinidhi.kasagar@intel.com>
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sfi.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+#define SFI_FREQ_MAX		32
+#define INTEL_MSR_RANGE		0xffff
+#define INTEL_MSR_BUSRATIO_MASK	0xff00
+
+static DEFINE_PER_CPU(struct sfi_cpufreq_data *, drv_data);
+static DEFINE_MUTEX(performance_mutex);
+static struct sfi_freq_table_entry sfi_cpufreq_array[SFI_FREQ_MAX];
+
+/* sfi_perf_data is a pointer to percpu data. */
+static struct sfi_processor_performance *sfi_perf_data;
+static struct cpufreq_driver sfi_cpufreq_driver;
+static int sfi_cpufreq_num;
+
+/* Performance management */
+struct sfi_processor_px {
+	u32 core_frequency;	/* megahertz */
+	u32 transition_latency;	/* microseconds */
+	u32 control;	/* control value */
+};
+
+struct sfi_processor_performance {
+	unsigned int state;
+	unsigned int state_count;
+	struct sfi_processor_px *states;
+};
+
+struct sfi_cpufreq_data {
+	struct sfi_processor_performance *sfi_data;
+	struct cpufreq_frequency_table *freq_table;
+};
+
+static int parse_freq(struct sfi_table_header *table)
+{
+	struct sfi_table_simple *sb;
+	struct sfi_freq_table_entry *pentry;
+	int total_len;
+
+	sb = (struct sfi_table_simple *)table;
+	if (!sb) {
+		pr_warn("SFI: Unable to map frequency table\n");
+		return -ENODEV;
+	}
+
+	if (!sfi_cpufreq_num) {
+		sfi_cpufreq_num = SFI_GET_NUM_ENTRIES(sb,
+			 struct sfi_freq_table_entry);
+		pentry = (struct sfi_freq_table_entry *)sb->pentry;
+		total_len = sfi_cpufreq_num * sizeof(*pentry);
+		memcpy(sfi_cpufreq_array, pentry, total_len);
+	}
+
+	return 0;
+}
+
+static int sfi_processor_get_performance_states(
+			struct sfi_processor_performance *performance)
+{
+	int result = 0;
+	int i;
+
+	performance->state_count = sfi_cpufreq_num;
+	performance->states =
+	    kmalloc(sizeof(struct sfi_processor_px) * sfi_cpufreq_num,
+		    GFP_KERNEL);
+	if (!performance->states)
+		result = -ENOMEM;
+
+	/* Populate the P-states info from the SFI table */
+	for (i = 0; i < sfi_cpufreq_num; i++) {
+		performance->states[i].core_frequency =
+			sfi_cpufreq_array[i].freq_mhz;
+		performance->states[i].transition_latency =
+			sfi_cpufreq_array[i].latency;
+		performance->states[i].control =
+			sfi_cpufreq_array[i].ctrl_val;
+		pr_debug("State [%d]: core_frequency[%d] transition_latency[%d] control[0x%x]\n",
+			i,
+			(u32) performance->states[i].core_frequency,
+			(u32) performance->states[i].transition_latency,
+			(u32) performance->states[i].control);
+	}
+
+	return result;
+}
+
+static int sfi_processor_register_performance(
+			struct sfi_processor_performance *performance)
+{
+	mutex_lock(&performance_mutex);
+
+	/* parse the freq table from SFI */
+	sfi_cpufreq_num = 0;
+	sfi_table_parse(SFI_SIG_FREQ, NULL, NULL, parse_freq);
+
+	sfi_processor_get_performance_states(performance);
+
+	mutex_unlock(&performance_mutex);
+
+	return 0;
+}
+
+void sfi_processor_unregister_performance(
+			struct sfi_processor_performance *performance)
+{
+	mutex_lock(&performance_mutex);
+	kfree(performance->states);
+	mutex_unlock(&performance_mutex);
+}
+
+static unsigned extract_freq(u32 msr, unsigned int cpu)
+{
+	struct cpufreq_frequency_table *pos;
+	struct sfi_processor_performance *perf;
+	struct sfi_cpufreq_data *data = per_cpu(drv_data, cpu);
+	u32 sfi_ctrl;
+
+	msr &= INTEL_MSR_BUSRATIO_MASK;
+	perf = data->sfi_data;
+
+	cpufreq_for_each_entry(pos, data->freq_table) {
+		sfi_ctrl = perf->states[pos->driver_data].control
+			& INTEL_MSR_BUSRATIO_MASK;
+		if (sfi_ctrl == msr)
+			return pos->frequency;
+	}
+	return data->freq_table[0].frequency;
+}
+
+static u32 get_cur_val(const struct cpumask *mask)
+{
+	u32 val, dummy;
+
+	if (unlikely(cpumask_empty(mask)))
+		return 0;
+
+	rdmsr_on_cpu(cpumask_any(mask), MSR_IA32_PERF_STATUS, &val, &dummy);
+
+	return val;
+}
+
+static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
+{
+	return extract_freq(get_cur_val(cpumask_of(cpu)), cpu);
+}
+
+static int sfi_cpufreq_target(struct cpufreq_policy *policy,
+				unsigned int index)
+{
+	struct sfi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
+	struct sfi_processor_performance *perf;
+	unsigned int next_perf_state = 0; /* Index into perf table */
+	u32 lo, hi;
+
+	perf = data->sfi_data;
+	next_perf_state = data->freq_table[index].driver_data;
+
+	rdmsr_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, &lo, &hi);
+	lo = (lo & ~INTEL_MSR_RANGE) |
+		((u32) perf->states[next_perf_state].control & INTEL_MSR_RANGE);
+	wrmsr_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, lo, hi);
+
+	perf->state = next_perf_state;
+
+	return 0;
+}
+
+static int sfi_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	unsigned int i, result = 0, valid_states = 0;
+	unsigned int cpu = policy->cpu;
+	struct sfi_cpufreq_data *data;
+	struct sfi_processor_performance *perf;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->sfi_data = per_cpu_ptr(sfi_perf_data, cpu);
+	per_cpu(drv_data, cpu) = data;
+
+	result = sfi_processor_register_performance(data->sfi_data);
+	if (result)
+		goto err_free;
+
+	perf = data->sfi_data;
+	policy->shared_type = CPUFREQ_SHARED_TYPE_HW;
+
+	/* capability check */
+	if (perf->state_count <= 1) {
+		pr_err("No P-States\n");
+		result = -ENODEV;
+		goto err_unreg;
+	}
+
+	data->freq_table = kzalloc(sizeof(*data->freq_table) *
+				(perf->state_count+1), GFP_KERNEL);
+	if (!data->freq_table) {
+		result = -ENOMEM;
+		goto err_unreg;
+	}
+
+	policy->cpuinfo.transition_latency = 0;
+	for (i = 0; i < perf->state_count; i++) {
+		/* detect transition latency */
+		if ((perf->states[i].transition_latency * 1000) >
+		    policy->cpuinfo.transition_latency)
+			policy->cpuinfo.transition_latency =
+				perf->states[i].transition_latency * 1000;
+
+		/* initialize the freq table */
+		data->freq_table[valid_states].driver_data = i;
+		data->freq_table[valid_states].frequency =
+				perf->states[i].core_frequency * 1000;
+		valid_states++;
+
+		pr_debug("     %cP%d: %d MHz, %d uS\n",
+			(i == perf->state ? '*' : ' '), i,
+			(u32) perf->states[i].core_frequency,
+			(u32) perf->states[i].transition_latency);
+	}
+
+	data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
+	perf->state = 0;
+
+	result = cpufreq_table_validate_and_show(policy, data->freq_table);
+	if (result)
+		goto err_freqfree;
+
+	pr_debug("CPU%u - SFI performance management activated.\n", cpu);
+
+	return result;
+
+err_freqfree:
+	kfree(data->freq_table);
+err_unreg:
+	sfi_processor_unregister_performance(perf);
+err_free:
+	kfree(data);
+	per_cpu(drv_data, cpu) = NULL;
+
+	return result;
+}
+
+static int sfi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	struct sfi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
+
+	per_cpu(drv_data, policy->cpu) = NULL;
+	sfi_processor_unregister_performance(data->sfi_data);
+	kfree(data->freq_table);
+	kfree(data);
+
+	return 0;
+}
+
+static struct cpufreq_driver sfi_cpufreq_driver = {
+	.flags		= CPUFREQ_CONST_LOOPS,
+	.get		= get_cur_freq_on_cpu,
+	.verify		= cpufreq_generic_frequency_table_verify,
+	.target_index	= sfi_cpufreq_target,
+	.init		= sfi_cpufreq_cpu_init,
+	.exit		= sfi_cpufreq_cpu_exit,
+	.name		= "sfi-cpufreq",
+	.attr		= cpufreq_generic_attr,
+};
+
+static int __init sfi_cpufreq_init(void)
+{
+	sfi_perf_data = alloc_percpu(struct sfi_processor_performance);
+	if (!sfi_perf_data) {
+		pr_err("Memory allocation error for sfi_perf_data.\n");
+		return -ENOMEM;
+	}
+
+	return cpufreq_register_driver(&sfi_cpufreq_driver);
+}
+late_initcall(sfi_cpufreq_init);
+
+static void __exit sfi_cpufreq_exit(void)
+{
+	cpufreq_unregister_driver(&sfi_cpufreq_driver);
+	free_percpu(sfi_perf_data);
+}
+module_exit(sfi_cpufreq_exit);
+
+MODULE_AUTHOR("Vishwesh M Rudramuni <vishwesh.m.rudramuni@intel.com>");
+MODULE_DESCRIPTION("SFI P-States Driver");
+MODULE_LICENSE("GPL");