diff mbox

[v2,2/2] intel_pstate: Set core to min P state during core offline

Message ID 1394831037-15553-3-git-send-email-dirk.j.brandewie@intel.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

dirk.brandewie@gmail.com March 14, 2014, 9:03 p.m. UTC
From: Dirk Brandewie <dirk.j.brandewie@intel.com>

Change to use ->stop() callback to do clean up during CPU
hotplug. The requested P state for an offline core will be used by the
hardware coordination function to select the package P state. If the
core is under load when it is offlined it will fix the package P state
floor to the requested P state of offline core.

Reported-by: Patrick Marlier <patrick.marlier@gmail.com>
Signed-off-by: Dirk Brandewie <dirk.j.brandewie@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

Comments

Viresh Kumar March 18, 2014, 5:44 a.m. UTC | #1
On Sat, Mar 15, 2014 at 2:33 AM,  <dirk.brandewie@gmail.com> wrote:
> From: Dirk Brandewie <dirk.j.brandewie@intel.com>
>
> Change to use ->stop() callback to do clean up during CPU
> hotplug. The requested P state for an offline core will be used by the
> hardware coordination function to select the package P state. If the
> core is under load when it is offlined it will fix the package P state
> floor to the requested P state of offline core.
>
> Reported-by: Patrick Marlier <patrick.marlier@gmail.com>
> Signed-off-by: Dirk Brandewie <dirk.j.brandewie@intel.com>
> ---
>  drivers/cpufreq/intel_pstate.c | 20 +++++++++++++-------
>  1 file changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 2cd36b9..e9092fd 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -447,7 +447,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate)
>         if (limits.no_turbo)
>                 val |= (u64)1 << 32;
>
> -       wrmsrl(MSR_IA32_PERF_CTL, val);
> +       wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
>  }
>
>  static struct cpu_defaults core_params = {
> @@ -773,16 +773,22 @@ static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
>         return 0;
>  }
>
> -static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
> +static int intel_pstate_cpu_stop(struct cpufreq_policy *policy)
>  {
> -       int cpu = policy->cpu;
> +       int cpu_num = policy->cpu;
> +       struct cpudata *cpu = all_cpu_data[cpu_num];
> +
> +       pr_info("intel_pstate CPU %d exiting\n", cpu_num);
> +
> +       del_timer(&all_cpu_data[cpu_num]->timer);
> +       intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
> +       kfree(all_cpu_data[cpu_num]);
> +       all_cpu_data[cpu_num] = NULL;
>
> -       del_timer(&all_cpu_data[cpu]->timer);
> -       kfree(all_cpu_data[cpu]);
> -       all_cpu_data[cpu] = NULL;
>         return 0;
>  }
>
> +
>  static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>  {
>         struct cpudata *cpu;
> @@ -818,7 +824,7 @@ static struct cpufreq_driver intel_pstate_driver = {
>         .setpolicy      = intel_pstate_set_policy,
>         .get            = intel_pstate_get,
>         .init           = intel_pstate_cpu_init,
> -       .exit           = intel_pstate_cpu_exit,
> +       .stop           = intel_pstate_cpu_stop,

Probably, keep exit as is and only change P-state in stop(). So that
allocation of resources happen in init() and they are freed in exit()?
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
dirk.brandewie@gmail.com March 18, 2014, 3:01 p.m. UTC | #2
On 03/17/2014 10:44 PM, Viresh Kumar wrote:
> On Sat, Mar 15, 2014 at 2:33 AM,  <dirk.brandewie@gmail.com> wrote:
>> +
>>   static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>>   {
>>          struct cpudata *cpu;
>> @@ -818,7 +824,7 @@ static struct cpufreq_driver intel_pstate_driver = {
>>          .setpolicy      = intel_pstate_set_policy,
>>          .get            = intel_pstate_get,
>>          .init           = intel_pstate_cpu_init,
>> -       .exit           = intel_pstate_cpu_exit,
>> +       .stop           = intel_pstate_cpu_stop,
>
> Probably, keep exit as is and only change P-state in stop(). So that
> allocation of resources happen in init() and they are freed in exit()?
>
I looked at doing just that but it junked up the code.  if stop() is called
during PREPARE then init() will be called via __cpufreq_add_dev() in the ONLINE
and DOWN_FAILED case. So once stop() is called the driver will be ready for
init() to be called exactly like when exit() is called.


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat March 18, 2014, 6:52 p.m. UTC | #3
On 03/18/2014 08:31 PM, Dirk Brandewie wrote:
> On 03/17/2014 10:44 PM, Viresh Kumar wrote:
>> On Sat, Mar 15, 2014 at 2:33 AM,  <dirk.brandewie@gmail.com> wrote:
>>> +
>>>   static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>>>   {
>>>          struct cpudata *cpu;
>>> @@ -818,7 +824,7 @@ static struct cpufreq_driver intel_pstate_driver = {
>>>          .setpolicy      = intel_pstate_set_policy,
>>>          .get            = intel_pstate_get,
>>>          .init           = intel_pstate_cpu_init,
>>> -       .exit           = intel_pstate_cpu_exit,
>>> +       .stop           = intel_pstate_cpu_stop,
>>
>> Probably, keep exit as is and only change P-state in stop(). So that
>> allocation of resources happen in init() and they are freed in exit()?
>>
> I looked at doing just that but it junked up the code.  if stop() is called
> during PREPARE then init() will be called via __cpufreq_add_dev() in the
> ONLINE
> and DOWN_FAILED case. So once stop() is called the driver will be ready for
> init() to be called exactly like when exit() is called.
>

I'm sorry, but that didn't make much sense to me. Can you be a little
more specific as to what problems you hit while trying to have a
->stop() which sets min P state and a separate ->exit() which frees
the resources? I think we can achieve this with almost no trouble.

If you ignore the failure case (such as DOWN_FAILED) for now, do you
still see any serious roadblocks?

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
dirk.brandewie@gmail.com March 18, 2014, 7:44 p.m. UTC | #4
On 03/18/2014 11:52 AM, Srivatsa S. Bhat wrote:
> On 03/18/2014 08:31 PM, Dirk Brandewie wrote:
>> On 03/17/2014 10:44 PM, Viresh Kumar wrote:
>>> On Sat, Mar 15, 2014 at 2:33 AM,  <dirk.brandewie@gmail.com> wrote:
>>>> +
>>>>    static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>>>>    {
>>>>           struct cpudata *cpu;
>>>> @@ -818,7 +824,7 @@ static struct cpufreq_driver intel_pstate_driver = {
>>>>           .setpolicy      = intel_pstate_set_policy,
>>>>           .get            = intel_pstate_get,
>>>>           .init           = intel_pstate_cpu_init,
>>>> -       .exit           = intel_pstate_cpu_exit,
>>>> +       .stop           = intel_pstate_cpu_stop,
>>>
>>> Probably, keep exit as is and only change P-state in stop(). So that
>>> allocation of resources happen in init() and they are freed in exit()?
>>>
>> I looked at doing just that but it junked up the code.  if stop() is called
>> during PREPARE then init() will be called via __cpufreq_add_dev() in the
>> ONLINE
>> and DOWN_FAILED case. So once stop() is called the driver will be ready for
>> init() to be called exactly like when exit() is called.
>>
>
> I'm sorry, but that didn't make much sense to me. Can you be a little
> more specific as to what problems you hit while trying to have a
> ->stop() which sets min P state and a separate ->exit() which frees
> the resources? I think we can achieve this with almost no trouble.
>

There was no problem per se.  In stop() all I really needed to do is stop the
timer and set the P state to MIN.

At init time I need to allocate memory and start timer.  If stopping the timer
and deallocating memory are separated then I need code in init() to detect
this case.

Moving all the clean up to stop() make my code simpler, covers the
failure case and maintains the behaviour expected by the core.

> If you ignore the failure case (such as DOWN_FAILED) for now, do you
> still see any serious roadblocks?

Why would I ignore a valid failure case?

>
> Regards,
> Srivatsa S. Bhat
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat March 18, 2014, 8:15 p.m. UTC | #5
On 03/19/2014 01:14 AM, Dirk Brandewie wrote:
> On 03/18/2014 11:52 AM, Srivatsa S. Bhat wrote:
>> On 03/18/2014 08:31 PM, Dirk Brandewie wrote:
>>> On 03/17/2014 10:44 PM, Viresh Kumar wrote:
>>>> On Sat, Mar 15, 2014 at 2:33 AM,  <dirk.brandewie@gmail.com> wrote:
>>>>> +
>>>>>    static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
>>>>>    {
>>>>>           struct cpudata *cpu;
>>>>> @@ -818,7 +824,7 @@ static struct cpufreq_driver
>>>>> intel_pstate_driver = {
>>>>>           .setpolicy      = intel_pstate_set_policy,
>>>>>           .get            = intel_pstate_get,
>>>>>           .init           = intel_pstate_cpu_init,
>>>>> -       .exit           = intel_pstate_cpu_exit,
>>>>> +       .stop           = intel_pstate_cpu_stop,
>>>>
>>>> Probably, keep exit as is and only change P-state in stop(). So that
>>>> allocation of resources happen in init() and they are freed in exit()?
>>>>
>>> I looked at doing just that but it junked up the code.  if stop() is
>>> called
>>> during PREPARE then init() will be called via __cpufreq_add_dev() in the
>>> ONLINE
>>> and DOWN_FAILED case. So once stop() is called the driver will be
>>> ready for
>>> init() to be called exactly like when exit() is called.
>>>
>>
>> I'm sorry, but that didn't make much sense to me. Can you be a little
>> more specific as to what problems you hit while trying to have a
>> ->stop() which sets min P state and a separate ->exit() which frees
>> the resources? I think we can achieve this with almost no trouble.
>>
> 
> There was no problem per se.  In stop() all I really needed to do is
> stop the
> timer and set the P state to MIN.
> 
> At init time I need to allocate memory and start timer.  If stopping the
> timer
> and deallocating memory are separated then I need code in init() to detect
> this case.
> 
> Moving all the clean up to stop() make my code simpler, covers the
> failure case and maintains the behaviour expected by the core.
> 
>> If you ignore the failure case (such as DOWN_FAILED) for now, do you
>> still see any serious roadblocks?
> 
> Why would I ignore a valid failure case?
> 

Of course you shouldn't ignore it. I was just trying to make it easier
to think about the design without complicating it with arcane failure
cases right at the outset, that's all.

Now that I looked at it again, I see your point. The problem is that
by the DOWN_PREPARE stage, the core would have completed only half the
tear-down (via __cpufreq_remove_dev_prepare()), but on failure, it tries
to do a full init (via __cpufreq_add_dev()). I would say that's actually
not a great design from the cpufreq core perspective, but perhaps we can
fix it at a later point in time if it is that painful to endure.

So yes, now I understand see why you do all the teardown in ->stop(),
to workaround the somewhat inconvenient rollback performed by the
cpufreq core. Your approach looks good to me.

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar March 19, 2014, 5:20 a.m. UTC | #6
On 19 March 2014 01:14, Dirk Brandewie <dirk.brandewie@gmail.com> wrote:
> There was no problem per se.  In stop() all I really needed to do is stop
> the
> timer and set the P state to MIN.
>
> At init time I need to allocate memory and start timer.  If stopping the
> timer
> and deallocating memory are separated then I need code in init() to detect
> this case.

Sorry, I didn't understood what exactly is special here :(

If we return failure from CPU_POST_DEAD for some reason without
calling exit() then you will have memory leak in your init() as we are
allocating memory without checking if we already have that (nothing wrong
in it though as other parts of kernel should handle things properly here).

Probably the situation would be exactly same if we divide the exit path into
stop and exit routines, which I still feel is the right way forward. Because
ideally cpufreq shouldn't call init() if it hasn't called exit() (If
it is doing that
right now then its wrong and can be fixed). And so you must do the cleanup
in exit()..
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
dirk.brandewie@gmail.com March 19, 2014, 3:32 p.m. UTC | #7
On 03/18/2014 10:20 PM, Viresh Kumar wrote:
> On 19 March 2014 01:14, Dirk Brandewie <dirk.brandewie@gmail.com> wrote:
>> There was no problem per se.  In stop() all I really needed to do is stop
>> the
>> timer and set the P state to MIN.
>>
>> At init time I need to allocate memory and start timer.  If stopping the
>> timer
>> and deallocating memory are separated then I need code in init() to detect
>> this case.
>
> Sorry, I didn't understood what exactly is special here :(
>
> If we return failure from CPU_POST_DEAD for some reason without
> calling exit() then you will have memory leak in your init() as we are
> allocating memory without checking if we already have that (nothing wrong
> in it though as other parts of kernel should handle things properly here).

No.  If you got the CPU_POST_DEAD callback CPU_DOWN_PREPARE has already
succeeded.  init() is called on the CPU_ONLINE and CPU_DOWN_FAILED path.

The issue is there is a two part teardown that can fail and the teardown
fail will be followed by a call to init().

If the timer is not running (stopped in stop()) then there is no reason to
have the memory around. If CPU_DOWN_PREPARE happens followed by CPU_DOWN_FAILED
then intel_pstate is ready for init() to be called with no special case
code.  This maintains the semantics the core expects.


>
> Probably the situation would be exactly same if we divide the exit path into
> stop and exit routines, which I still feel is the right way forward. Because
> ideally cpufreq shouldn't call init() if it hasn't called exit() (If
> it is doing that
> right now then its wrong and can be fixed). And so you must do the cleanup
> in exit()..
>

The core *is* doing this on the CPU_DOWN_FAILED path ATM.

On the CPU_DOWN_FAILED path the core should be undoing the work it did in the
CPU_DOWN_PREPARE path this would require another callback to drivers to let
them "restart" after a call to stop() as well.

I don't think it is worth that level of effort IMHO.

--Dirk
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 2cd36b9..e9092fd 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -447,7 +447,7 @@  static void core_set_pstate(struct cpudata *cpudata, int pstate)
 	if (limits.no_turbo)
 		val |= (u64)1 << 32;
 
-	wrmsrl(MSR_IA32_PERF_CTL, val);
+	wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
 }
 
 static struct cpu_defaults core_params = {
@@ -773,16 +773,22 @@  static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+static int intel_pstate_cpu_stop(struct cpufreq_policy *policy)
 {
-	int cpu = policy->cpu;
+	int cpu_num = policy->cpu;
+	struct cpudata *cpu = all_cpu_data[cpu_num];
+
+	pr_info("intel_pstate CPU %d exiting\n", cpu_num);
+
+	del_timer(&all_cpu_data[cpu_num]->timer);
+	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
+	kfree(all_cpu_data[cpu_num]);
+	all_cpu_data[cpu_num] = NULL;
 
-	del_timer(&all_cpu_data[cpu]->timer);
-	kfree(all_cpu_data[cpu]);
-	all_cpu_data[cpu] = NULL;
 	return 0;
 }
 
+
 static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 {
 	struct cpudata *cpu;
@@ -818,7 +824,7 @@  static struct cpufreq_driver intel_pstate_driver = {
 	.setpolicy	= intel_pstate_set_policy,
 	.get		= intel_pstate_get,
 	.init		= intel_pstate_cpu_init,
-	.exit		= intel_pstate_cpu_exit,
+	.stop		= intel_pstate_cpu_stop,
 	.name		= "intel_pstate",
 };