Message ID | 1476707572-32215-4-git-send-email-pmladek@suse.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Zhang Rui |
Headers | show |
On Mon, 17 Oct 2016 14:32:52 +0200 Petr Mladek <pmladek@suse.com> wrote: > From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > Hi Sebastian, I applied this patchset on 4.9-rc1 and run some cpu online/offline loops test while injecting idle, e.g. 25%. I got system hang after a few cycles. Still looking into root cause. Thanks, Jacob > This is a conversation to the new hotplug state machine with > the difference that CPU_DEAD becomes CPU_PREDOWN. > > At the same time it makes the handling of the two states symmetrical. > stop_power_clamp_worker() is called unconditionally and the > controversial error message is removed. > > Finally, the hotplug state callbacks are removed after the > powerclamping is stopped to avoid a potential race. > > Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > [pmladek@suse.com: Fixed the possible race in powerclamp_exit()] > Signed-off-by: Petr Mladek <pmladek@suse.com> > --- > drivers/thermal/intel_powerclamp.c | 69 > +++++++++++++++++++------------------- 1 file changed, 35 > insertions(+), 34 deletions(-) > > diff --git a/drivers/thermal/intel_powerclamp.c > b/drivers/thermal/intel_powerclamp.c index a94f7c849a4e..390e50b97324 > 100644 --- a/drivers/thermal/intel_powerclamp.c > +++ b/drivers/thermal/intel_powerclamp.c > @@ -622,43 +622,35 @@ static void end_power_clamp(void) > } > } > > -static int powerclamp_cpu_callback(struct notifier_block *nfb, > - unsigned long action, void *hcpu) > +static int powerclamp_cpu_online(unsigned int cpu) > { > - unsigned long cpu = (unsigned long)hcpu; > + if (clamping == false) > + return 0; > + start_power_clamp_worker(cpu); > + /* prefer BSP as controlling CPU */ > + if (cpu == 0) { > + control_cpu = 0; > + smp_mb(); > + } > + return 0; > +} > > - if (false == clamping) > - goto exit_ok; > +static int powerclamp_cpu_predown(unsigned int cpu) > +{ > + if (clamping == false) > + return 0; > > - switch (action) { > - case CPU_ONLINE: > - start_power_clamp_worker(cpu); > - /* prefer BSP as controlling CPU */ > - if (cpu == 0) { > - control_cpu = 0; > - smp_mb(); > - } > - break; > - case CPU_DEAD: > - if (test_bit(cpu, cpu_clamping_mask)) { > - pr_err("cpu %lu dead but powerclamping > thread is not\n", > - cpu); > - stop_power_clamp_worker(cpu); > - } > - if (cpu == control_cpu) { > - control_cpu = smp_processor_id(); > - smp_mb(); > - } > - } > + stop_power_clamp_worker(cpu); > + if (cpu != control_cpu) > + return 0; > > -exit_ok: > - return NOTIFY_OK; > + control_cpu = cpumask_first(cpu_online_mask); > + if (control_cpu == cpu) > + control_cpu = cpumask_next(cpu, cpu_online_mask); > + smp_mb(); > + return 0; > } > > -static struct notifier_block powerclamp_cpu_notifier = { > - .notifier_call = powerclamp_cpu_callback, > -}; > - > static int powerclamp_get_max_state(struct thermal_cooling_device > *cdev, unsigned long *state) > { > @@ -788,6 +780,8 @@ static inline void > powerclamp_create_debug_files(void) > debugfs_remove_recursive(debug_dir); } > > +static enum cpuhp_state hp_state; > + > static int __init powerclamp_init(void) > { > int retval; > @@ -805,7 +799,14 @@ static int __init powerclamp_init(void) > > /* set default limit, maybe adjusted during runtime based on > feedback */ window_size = 2; > - register_hotcpu_notifier(&powerclamp_cpu_notifier); > + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, > + > "thermal/intel_powerclamp:online", > + powerclamp_cpu_online, > + powerclamp_cpu_predown); > + if (retval < 0) > + goto exit_free; > + > + hp_state = retval; > > worker_data = alloc_percpu(struct powerclamp_worker_data); > if (!worker_data) { > @@ -830,7 +831,7 @@ static int __init powerclamp_init(void) > exit_free_thread: > free_percpu(worker_data); > exit_unregister: > - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); > + cpuhp_remove_state_nocalls(hp_state); > exit_free: > kfree(cpu_clamping_mask); > return retval; > @@ -839,8 +840,8 @@ static int __init powerclamp_init(void) > > static void __exit powerclamp_exit(void) > { > - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); > end_power_clamp(); > + cpuhp_remove_state_nocalls(hp_state); > free_percpu(worker_data); > thermal_cooling_device_unregister(cooling_dev); > kfree(cpu_clamping_mask); [Jacob Pan] -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri 2016-10-21 13:21:18, Jacob Pan wrote: > On Mon, 17 Oct 2016 14:32:52 +0200 > Petr Mladek <pmladek@suse.com> wrote: > > > From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > > > Hi Sebastian, > I applied this patchset on 4.9-rc1 and run some cpu online/offline > loops test while injecting idle, e.g. 25%. I got system hang after a > few cycles. Still looking into root cause. You might need the patch ("sched/fair: Fix sched domains NULL deference in select_idle_sibling()") from linux-tip, see https://lkml.kernel.org/r/tip-9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441@git.kernel.org I have mentioned it in the cover letter. I am sorry if it was not visible enough. Best Regards, Petr -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 24 Oct 2016 17:48:07 +0200 Petr Mladek <pmladek@suse.com> wrote: > On Fri 2016-10-21 13:21:18, Jacob Pan wrote: > > On Mon, 17 Oct 2016 14:32:52 +0200 > > Petr Mladek <pmladek@suse.com> wrote: > > > > > From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > > > > > Hi Sebastian, > > I applied this patchset on 4.9-rc1 and run some cpu online/offline > > loops test while injecting idle, e.g. 25%. I got system hang after a > > few cycles. Still looking into root cause. > > You might need the patch > ("sched/fair: Fix sched domains NULL deference in > select_idle_sibling()") from linux-tip, see > https://lkml.kernel.org/r/tip-9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441@git.kernel.org > > I have mentioned it in the cover letter. I am sorry if it was not > visible enough. Thanks for point it out again, I missed it. Now rc2 has this fix so I tried again cpuhp seems to work well now. However, I found suspend to ram does not work with this patchset. Have you tested it with "echo mem > /sys/power/state" while doing idle injection? It fails to enter s3 on my system. -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon 2016-10-24 09:55:29, Jacob Pan wrote: > On Mon, 24 Oct 2016 17:48:07 +0200 > Petr Mladek <pmladek@suse.com> wrote: > > > On Fri 2016-10-21 13:21:18, Jacob Pan wrote: > > > On Mon, 17 Oct 2016 14:32:52 +0200 > > > Petr Mladek <pmladek@suse.com> wrote: > > > > > > > From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > > > > > > > Hi Sebastian, > > > I applied this patchset on 4.9-rc1 and run some cpu online/offline > > > loops test while injecting idle, e.g. 25%. I got system hang after a > > > few cycles. Still looking into root cause. > > > > You might need the patch > > ("sched/fair: Fix sched domains NULL deference in > > select_idle_sibling()") from linux-tip, see > > https://lkml.kernel.org/r/tip-9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441@git.kernel.org > > > > I have mentioned it in the cover letter. I am sorry if it was not > > visible enough. > Thanks for point it out again, I missed it. > > Now rc2 has this fix so I tried again cpuhp seems to work well now. > > However, I found suspend to ram does not work with this patchset. Have > you tested it with "echo mem > /sys/power/state" while doing idle > injection? > It fails to enter s3 on my system. Hmm, I haven't tested the system suspend. I am sorry but I am snowed under some other tasks this week and will be on the Plumbers conference the next week. I hope that I will have some time to look at it then. In each case, I wonder if the problem is caused by the conversion to the kthread worker or by the CPU hotplug state conversion. Best Regards, Petr -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2016-10-27 16:53:48 [+0200], Petr Mladek wrote: > > In each case, I wonder if the problem is caused by the conversion > to the kthread worker or by the CPU hotplug state conversion. drop the hotplug patch and you will see. > Best Regards, > Petr Sebastian -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index a94f7c849a4e..390e50b97324 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -622,43 +622,35 @@ static void end_power_clamp(void) } } -static int powerclamp_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int powerclamp_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; + if (clamping == false) + return 0; + start_power_clamp_worker(cpu); + /* prefer BSP as controlling CPU */ + if (cpu == 0) { + control_cpu = 0; + smp_mb(); + } + return 0; +} - if (false == clamping) - goto exit_ok; +static int powerclamp_cpu_predown(unsigned int cpu) +{ + if (clamping == false) + return 0; - switch (action) { - case CPU_ONLINE: - start_power_clamp_worker(cpu); - /* prefer BSP as controlling CPU */ - if (cpu == 0) { - control_cpu = 0; - smp_mb(); - } - break; - case CPU_DEAD: - if (test_bit(cpu, cpu_clamping_mask)) { - pr_err("cpu %lu dead but powerclamping thread is not\n", - cpu); - stop_power_clamp_worker(cpu); - } - if (cpu == control_cpu) { - control_cpu = smp_processor_id(); - smp_mb(); - } - } + stop_power_clamp_worker(cpu); + if (cpu != control_cpu) + return 0; -exit_ok: - return NOTIFY_OK; + control_cpu = cpumask_first(cpu_online_mask); + if (control_cpu == cpu) + control_cpu = cpumask_next(cpu, cpu_online_mask); + smp_mb(); + return 0; } -static struct notifier_block powerclamp_cpu_notifier = { - .notifier_call = powerclamp_cpu_callback, -}; - static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) { @@ -788,6 +780,8 @@ static inline void powerclamp_create_debug_files(void) debugfs_remove_recursive(debug_dir); } +static enum cpuhp_state hp_state; + static int __init powerclamp_init(void) { int retval; @@ -805,7 +799,14 @@ static int __init powerclamp_init(void) /* set default limit, maybe adjusted during runtime based on feedback */ window_size = 2; - register_hotcpu_notifier(&powerclamp_cpu_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "thermal/intel_powerclamp:online", + powerclamp_cpu_online, + powerclamp_cpu_predown); + if (retval < 0) + goto exit_free; + + hp_state = retval; worker_data = alloc_percpu(struct powerclamp_worker_data); if (!worker_data) { @@ -830,7 +831,7 @@ static int __init powerclamp_init(void) exit_free_thread: free_percpu(worker_data); exit_unregister: - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); + cpuhp_remove_state_nocalls(hp_state); exit_free: kfree(cpu_clamping_mask); return retval; @@ -839,8 +840,8 @@ static int __init powerclamp_init(void) static void __exit powerclamp_exit(void) { - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); end_power_clamp(); + cpuhp_remove_state_nocalls(hp_state); free_percpu(worker_data); thermal_cooling_device_unregister(cooling_dev); kfree(cpu_clamping_mask);