diff mbox

thermal/intel_powerclamp: convert to smpboot thread

Message ID 1457710701-3014-1-git-send-email-bigeasy@linutronix.de (mailing list archive)
State Not Applicable, archived
Delegated to: Zhang Rui
Headers show

Commit Message

Sebastian Andrzej Siewior March 11, 2016, 3:38 p.m. UTC
Oh boy oh boy. This thing runs at SCHED_FIFO MAX_USER_RT_PRIO/2 and
stops at mwait_idle_with_hints(). Why bother with /2?
There are a few things I haven't fully decoded. For instance why is it
looking at local_softirq_pending()?
The timer is probably here if mwait would let it sleep too long.

I tried to convert it over to smpboot thread so we don't have that CPU
notifier stuff to fire the cpu threads during hotplug events.

the smp_mb() barriers are not documented - I just shifted the code to
the left.

The code / logic itself could be a little more inteligent and only wake
up the threads for the CPUs that are about to idle but it seems it is
done on all of the at once unless I missed something.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Cc: linux-pm@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 drivers/thermal/intel_powerclamp.c | 315 ++++++++++++++++---------------------
 1 file changed, 136 insertions(+), 179 deletions(-)

Comments

Jacob Pan April 6, 2016, 4:47 p.m. UTC | #1
On Fri, 11 Mar 2016 16:38:21 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
Apologize for the delay, I missed it until Rui reminded me.
> Oh boy oh boy. This thing runs at SCHED_FIFO MAX_USER_RT_PRIO/2 and
> stops at mwait_idle_with_hints(). Why bother with /2?
> There are a few things I haven't fully decoded. For instance why is it
> looking at local_softirq_pending()?
the idea is to instrument some kind of quality of service. Idle
injection is a best effort based mechanism. So we don't want to affect
high priority realtime tasks nor softirq.
> The timer is probably here if mwait would let it sleep too long.
> 
not sure i understand. could you explain?
> I tried to convert it over to smpboot thread so we don't have that CPU
> notifier stuff to fire the cpu threads during hotplug events.
> 
there is another patchset to convert it to kthread worker. any
advantage of smpboot thread?
http://comments.gmane.org/gmane.linux.kernel.mm/144964
> the smp_mb() barriers are not documented - I just shifted the code to
> the left.
> 
good point, it was for making control variables visible to other CPUs
immediately.
> The code / logic itself could be a little more inteligent and only
> wake up the threads for the CPUs that are about to idle but it seems
> it is done on all of the at once unless I missed something.
> 
you mean only force idle when there is no natural idle? i thought
about that but hard to coordinate and enforce the duration of each
idle period. Any specific pointers?
> Cc: Zhang Rui <rui.zhang@intel.com>
> Cc: Eduardo Valentin <edubezval@gmail.com>
> Cc: linux-pm@vger.kernel.org
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  drivers/thermal/intel_powerclamp.c | 315
> ++++++++++++++++--------------------- 1 file changed, 136
> insertions(+), 179 deletions(-)
> 
> diff --git a/drivers/thermal/intel_powerclamp.c
> b/drivers/thermal/intel_powerclamp.c index 6c79588251d5..e04f7631426a
> 100644 --- a/drivers/thermal/intel_powerclamp.c
> +++ b/drivers/thermal/intel_powerclamp.c
> @@ -51,6 +51,7 @@
>  #include <linux/debugfs.h>
>  #include <linux/seq_file.h>
>  #include <linux/sched/rt.h>
> +#include <linux/smpboot.h>
>  
>  #include <asm/nmi.h>
>  #include <asm/msr.h>
> @@ -85,9 +86,9 @@ static unsigned int control_cpu; /* The cpu
> assigned to collect stat and update
>  				  * can be offlined.
>  				  */
>  static bool clamping;
> +static DEFINE_PER_CPU(struct task_struct *, clamp_kthreads);
> +static DEFINE_PER_CPU(struct timer_list, clamp_timer);
>  
> -
> -static struct task_struct * __percpu *powerclamp_thread;
>  static struct thermal_cooling_device *cooling_dev;
>  static unsigned long *cpu_clamping_mask;  /* bit map for tracking
> per cpu
>  					   * clamping thread
> @@ -368,100 +369,82 @@ static bool
> powerclamp_adjust_controls(unsigned int target_ratio, return
> set_target_ratio + guard <= current_ratio; }
>  
> -static int clamp_thread(void *arg)
> +static void clamp_thread_fn(unsigned int cpu)
>  {
> -	int cpunr = (unsigned long)arg;
> -	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
> -	static const struct sched_param param = {
> -		.sched_priority = MAX_USER_RT_PRIO/2,
> -	};
>  	unsigned int count = 0;
>  	unsigned int target_ratio;
> +	int sleeptime;
> +	unsigned long target_jiffies;
> +	unsigned int guard;
> +	unsigned int compensation = 0;
> +	int interval; /* jiffies to sleep for each attempt */
> +	unsigned int duration_jiffies = msecs_to_jiffies(duration);
> +	unsigned int window_size_now;
> +	struct timer_list *wake_timer = per_cpu_ptr(&clamp_timer,
> cpu); 
> -	set_bit(cpunr, cpu_clamping_mask);
> -	set_freezable();
> -	init_timer_on_stack(&wakeup_timer);
> -	sched_setscheduler(current, SCHED_FIFO, &param);
> +	/*
> +	 * make sure user selected ratio does not take effect until
> +	 * the next round. adjust target_ratio if user has changed
> +	 * target such that we can converge quickly.
> +	 */
> +	target_ratio = set_target_ratio;
> +	guard = 1 + target_ratio/20;
> +	window_size_now = window_size;
> +	count++;
>  
> -	while (true == clamping && !kthread_should_stop() &&
> -		cpu_online(cpunr)) {
> -		int sleeptime;
> -		unsigned long target_jiffies;
> -		unsigned int guard;
> -		unsigned int compensation = 0;
> -		int interval; /* jiffies to sleep for each attempt */
> -		unsigned int duration_jiffies =
> msecs_to_jiffies(duration);
> -		unsigned int window_size_now;
> +	/*
> +	 * systems may have different ability to enter package level
> +	 * c-states, thus we need to compensate the injected idle
> ratio
> +	 * to achieve the actual target reported by the HW.
> +	 */
> +	compensation = get_compensation(target_ratio);
> +	interval = duration_jiffies*100/(target_ratio+compensation);
>  
> -		try_to_freeze();
> -		/*
> -		 * make sure user selected ratio does not take
> effect until
> -		 * the next round. adjust target_ratio if user has
> changed
> -		 * target such that we can converge quickly.
> -		 */
> -		target_ratio = set_target_ratio;
> -		guard = 1 + target_ratio/20;
> -		window_size_now = window_size;
> -		count++;
> -
> -		/*
> -		 * systems may have different ability to enter
> package level
> -		 * c-states, thus we need to compensate the injected
> idle ratio
> -		 * to achieve the actual target reported by the HW.
> -		 */
> -		compensation = get_compensation(target_ratio);
> -		interval =
> duration_jiffies*100/(target_ratio+compensation); -
> -		/* align idle time */
> -		target_jiffies = roundup(jiffies, interval);
> -		sleeptime = target_jiffies - jiffies;
> -		if (sleeptime <= 0)
> -			sleeptime = 1;
> -		schedule_timeout_interruptible(sleeptime);
> -		/*
> -		 * only elected controlling cpu can collect stats
> and update
> -		 * control parameters.
> -		 */
> -		if (cpunr == control_cpu
> && !(count%window_size_now)) {
> -			should_skip =
> -
> powerclamp_adjust_controls(target_ratio,
> -							guard,
> window_size_now);
> -			smp_mb();
> -		}
> -
> -		if (should_skip)
> -			continue;
> -
> -		target_jiffies = jiffies + duration_jiffies;
> -		mod_timer(&wakeup_timer, target_jiffies);
> -		if (unlikely(local_softirq_pending()))
> -			continue;
> -		/*
> -		 * stop tick sched during idle time, interrupts are
> still
> -		 * allowed. thus jiffies are updated properly.
> -		 */
> -		preempt_disable();
> -		/* mwait until target jiffies is reached */
> -		while (time_before(jiffies, target_jiffies)) {
> -			unsigned long ecx = 1;
> -			unsigned long eax = target_mwait;
> -
> -			/*
> -			 * REVISIT: may call enter_idle() to notify
> drivers who
> -			 * can save power during cpu idle. same for
> exit_idle()
> -			 */
> -			local_touch_nmi();
> -			stop_critical_timings();
> -			mwait_idle_with_hints(eax, ecx);
> -			start_critical_timings();
> -			atomic_inc(&idle_wakeup_counter);
> -		}
> -		preempt_enable();
> +	/* align idle time */
> +	target_jiffies = roundup(jiffies, interval);
> +	sleeptime = target_jiffies - jiffies;
> +	if (sleeptime <= 0)
> +		sleeptime = 1;
> +	schedule_timeout_interruptible(sleeptime);
> +	/*
> +	 * only elected controlling cpu can collect stats and update
> +	 * control parameters.
> +	 */
> +	if (cpu == control_cpu && !(count%window_size_now)) {
> +		should_skip =
> +			powerclamp_adjust_controls(target_ratio,
> +						   guard,
> window_size_now);
> +		smp_mb();
>  	}
> -	del_timer_sync(&wakeup_timer);
> -	clear_bit(cpunr, cpu_clamping_mask);
>  
> -	return 0;
> +	if (should_skip)
> +		return;
> +
> +	target_jiffies = jiffies + duration_jiffies;
> +	mod_timer(wake_timer, target_jiffies);
> +	if (unlikely(local_softirq_pending()))
> +		return;
> +	/*
> +	 * stop tick sched during idle time, interrupts are still
> +	 * allowed. thus jiffies are updated properly.
> +	 */
> +	preempt_disable();
> +	/* mwait until target jiffies is reached */
> +	while (time_before(jiffies, target_jiffies)) {
> +		unsigned long ecx = 1;
> +		unsigned long eax = target_mwait;
> +
> +		/*
> +		 * REVISIT: may call enter_idle() to notify drivers
> who
> +		 * can save power during cpu idle. same for
> exit_idle()
> +		 */
> +		local_touch_nmi();
> +		stop_critical_timings();
> +		mwait_idle_with_hints(eax, ecx);
> +		start_critical_timings();
> +		atomic_inc(&idle_wakeup_counter);
> +	}
> +	preempt_enable();
>  }
>  
>  /*
> @@ -505,10 +488,64 @@ static void poll_pkg_cstate(struct work_struct
> *dummy) schedule_delayed_work(&poll_pkg_cstate_work, HZ);
>  }
>  
> +static void clamp_thread_setup(unsigned int cpu)
> +{
> +	struct timer_list *wake_timer;
> +	static struct sched_param param = {
> +		.sched_priority = MAX_USER_RT_PRIO/2,
> +	};
> +
> +	sched_setscheduler(current, SCHED_FIFO, &param);
> +	wake_timer = per_cpu_ptr(&clamp_timer, cpu);
> +
> +	setup_timer(wake_timer, noop_timer, 0);
> +}
> +
> +static void clamp_thread_unpark(unsigned int cpu)
> +{
> +	set_bit(cpu, cpu_clamping_mask);
> +	if (cpu == 0) {
> +		control_cpu = 0;
> +		smp_mb();
> +	}
> +}
> +
> +static void clamp_thread_park(unsigned int cpu)
> +{
> +	clear_bit(cpu, cpu_clamping_mask);
> +	if (cpu == control_cpu) {
> +		control_cpu = cpumask_any_but(cpu_online_mask, cpu);
> +		smp_mb();
> +	}
> +	del_timer_sync(per_cpu_ptr(&clamp_timer, cpu));
> +}
> +
> +static void clamp_thread_cleanup(unsigned int cpu, bool online)
> +{
> +	if (!online)
> +		return;
> +	clamp_thread_park(cpu);
> +}
> +
> +static int clamp_thread_should_run(unsigned int cpu)
> +{
> +	return clamping == true;
> +}
> +
> +static struct smp_hotplug_thread clamp_threads = {
> +	.store			= &clamp_kthreads,
> +	.setup			= clamp_thread_setup,
> +	.cleanup		= clamp_thread_cleanup,
> +	.thread_should_run	= clamp_thread_should_run,
> +	.thread_fn		= clamp_thread_fn,
> +	.park			= clamp_thread_park,
> +	.unpark			= clamp_thread_unpark,
> +	.thread_comm		= "kidle_inject/%u",
> +};
> +
>  static int start_power_clamp(void)
>  {
> -	unsigned long cpu;
> -	struct task_struct *thread;
> +	unsigned int cpu;
>  
>  	/* check if pkg cstate counter is completely 0, abort in
> this case */ if (!has_pkg_state_counter()) {
> @@ -528,23 +565,9 @@ static int start_power_clamp(void)
>  	clamping = true;
>  	schedule_delayed_work(&poll_pkg_cstate_work, 0);
>  
> -	/* start one thread per online cpu */
> -	for_each_online_cpu(cpu) {
> -		struct task_struct **p =
> -			per_cpu_ptr(powerclamp_thread, cpu);
> +	for_each_online_cpu(cpu)
> +		wake_up_process(per_cpu_ptr(clamp_kthreads, cpu));
>  
> -		thread = kthread_create_on_node(clamp_thread,
> -						(void *) cpu,
> -						cpu_to_node(cpu),
> -						"kidle_inject/%ld",
> cpu);
> -		/* bind to cpu here */
> -		if (likely(!IS_ERR(thread))) {
> -			kthread_bind(thread, cpu);
> -			wake_up_process(thread);
> -			*p = thread;
> -		}
> -
> -	}
>  	put_online_cpus();
>  
>  	return 0;
> @@ -552,9 +575,6 @@ static int start_power_clamp(void)
>  
>  static void end_power_clamp(void)
>  {
> -	int i;
> -	struct task_struct *thread;
> -
>  	clamping = false;
>  	/*
>  	 * make clamping visible to other cpus and give per cpu
> clamping threads @@ -562,63 +582,8 @@ static void
> end_power_clamp(void) */
>  	smp_mb();
>  	msleep(20);
> -	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
> -		for_each_set_bit(i, cpu_clamping_mask,
> num_possible_cpus()) {
> -			pr_debug("clamping thread for cpu %d alive,
> kill\n", i);
> -			thread = *per_cpu_ptr(powerclamp_thread, i);
> -			kthread_stop(thread);
> -		}
> -	}
>  }
>  
> -static int powerclamp_cpu_callback(struct notifier_block *nfb,
> -				unsigned long action, void *hcpu)
> -{
> -	unsigned long cpu = (unsigned long)hcpu;
> -	struct task_struct *thread;
> -	struct task_struct **percpu_thread =
> -		per_cpu_ptr(powerclamp_thread, cpu);
> -
> -	if (false == clamping)
> -		goto exit_ok;
> -
> -	switch (action) {
> -	case CPU_ONLINE:
> -		thread = kthread_create_on_node(clamp_thread,
> -						(void *) cpu,
> -						cpu_to_node(cpu),
> -						"kidle_inject/%lu",
> cpu);
> -		if (likely(!IS_ERR(thread))) {
> -			kthread_bind(thread, cpu);
> -			wake_up_process(thread);
> -			*percpu_thread = thread;
> -		}
> -		/* prefer BSP as controlling CPU */
> -		if (cpu == 0) {
> -			control_cpu = 0;
> -			smp_mb();
> -		}
> -		break;
> -	case CPU_DEAD:
> -		if (test_bit(cpu, cpu_clamping_mask)) {
> -			pr_err("cpu %lu dead but powerclamping
> thread is not\n",
> -				cpu);
> -			kthread_stop(*percpu_thread);
> -		}
> -		if (cpu == control_cpu) {
> -			control_cpu = smp_processor_id();
> -			smp_mb();
> -		}
> -	}
> -
> -exit_ok:
> -	return NOTIFY_OK;
> -}
> -
> -static struct notifier_block powerclamp_cpu_notifier = {
> -	.notifier_call = powerclamp_cpu_callback,
> -};
> -
>  static int powerclamp_get_max_state(struct thermal_cooling_device
> *cdev, unsigned long *state)
>  {
> @@ -788,19 +753,15 @@ static int __init powerclamp_init(void)
>  
>  	/* set default limit, maybe adjusted during runtime based on
> feedback */ window_size = 2;
> -	register_hotcpu_notifier(&powerclamp_cpu_notifier);
> -
> -	powerclamp_thread = alloc_percpu(struct task_struct *);
> -	if (!powerclamp_thread) {
> -		retval = -ENOMEM;
> -		goto exit_unregister;
> -	}
> +	retval = smpboot_register_percpu_thread(&clamp_threads);
> +	if (retval)
> +		goto exit_free;
>  
>  	cooling_dev =
> thermal_cooling_device_register("intel_powerclamp", NULL,
> &powerclamp_cooling_ops); if (IS_ERR(cooling_dev)) {
>  		retval = -ENODEV;
> -		goto exit_free_thread;
> +		goto exit_free_smp_thread;
>  	}
>  
>  	if (!duration)
> @@ -809,11 +770,8 @@ static int __init powerclamp_init(void)
>  	powerclamp_create_debug_files();
>  
>  	return 0;
> -
> -exit_free_thread:
> -	free_percpu(powerclamp_thread);
> -exit_unregister:
> -	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
> +exit_free_smp_thread:
> +	smpboot_unregister_percpu_thread(&clamp_threads);
>  exit_free:
>  	kfree(cpu_clamping_mask);
>  	return retval;
> @@ -822,9 +780,8 @@ module_init(powerclamp_init);
>  
>  static void __exit powerclamp_exit(void)
>  {
> -	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
>  	end_power_clamp();
> -	free_percpu(powerclamp_thread);
> +	smpboot_unregister_percpu_thread(&clamp_threads);
>  	thermal_cooling_device_unregister(cooling_dev);
>  	kfree(cpu_clamping_mask);
>  

[Jacob Pan]
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sebastian Andrzej Siewior April 11, 2016, 12:47 p.m. UTC | #2
On 04/06/2016 06:47 PM, Jacob Pan wrote:
> On Fri, 11 Mar 2016 16:38:21 +0100
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> Apologize for the delay, I missed it until Rui reminded me.
>> Oh boy oh boy. This thing runs at SCHED_FIFO MAX_USER_RT_PRIO/2 and
>> stops at mwait_idle_with_hints(). Why bother with /2?
>> There are a few things I haven't fully decoded. For instance why is it
>> looking at local_softirq_pending()?
> the idea is to instrument some kind of quality of service. Idle
> injection is a best effort based mechanism. So we don't want to affect
> high priority realtime tasks nor softirq.

But you disturb RT tasks with prio 1. Also I am not sure if you see the
sofirq bits. The softirq runs before any (RT) task so the `pending '
should be 0. Unless the work is delegated to ksoftirqd.

>> The timer is probably here if mwait would let it sleep too long.
>>
> not sure i understand. could you explain?

The timer invokes noop_timer() which does nothing so the only thing you
want is the interrupt. Your mwait_idle_with_hints() could let you sleep
say for one second. But the timer ensures that you wake up no later than
100us.

>> I tried to convert it over to smpboot thread so we don't have that CPU
>> notifier stuff to fire the cpu threads during hotplug events.
>>
> there is another patchset to convert it to kthread worker. any
> advantage of smpboot thread?
> http://comments.gmane.org/gmane.linux.kernel.mm/144964

It partly does the same thing except you still have your hotplug
notifier which I wanted to get rid off. However it looks better than
before.
If you do prefer the kworker thingy then please switch from CPU_DEAD to
CPU_DOWN_PREPARE (and add CPU_DOWN_FAILED to CPU_ONLINE).
With those changes I should have no further problem with it :)
Any ETA for (either of those patches) upstream?

>> The code / logic itself could be a little more inteligent and only
>> wake up the threads for the CPUs that are about to idle but it seems
>> it is done on all of the at once unless I missed something.
>>
> you mean only force idle when there is no natural idle? i thought
> about that but hard to coordinate and enforce the duration of each
> idle period. Any specific pointers?

Implement it as an idle driver. So it would be invoked once the system
goes idle as an alternative to (the default) mwait. However the fact
that you (seem to) go idle even if there is work to do seems that you
aim a different goal than idle if there is nothing left.

Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jacob Pan April 11, 2016, 3:19 p.m. UTC | #3
On Mon, 11 Apr 2016 14:47:12 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> On 04/06/2016 06:47 PM, Jacob Pan wrote:
> > On Fri, 11 Mar 2016 16:38:21 +0100
> > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> > Apologize for the delay, I missed it until Rui reminded me.
> >> Oh boy oh boy. This thing runs at SCHED_FIFO MAX_USER_RT_PRIO/2 and
> >> stops at mwait_idle_with_hints(). Why bother with /2?
> >> There are a few things I haven't fully decoded. For instance why
> >> is it looking at local_softirq_pending()?
> > the idea is to instrument some kind of quality of service. Idle
> > injection is a best effort based mechanism. So we don't want to
> > affect high priority realtime tasks nor softirq.
> 
> But you disturb RT tasks with prio 1. Also I am not sure if you see
> the sofirq bits. The softirq runs before any (RT) task so the
> `pending ' should be 0. Unless the work is delegated to ksoftirqd.
> 
I agree softirq runs before RT but there could be a gap between
raise_softirq() (set pending) and run softirq(). So it is possible
(thus unlikely()) our RT task runs after pending bits set but before
softirq runs. correct?
> >> The timer is probably here if mwait would let it sleep too long.
> >>
> > not sure i understand. could you explain?
> 
> The timer invokes noop_timer() which does nothing so the only thing
> you want is the interrupt. Your mwait_idle_with_hints() could let you
> sleep say for one second. But the timer ensures that you wake up no
> later than 100us.
> 
yeah, that is the idea to make sure we don't oversleep. You mean we can
optimize this but avoid extra wake ups? e.g. cancel timer if we already
sleep enough time?
> >> I tried to convert it over to smpboot thread so we don't have that
> >> CPU notifier stuff to fire the cpu threads during hotplug events.
> >>
> > there is another patchset to convert it to kthread worker. any
> > advantage of smpboot thread?
> > http://comments.gmane.org/gmane.linux.kernel.mm/144964
> 
> It partly does the same thing except you still have your hotplug
> notifier which I wanted to get rid off. However it looks better than
> before.
> If you do prefer the kworker thingy then please switch from CPU_DEAD
> to CPU_DOWN_PREPARE (and add CPU_DOWN_FAILED to CPU_ONLINE).
> With those changes I should have no further problem with it :)
> Any ETA for (either of those patches) upstream?
> 
+Petr
I do prefer not to keep track of CPU hotplug events. Let me do some
testing.
> >> The code / logic itself could be a little more inteligent and only
> >> wake up the threads for the CPUs that are about to idle but it
> >> seems it is done on all of the at once unless I missed something.
> >>
> > you mean only force idle when there is no natural idle? i thought
> > about that but hard to coordinate and enforce the duration of each
> > idle period. Any specific pointers?
> 
> Implement it as an idle driver. So it would be invoked once the system
> goes idle as an alternative to (the default) mwait. However the fact
> that you (seem to) go idle even if there is work to do seems that you
> aim a different goal than idle if there is nothing left.
> 
Right, we use the same idle inner loop as the idle task but powerclamp
driver aims at aligned, forced, and controlled idle time to manage
power/thermal envelop.
I also had an attempt to do this in CFS sched class.
https://lkml.org/lkml/2015/11/2/756
As it was suggested to be able to stop scheduler tick during force idle
time (which cannot do in  the current powerclamp code).

Jacob
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sebastian Andrzej Siewior April 11, 2016, 4:04 p.m. UTC | #4
On 04/11/2016 05:19 PM, Jacob Pan wrote:
>> But you disturb RT tasks with prio 1. Also I am not sure if you see
>> the sofirq bits. The softirq runs before any (RT) task so the
>> `pending ' should be 0. Unless the work is delegated to ksoftirqd.
>>
> I agree softirq runs before RT but there could be a gap between
> raise_softirq() (set pending) and run softirq(). So it is possible
> (thus unlikely()) our RT task runs after pending bits set but before
> softirq runs. correct?

If you raise_softirq() then softirqs are run on return from IRQ code.
If raise them while holding a BH lock then then they are run after you
drop the BH lock / enable BH again.
If the softirq processing is deferred to ksoftirqd *then* you see the
pending bits.

>>>> The timer is probably here if mwait would let it sleep too long.
>>>>
>>> not sure i understand. could you explain?
>>
>> The timer invokes noop_timer() which does nothing so the only thing
>> you want is the interrupt. Your mwait_idle_with_hints() could let you
>> sleep say for one second. But the timer ensures that you wake up no
>> later than 100us.
>>
> yeah, that is the idea to make sure we don't oversleep. You mean we can
> optimize this but avoid extra wake ups? e.g. cancel timer if we already
> sleep enough time?

No, just stated / summarized what I *assumed* the timer was doing and
just confirmed it.
I don't see a way how you can cancel the timer. And that is why I
suggest to run as an idle handler because those can sleep endless :)
But since you have RT priority you need to make sure that a process
with lower priority manages to get on the CPU at some point.

>>>> I tried to convert it over to smpboot thread so we don't have that
>>>> CPU notifier stuff to fire the cpu threads during hotplug events.
>>>>
>>> there is another patchset to convert it to kthread worker. any
>>> advantage of smpboot thread?
>>> http://comments.gmane.org/gmane.linux.kernel.mm/144964
>>
>> It partly does the same thing except you still have your hotplug
>> notifier which I wanted to get rid off. However it looks better than
>> before.
>> If you do prefer the kworker thingy then please switch from CPU_DEAD
>> to CPU_DOWN_PREPARE (and add CPU_DOWN_FAILED to CPU_ONLINE).
>> With those changes I should have no further problem with it :)
>> Any ETA for (either of those patches) upstream?
>>
> +Petr
> I do prefer not to keep track of CPU hotplug events. Let me do some
> testing.
Okay. Please keep me posted where you stand on this. If you go for the
kwork series then I will try to make a patch which replaces CPU_DEAD to
CPU_DOWN_PREPARE in order to make it symmetrical (and from what it
looks, there is no need to run at CPU_DEAD time).

>> Implement it as an idle driver. So it would be invoked once the system
>> goes idle as an alternative to (the default) mwait. However the fact
>> that you (seem to) go idle even if there is work to do seems that you
>> aim a different goal than idle if there is nothing left.
>>
> Right, we use the same idle inner loop as the idle task but powerclamp
> driver aims at aligned, forced, and controlled idle time to manage
> power/thermal envelop.
> I also had an attempt to do this in CFS sched class.
> https://lkml.org/lkml/2015/11/2/756
> As it was suggested to be able to stop scheduler tick during force idle
> time (which cannot do in  the current powerclamp code).

Right. So if you have FULL_NO_HZ and an idle opportunity then you won't
be able to sleep for very long. I think you will basically interrupt the
idle loop with your idle loop and then *your* timer / noop_timer wakes
the system to switch over to the idle loop.

> Jacob

Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Petr Mladek April 12, 2016, 8:21 a.m. UTC | #5
On Mon 2016-04-11 18:04:37, Sebastian Andrzej Siewior wrote:
> On 04/11/2016 05:19 PM, Jacob Pan wrote:
> >>>> I tried to convert it over to smpboot thread so we don't have that
> >>>> CPU notifier stuff to fire the cpu threads during hotplug events.
> >>>>
> >>> there is another patchset to convert it to kthread worker. any
> >>> advantage of smpboot thread?
> >>> http://comments.gmane.org/gmane.linux.kernel.mm/144964
> >>
> >> It partly does the same thing except you still have your hotplug
> >> notifier which I wanted to get rid off. However it looks better than
> >> before.
> >> If you do prefer the kworker thingy then please switch from CPU_DEAD
> >> to CPU_DOWN_PREPARE (and add CPU_DOWN_FAILED to CPU_ONLINE).
> >> With those changes I should have no further problem with it :)
> >> Any ETA for (either of those patches) upstream?

It depends on some improvements of the kthread worker API. I hope
that we are getting close. I would like to send another iteration
later this week.

Best Regards,
Petr

> > +Petr
> > I do prefer not to keep track of CPU hotplug events. Let me do some
> > testing.
> Okay. Please keep me posted where you stand on this. If you go for the
> kwork series then I will try to make a patch which replaces CPU_DEAD to
> CPU_DOWN_PREPARE in order to make it symmetrical (and from what it
> looks, there is no need to run at CPU_DEAD time).
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jacob Pan April 13, 2016, 2:20 p.m. UTC | #6
On Mon, 11 Apr 2016 18:04:37 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> If you raise_softirq() then softirqs are run on return from IRQ code.
> If raise them while holding a BH lock then then they are run after you
> drop the BH lock / enable BH again.
> If the softirq processing is deferred to ksoftirqd *then* you see the
> pending bits.
> 
you are right. there is no need to check in the thread context. the
forced idle only yield to softirqs that are not in ksoftirqd.
thanks for pointing it out. i will remove it.
> >>>> The timer is probably here if mwait would let it sleep too long.
> >>>>  
> >>> not sure i understand. could you explain?  
> >>
> >> The timer invokes noop_timer() which does nothing so the only thing
> >> you want is the interrupt. Your mwait_idle_with_hints() could let
> >> you sleep say for one second. But the timer ensures that you wake
> >> up no later than 100us.
> >>  
> > yeah, that is the idea to make sure we don't oversleep. You mean we
> > can optimize this but avoid extra wake ups? e.g. cancel timer if we
> > already sleep enough time?  
> 
> No, just stated / summarized what I *assumed* the timer was doing and
> just confirmed it.
> I don't see a way how you can cancel the timer.
i don't either, you have to wake up first to cancel.
> And that is why I
> suggest to run as an idle handler because those can sleep endless :)
can you be more specific about idle handler? and we don't want to sleep
more than user required length since the goal is to  get all cpus sleep
and wake up the same time to maximize the overlapped idle states.
> But since you have RT priority you need to make sure that a process
> with lower priority manages to get on the CPU at some point.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 6c79588251d5..e04f7631426a 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -51,6 +51,7 @@ 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/sched/rt.h>
+#include <linux/smpboot.h>
 
 #include <asm/nmi.h>
 #include <asm/msr.h>
@@ -85,9 +86,9 @@  static unsigned int control_cpu; /* The cpu assigned to collect stat and update
 				  * can be offlined.
 				  */
 static bool clamping;
+static DEFINE_PER_CPU(struct task_struct *, clamp_kthreads);
+static DEFINE_PER_CPU(struct timer_list, clamp_timer);
 
-
-static struct task_struct * __percpu *powerclamp_thread;
 static struct thermal_cooling_device *cooling_dev;
 static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
 					   * clamping thread
@@ -368,100 +369,82 @@  static bool powerclamp_adjust_controls(unsigned int target_ratio,
 	return set_target_ratio + guard <= current_ratio;
 }
 
-static int clamp_thread(void *arg)
+static void clamp_thread_fn(unsigned int cpu)
 {
-	int cpunr = (unsigned long)arg;
-	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
-	static const struct sched_param param = {
-		.sched_priority = MAX_USER_RT_PRIO/2,
-	};
 	unsigned int count = 0;
 	unsigned int target_ratio;
+	int sleeptime;
+	unsigned long target_jiffies;
+	unsigned int guard;
+	unsigned int compensation = 0;
+	int interval; /* jiffies to sleep for each attempt */
+	unsigned int duration_jiffies = msecs_to_jiffies(duration);
+	unsigned int window_size_now;
+	struct timer_list *wake_timer = per_cpu_ptr(&clamp_timer, cpu);
 
-	set_bit(cpunr, cpu_clamping_mask);
-	set_freezable();
-	init_timer_on_stack(&wakeup_timer);
-	sched_setscheduler(current, SCHED_FIFO, &param);
+	/*
+	 * make sure user selected ratio does not take effect until
+	 * the next round. adjust target_ratio if user has changed
+	 * target such that we can converge quickly.
+	 */
+	target_ratio = set_target_ratio;
+	guard = 1 + target_ratio/20;
+	window_size_now = window_size;
+	count++;
 
-	while (true == clamping && !kthread_should_stop() &&
-		cpu_online(cpunr)) {
-		int sleeptime;
-		unsigned long target_jiffies;
-		unsigned int guard;
-		unsigned int compensation = 0;
-		int interval; /* jiffies to sleep for each attempt */
-		unsigned int duration_jiffies = msecs_to_jiffies(duration);
-		unsigned int window_size_now;
+	/*
+	 * systems may have different ability to enter package level
+	 * c-states, thus we need to compensate the injected idle ratio
+	 * to achieve the actual target reported by the HW.
+	 */
+	compensation = get_compensation(target_ratio);
+	interval = duration_jiffies*100/(target_ratio+compensation);
 
-		try_to_freeze();
-		/*
-		 * make sure user selected ratio does not take effect until
-		 * the next round. adjust target_ratio if user has changed
-		 * target such that we can converge quickly.
-		 */
-		target_ratio = set_target_ratio;
-		guard = 1 + target_ratio/20;
-		window_size_now = window_size;
-		count++;
-
-		/*
-		 * systems may have different ability to enter package level
-		 * c-states, thus we need to compensate the injected idle ratio
-		 * to achieve the actual target reported by the HW.
-		 */
-		compensation = get_compensation(target_ratio);
-		interval = duration_jiffies*100/(target_ratio+compensation);
-
-		/* align idle time */
-		target_jiffies = roundup(jiffies, interval);
-		sleeptime = target_jiffies - jiffies;
-		if (sleeptime <= 0)
-			sleeptime = 1;
-		schedule_timeout_interruptible(sleeptime);
-		/*
-		 * only elected controlling cpu can collect stats and update
-		 * control parameters.
-		 */
-		if (cpunr == control_cpu && !(count%window_size_now)) {
-			should_skip =
-				powerclamp_adjust_controls(target_ratio,
-							guard, window_size_now);
-			smp_mb();
-		}
-
-		if (should_skip)
-			continue;
-
-		target_jiffies = jiffies + duration_jiffies;
-		mod_timer(&wakeup_timer, target_jiffies);
-		if (unlikely(local_softirq_pending()))
-			continue;
-		/*
-		 * stop tick sched during idle time, interrupts are still
-		 * allowed. thus jiffies are updated properly.
-		 */
-		preempt_disable();
-		/* mwait until target jiffies is reached */
-		while (time_before(jiffies, target_jiffies)) {
-			unsigned long ecx = 1;
-			unsigned long eax = target_mwait;
-
-			/*
-			 * REVISIT: may call enter_idle() to notify drivers who
-			 * can save power during cpu idle. same for exit_idle()
-			 */
-			local_touch_nmi();
-			stop_critical_timings();
-			mwait_idle_with_hints(eax, ecx);
-			start_critical_timings();
-			atomic_inc(&idle_wakeup_counter);
-		}
-		preempt_enable();
+	/* align idle time */
+	target_jiffies = roundup(jiffies, interval);
+	sleeptime = target_jiffies - jiffies;
+	if (sleeptime <= 0)
+		sleeptime = 1;
+	schedule_timeout_interruptible(sleeptime);
+	/*
+	 * only elected controlling cpu can collect stats and update
+	 * control parameters.
+	 */
+	if (cpu == control_cpu && !(count%window_size_now)) {
+		should_skip =
+			powerclamp_adjust_controls(target_ratio,
+						   guard, window_size_now);
+		smp_mb();
 	}
-	del_timer_sync(&wakeup_timer);
-	clear_bit(cpunr, cpu_clamping_mask);
 
-	return 0;
+	if (should_skip)
+		return;
+
+	target_jiffies = jiffies + duration_jiffies;
+	mod_timer(wake_timer, target_jiffies);
+	if (unlikely(local_softirq_pending()))
+		return;
+	/*
+	 * stop tick sched during idle time, interrupts are still
+	 * allowed. thus jiffies are updated properly.
+	 */
+	preempt_disable();
+	/* mwait until target jiffies is reached */
+	while (time_before(jiffies, target_jiffies)) {
+		unsigned long ecx = 1;
+		unsigned long eax = target_mwait;
+
+		/*
+		 * REVISIT: may call enter_idle() to notify drivers who
+		 * can save power during cpu idle. same for exit_idle()
+		 */
+		local_touch_nmi();
+		stop_critical_timings();
+		mwait_idle_with_hints(eax, ecx);
+		start_critical_timings();
+		atomic_inc(&idle_wakeup_counter);
+	}
+	preempt_enable();
 }
 
 /*
@@ -505,10 +488,64 @@  static void poll_pkg_cstate(struct work_struct *dummy)
 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 }
 
+static void clamp_thread_setup(unsigned int cpu)
+{
+	struct timer_list *wake_timer;
+	static struct sched_param param = {
+		.sched_priority = MAX_USER_RT_PRIO/2,
+	};
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	wake_timer = per_cpu_ptr(&clamp_timer, cpu);
+
+	setup_timer(wake_timer, noop_timer, 0);
+}
+
+static void clamp_thread_unpark(unsigned int cpu)
+{
+	set_bit(cpu, cpu_clamping_mask);
+	if (cpu == 0) {
+		control_cpu = 0;
+		smp_mb();
+	}
+}
+
+static void clamp_thread_park(unsigned int cpu)
+{
+	clear_bit(cpu, cpu_clamping_mask);
+	if (cpu == control_cpu) {
+		control_cpu = cpumask_any_but(cpu_online_mask, cpu);
+		smp_mb();
+	}
+	del_timer_sync(per_cpu_ptr(&clamp_timer, cpu));
+}
+
+static void clamp_thread_cleanup(unsigned int cpu, bool online)
+{
+	if (!online)
+		return;
+	clamp_thread_park(cpu);
+}
+
+static int clamp_thread_should_run(unsigned int cpu)
+{
+	return clamping == true;
+}
+
+static struct smp_hotplug_thread clamp_threads = {
+	.store			= &clamp_kthreads,
+	.setup			= clamp_thread_setup,
+	.cleanup		= clamp_thread_cleanup,
+	.thread_should_run	= clamp_thread_should_run,
+	.thread_fn		= clamp_thread_fn,
+	.park			= clamp_thread_park,
+	.unpark			= clamp_thread_unpark,
+	.thread_comm		= "kidle_inject/%u",
+};
+
 static int start_power_clamp(void)
 {
-	unsigned long cpu;
-	struct task_struct *thread;
+	unsigned int cpu;
 
 	/* check if pkg cstate counter is completely 0, abort in this case */
 	if (!has_pkg_state_counter()) {
@@ -528,23 +565,9 @@  static int start_power_clamp(void)
 	clamping = true;
 	schedule_delayed_work(&poll_pkg_cstate_work, 0);
 
-	/* start one thread per online cpu */
-	for_each_online_cpu(cpu) {
-		struct task_struct **p =
-			per_cpu_ptr(powerclamp_thread, cpu);
+	for_each_online_cpu(cpu)
+		wake_up_process(per_cpu_ptr(clamp_kthreads, cpu));
 
-		thread = kthread_create_on_node(clamp_thread,
-						(void *) cpu,
-						cpu_to_node(cpu),
-						"kidle_inject/%ld", cpu);
-		/* bind to cpu here */
-		if (likely(!IS_ERR(thread))) {
-			kthread_bind(thread, cpu);
-			wake_up_process(thread);
-			*p = thread;
-		}
-
-	}
 	put_online_cpus();
 
 	return 0;
@@ -552,9 +575,6 @@  static int start_power_clamp(void)
 
 static void end_power_clamp(void)
 {
-	int i;
-	struct task_struct *thread;
-
 	clamping = false;
 	/*
 	 * make clamping visible to other cpus and give per cpu clamping threads
@@ -562,63 +582,8 @@  static void end_power_clamp(void)
 	 */
 	smp_mb();
 	msleep(20);
-	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
-		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
-			pr_debug("clamping thread for cpu %d alive, kill\n", i);
-			thread = *per_cpu_ptr(powerclamp_thread, i);
-			kthread_stop(thread);
-		}
-	}
 }
 
-static int powerclamp_cpu_callback(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
-{
-	unsigned long cpu = (unsigned long)hcpu;
-	struct task_struct *thread;
-	struct task_struct **percpu_thread =
-		per_cpu_ptr(powerclamp_thread, cpu);
-
-	if (false == clamping)
-		goto exit_ok;
-
-	switch (action) {
-	case CPU_ONLINE:
-		thread = kthread_create_on_node(clamp_thread,
-						(void *) cpu,
-						cpu_to_node(cpu),
-						"kidle_inject/%lu", cpu);
-		if (likely(!IS_ERR(thread))) {
-			kthread_bind(thread, cpu);
-			wake_up_process(thread);
-			*percpu_thread = thread;
-		}
-		/* prefer BSP as controlling CPU */
-		if (cpu == 0) {
-			control_cpu = 0;
-			smp_mb();
-		}
-		break;
-	case CPU_DEAD:
-		if (test_bit(cpu, cpu_clamping_mask)) {
-			pr_err("cpu %lu dead but powerclamping thread is not\n",
-				cpu);
-			kthread_stop(*percpu_thread);
-		}
-		if (cpu == control_cpu) {
-			control_cpu = smp_processor_id();
-			smp_mb();
-		}
-	}
-
-exit_ok:
-	return NOTIFY_OK;
-}
-
-static struct notifier_block powerclamp_cpu_notifier = {
-	.notifier_call = powerclamp_cpu_callback,
-};
-
 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
@@ -788,19 +753,15 @@  static int __init powerclamp_init(void)
 
 	/* set default limit, maybe adjusted during runtime based on feedback */
 	window_size = 2;
-	register_hotcpu_notifier(&powerclamp_cpu_notifier);
-
-	powerclamp_thread = alloc_percpu(struct task_struct *);
-	if (!powerclamp_thread) {
-		retval = -ENOMEM;
-		goto exit_unregister;
-	}
+	retval = smpboot_register_percpu_thread(&clamp_threads);
+	if (retval)
+		goto exit_free;
 
 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
 						&powerclamp_cooling_ops);
 	if (IS_ERR(cooling_dev)) {
 		retval = -ENODEV;
-		goto exit_free_thread;
+		goto exit_free_smp_thread;
 	}
 
 	if (!duration)
@@ -809,11 +770,8 @@  static int __init powerclamp_init(void)
 	powerclamp_create_debug_files();
 
 	return 0;
-
-exit_free_thread:
-	free_percpu(powerclamp_thread);
-exit_unregister:
-	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
+exit_free_smp_thread:
+	smpboot_unregister_percpu_thread(&clamp_threads);
 exit_free:
 	kfree(cpu_clamping_mask);
 	return retval;
@@ -822,9 +780,8 @@  module_init(powerclamp_init);
 
 static void __exit powerclamp_exit(void)
 {
-	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 	end_power_clamp();
-	free_percpu(powerclamp_thread);
+	smpboot_unregister_percpu_thread(&clamp_threads);
 	thermal_cooling_device_unregister(cooling_dev);
 	kfree(cpu_clamping_mask);