[3/6] psi: eliminate lazy clock mode

Message ID 20181214171508.7791-4-surenb@google.com (mailing list archive)
State New, archived
Series psi: pressure stall monitors

Commit Message

Suren Baghdasaryan Dec. 14, 2018, 5:15 p.m. UTC
From: Johannes Weiner <hannes@cmpxchg.org>

psi currently stops its periodic 2s aggregation runs when there has
not been any task activity, and wakes it back up later from the
scheduler when the system returns from the idle state.

The coordination between the aggregation worker and the scheduler is
minimal: the scheduler has to nudge the worker if it's not running,
and the worker will reschedule itself periodically until it detects no
more activity.
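
For reference, the coordination being removed looks roughly like this
(condensed from the current psi.c, with unrelated details elided):

        /* scheduler side, in psi_group_change(): nudge an idle worker */
        if (!delayed_work_pending(&group->clock_work))
                schedule_delayed_work(&group->clock_work, PSI_FREQ);

        /* worker side, in psi_update_work(): re-arm only while active */
        nonidle = update_stats(group);
        if (nonidle)
                schedule_delayed_work(dwork, delay);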

The polling patches will complicate this, because they introduce
another aggregation mode for high-frequency polling that also
eventually times out if the worker sees no more activity of interest.
That means the scheduler portion would have to coordinate three state
transitions - idle to regular, regular to polling, idle to polling -
with the worker's timeouts and self-rescheduling. The additional
overhead from this is undesirable in the scheduler hotpath.

Eliminate the idle mode and keep the worker doing 2s update intervals
at all times. This eliminates worker coordination from the scheduler
completely. The polling patches will then add it back to switch
between regular mode and high-frequency polling mode.
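
Concretely (condensed from the patch below), the worker now always
re-arms itself for the next period, and the clock is kicked off once
when a group is created instead of from the scheduler hotpath:

        /* in psi_update_work(): unconditionally re-arm */
        now = sched_clock();
        if (group->next_update > now)
                delay = nsecs_to_jiffies(group->next_update - now) + 1;
        schedule_delayed_work(dwork, delay);

        /* at group creation, e.g. psi_cgroup_alloc(): start the clock */
        schedule_delayed_work(&cgroup->psi.clock_work, PSI_FREQ);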

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 kernel/sched/psi.c | 55 +++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 33 deletions(-)

Comments

Peter Zijlstra Dec. 17, 2018, 2:57 p.m. UTC | #1
On Fri, Dec 14, 2018 at 09:15:05AM -0800, Suren Baghdasaryan wrote:
> Eliminate the idle mode and keep the worker doing 2s update intervals
> at all times.

That sounds like a bad deal.. esp. so for battery-powered devices like,
say, Android.

In general the push has been to always idle everything; see NOHZ and
NOHZ_FULL and all the work that's being put into getting rid of any and
all periodic work.

Suren Baghdasaryan Dec. 18, 2018, 1:10 a.m. UTC | #2
On Mon, Dec 17, 2018 at 6:58 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Dec 14, 2018 at 09:15:05AM -0800, Suren Baghdasaryan wrote:
> > Eliminate the idle mode and keep the worker doing 2s update intervals
> > at all times.
>
> That sounds like a bad deal.. esp. so for battery-powered devices like,
> say, Android.
>
> In general the push has been to always idle everything; see NOHZ and
> NOHZ_FULL and all the work that's being put into getting rid of any and
> all periodic work.

Thanks for the feedback, Peter! The removal of idle mode is unfortunate,
but so far we have not found an elegant way to handle the three states
(IDLE / REGULAR / POLLING) without additional synchronization inside
the hotpath. The issue, as I remember it, was that while scheduling a
regular update from psi_group_change() (the IDLE to REGULAR transition)
we might override an earlier update already scheduled from
psi_update_work(). I think we can solve that by using
mod_delayed_work_on() inside psi_update_work(), but I might be missing
some other race. I'll discuss this again with Johannes and see if we
can synchronize all the states using only atomic operations on
clock_mode.
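
Purely as an illustration of that idea (not part of this series, and the
exact placement is an assumption): re-arming with mod_delayed_work()
instead of schedule_delayed_work() would replace a pending timer rather
than silently keeping it, so an expiry queued from the hotpath in the
meantime could not leave the worker armed with a stale deadline:

        /* hypothetical re-arm at the end of psi_update_work() */
        now = sched_clock();
        if (group->next_update > now)
                delay = nsecs_to_jiffies(group->next_update - now) + 1;
        /*
         * Unlike schedule_delayed_work(), which is a no-op while the
         * work is already pending, mod_delayed_work() modifies the
         * pending timer to the new expiry.
         */
        mod_delayed_work(system_wq, &group->clock_work, delay);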

Patch

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index fe24de3fbc93..d2b9c9a1a62f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -248,18 +248,10 @@  static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
 	}
 }
 
-static void calc_avgs(unsigned long avg[3], int missed_periods,
-		      u64 time, u64 period)
+static void calc_avgs(unsigned long avg[3], u64 time, u64 period)
 {
 	unsigned long pct;
 
-	/* Fill in zeroes for periods of no activity */
-	if (missed_periods) {
-		avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
-		avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
-		avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
-	}
-
 	/* Sample the most recent active period */
 	pct = div_u64(time * 100, period);
 	pct *= FIXED_1;
@@ -268,10 +260,9 @@  static void calc_avgs(unsigned long avg[3], int missed_periods,
 	avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
-static bool update_stats(struct psi_group *group)
+static void update_stats(struct psi_group *group)
 {
 	u64 deltas[NR_PSI_STATES - 1] = { 0, };
-	unsigned long missed_periods = 0;
 	unsigned long nonidle_total = 0;
 	u64 now, expires, period;
 	int cpu;
@@ -321,8 +312,6 @@  static bool update_stats(struct psi_group *group)
 	expires = group->next_update;
 	if (now < expires)
 		goto out;
-	if (now - expires > psi_period)
-		missed_periods = div_u64(now - expires, psi_period);
 
 	/*
 	 * The periodic clock tick can get delayed for various
@@ -331,8 +320,8 @@  static bool update_stats(struct psi_group *group)
 	 * But the deltas we sample out of the per-cpu buckets above
 	 * are based on the actual time elapsing between clock ticks.
 	 */
-	group->next_update = expires + ((1 + missed_periods) * psi_period);
-	period = now - (group->last_update + (missed_periods * psi_period));
+	group->next_update = expires + psi_period;
+	period = now - group->last_update;
 	group->last_update = now;
 
 	for (s = 0; s < NR_PSI_STATES - 1; s++) {
@@ -359,18 +348,18 @@  static bool update_stats(struct psi_group *group)
 		if (sample > period)
 			sample = period;
 		group->total_prev[s] += sample;
-		calc_avgs(group->avg[s], missed_periods, sample, period);
+		calc_avgs(group->avg[s], sample, period);
 	}
 out:
 	mutex_unlock(&group->stat_lock);
-	return nonidle_total;
 }
 
 static void psi_update_work(struct work_struct *work)
 {
 	struct delayed_work *dwork;
 	struct psi_group *group;
-	bool nonidle;
+	unsigned long delay = 0;
+	u64 now;
 
 	dwork = to_delayed_work(work);
 	group = container_of(dwork, struct psi_group, clock_work);
@@ -383,17 +372,12 @@  static void psi_update_work(struct work_struct *work)
 	 * go - see calc_avgs() and missed_periods.
 	 */
 
-	nonidle = update_stats(group);
-
-	if (nonidle) {
-		unsigned long delay = 0;
-		u64 now;
+	update_stats(group);
 
-		now = sched_clock();
-		if (group->next_update > now)
-			delay = nsecs_to_jiffies(group->next_update - now) + 1;
-		schedule_delayed_work(dwork, delay);
-	}
+	now = sched_clock();
+	if (group->next_update > now)
+		delay = nsecs_to_jiffies(group->next_update - now) + 1;
+	schedule_delayed_work(dwork, delay);
 }
 
 static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -480,9 +464,6 @@  static void psi_group_change(struct psi_group *group, int cpu,
 			groupc->tasks[t]++;
 
 	write_seqcount_end(&groupc->seq);
-
-	if (!delayed_work_pending(&group->clock_work))
-		schedule_delayed_work(&group->clock_work, PSI_FREQ);
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -619,6 +600,8 @@  int psi_cgroup_alloc(struct cgroup *cgroup)
 	if (!cgroup->psi.pcpu)
 		return -ENOMEM;
 	group_init(&cgroup->psi);
+	schedule_delayed_work(&cgroup->psi.clock_work, PSI_FREQ);
+
 	return 0;
 }
 
@@ -761,12 +744,18 @@  static const struct file_operations psi_cpu_fops = {
 	.release        = single_release,
 };
 
-static int __init psi_proc_init(void)
+static int __init psi_late_init(void)
 {
+	if (static_branch_likely(&psi_disabled))
+		return 0;
+
+	schedule_delayed_work(&psi_system.clock_work, PSI_FREQ);
+
 	proc_mkdir("pressure", NULL);
 	proc_create("pressure/io", 0, NULL, &psi_io_fops);
 	proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
 	proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+
 	return 0;
 }
-module_init(psi_proc_init);
+module_init(psi_late_init);