@@ -137,6 +137,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_ARM_CCN_ONLINE,
CPUHP_AP_PERF_ARM_L2X0_ONLINE,
CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
+ CPUHP_AP_TMIGR_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_ONLINE_DYN,
@@ -15,5 +15,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROAD
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
+obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
@@ -30,6 +30,7 @@
#include <asm/irq_regs.h>
+#include "timer_migration.h"
#include "tick-internal.h"
#include <trace/events/timer.h>
@@ -680,11 +681,46 @@ static void tick_nohz_restart(struct tic
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
+#ifdef CONFIG_SMP
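+/*
+ * Mark the CPU idle in the timer migration hierarchy. The next global
+ * event is handed to tmigr_cpu_idle() unless the next local event
+ * expires first. Returns the earliest event this CPU has to take care
+ * of itself.
+ */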
+static u64
+tick_tmigr_idle(struct tick_sched *ts, u64 next_global, u64 next_local)
+{
+ ts->tmigr_idle = 1;
+
+ /*
+ * If next_global is after next_local, the global event does not
+ * need to be queued in the timer migration hierarchy, but the
+ * CPU still needs to be marked idle.
+ */
+ if (next_global >= next_local)
+ next_global = KTIME_MAX;
+
+ next_global = tmigr_cpu_idle(next_global);
+
+ return min_t(u64, next_local, next_global);
+}
+
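+/*
+ * Undo tick_tmigr_idle(): mark the CPU active in the timer migration
+ * hierarchy again if it was marked idle.
+ */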
+static void tick_tmigr_stop_idle(struct tick_sched *ts)
+{
+ if (ts->tmigr_idle) {
+ ts->tmigr_idle = 0;
+ tmigr_cpu_activate();
+ }
+}
+#else
+static u64
+tick_tmigr_idle(struct tick_sched *ts, u64 next_global, u64 next_local)
+{
+ return min_t(u64, next_global, next_local);
+}
+static inline void tick_tmigr_stop_idle(struct tick_sched *ts) { }
+#endif /* CONFIG_SMP */
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- u64 basemono, next_tick, next_local, next_global, next_rcu, delta, expires;
+ u64 basemono, next_local, next_global, next_rcu, delta, expires;
unsigned long basejiff;
ktime_t tick;
@@ -693,7 +729,12 @@ static ktime_t tick_nohz_stop_sched_tick
if (rcu_needs_cpu(basemono, &next_rcu) ||
arch_needs_cpu() || irq_work_needs_cpu()) {
- next_tick = basemono + TICK_NSEC;
+ /*
+ * If anyone needs the CPU, treat this as a local
+ * timer expiring in a jiffy.
+ */
+ next_global = KTIME_MAX;
+ next_local = basemono + TICK_NSEC;
} else {
/*
* Get the next pending timer. If high resolution
@@ -704,21 +745,48 @@ static ktime_t tick_nohz_stop_sched_tick
*/
next_local = get_next_timer_interrupt(basejiff, basemono,
&next_global);
- next_local = min(next_local, next_global);
- ts->next_timer = next_local;
- /* Take the next rcu event into account */
- next_tick = next_rcu < next_local ? next_rcu : next_local;
+ /*
+ * Take RCU into account. Even though rcu_needs_cpu()
+ * returned false, RCU might need a wake up event
+ * after all, as reflected in the value of next_rcu.
+ */
+ next_local = min_t(u64, next_rcu, next_local);
}
/*
+ * The CPU might be the last one going inactive in the timer
+ * migration hierarchy. If the next local event is more than a tick
+ * away, mark the CPU inactive in the hierarchy and use the next
+ * event handed back by it.
+ *
+ * If the CPU is the last one to go inactive, then the earliest
+ * event (next local or next global event in the hierarchy) is
+ * returned.
+ *
+ * Otherwise the next local event is returned and the next global
+ * event of this CPU is queued in the hierarchy.
+ */
+ delta = next_local - basemono;
+ if (delta > (u64)TICK_NSEC)
+ next_local = tick_tmigr_idle(ts, next_global, next_local);
+
+ ts->next_timer = next_local;
+
+ /*
* If the tick is due in the next period, keep it ticking or
* force prod the timer.
*/
- delta = next_tick - basemono;
+ delta = next_local - basemono;
if (delta <= (u64)TICK_NSEC) {
tick = 0;
/*
+ * If the CPU deactivated itself in the timer migration
+ * hierarchy, activate it again.
+ */
+ tick_tmigr_stop_idle(ts);
+
+ /*
* Tell the timer code that the base is not idle, i.e. undo
* the effect of get_next_timer_interrupt():
*/
@@ -782,7 +850,7 @@ static ktime_t tick_nohz_stop_sched_tick
else
expires = KTIME_MAX;
- expires = min_t(u64, expires, next_tick);
+ expires = min_t(u64, expires, next_local);
tick = expires;
/* Skip reprogram of event if its not changed */
@@ -1047,6 +1115,8 @@ void tick_nohz_idle_exit(void)
ts->inidle = 0;
+ tick_tmigr_stop_idle(ts);
+
if (ts->idle_active || ts->tick_stopped)
now = ktime_get();
@@ -1126,8 +1196,12 @@ static inline void tick_nohz_irq_enter(v
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
- if (!ts->idle_active && !ts->tick_stopped)
+ if (!ts->idle_active && !ts->tick_stopped && !ts->tmigr_idle)
return;
+
+ /* FIXME: Is this really needed? */
+ tick_tmigr_stop_idle(ts);
+
now = ktime_get();
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
@@ -27,7 +27,9 @@ enum tick_nohz_mode {
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
+ * @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
+ * @tmigr_idle: Indicator that the CPU is idle in the timer migration hierarchy
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
@@ -46,6 +48,7 @@ struct tick_sched {
ktime_t last_tick;
int inidle;
int tick_stopped;
+ int tmigr_idle;
unsigned long idle_jiffies;
unsigned long idle_calls;
unsigned long idle_sleeps;
@@ -51,6 +51,7 @@
#include <asm/timex.h>
#include <asm/io.h>
+#include "timer_migration.h"
#include "tick-internal.h"
#define CREATE_TRACE_POINTS
@@ -185,6 +186,10 @@ EXPORT_SYMBOL(jiffies_64);
#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * If multiple bases need to be locked, use the base ordering for lock
+ * nesting, i.e. lowest number first.
+ */
# define NR_BASES 3
# define BASE_LOCAL 0
# define BASE_GLOBAL 1
@@ -216,7 +221,7 @@ unsigned int sysctl_timer_migration = 1;
void timers_update_migration(bool update_nohz)
{
- bool on = sysctl_timer_migration && tick_nohz_active;
+ bool on = sysctl_timer_migration && tick_nohz_active && tmigr_enabled;
unsigned int cpu;
/* Avoid the loop, if nothing to update */
@@ -1619,13 +1624,27 @@ static int collect_expired_timers(struct
}
return __collect_expired_timers(base, heads);
}
-#else
+
+static inline void __run_timers(struct timer_base *base);
+
+#ifdef CONFIG_SMP
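+/**
+ * timer_expire_remote - Expire the global timers of a remote idle CPU
+ * @cpu: CPU whose BASE_GLOBAL timer base is serviced
+ *
+ * Called from the timer migration code on the migrator CPU.
+ */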
+void timer_expire_remote(unsigned int cpu)
+{
+ struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
+
+ spin_lock_irq(&base->lock);
+ __run_timers(base);
+ spin_unlock_irq(&base->lock);
+}
+#endif
+
+#else /* CONFIG_NO_HZ_COMMON */
static inline int collect_expired_timers(struct timer_base *base,
struct hlist_head *heads)
{
return __collect_expired_timers(base, heads);
}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* Called from the timer interrupt handler to charge one tick to the current
@@ -1689,11 +1708,16 @@ static void run_timer_base(int index, bo
*/
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
+
run_timer_base(BASE_LOCAL, false);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
run_timer_base(BASE_GLOBAL, false);
run_timer_base(BASE_DEF, true);
+
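+ /*
+ * If this CPU is the migrator of its group, expire the
+ * global timers of remote idle CPUs.
+ */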
+ if (base->nohz_active)
+ tmigr_handle_remote();
}
}
@@ -1909,6 +1933,7 @@ void __init init_timers(void)
{
init_timer_cpus();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
+ tmigr_init();
}
/**
@@ -0,0 +1,642 @@
+/*
+ * Infrastructure for migratable timers
+ *
+ * Copyright(C) 2016 linutronix GmbH
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpuhotplug.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/timerqueue.h>
+#include <linux/timer.h>
+
+#include "timer_migration.h"
+#include "tick-internal.h"
+
+#ifdef DEBUG
+# define DBG_BUG_ON(x) BUG_ON(x)
+#else
+# define DBG_BUG_ON(x)
+#endif
+
+/* Per group capacity. Must be a power of 2! */
+static const unsigned int tmigr_childs_per_group = 8;
+
+bool tmigr_enabled __read_mostly;
+static unsigned int tmigr_hierarchy_levels __read_mostly;
+static unsigned int tmigr_crossnode_level __read_mostly;
+static struct list_head *tmigr_level_list __read_mostly;
+
+static DEFINE_MUTEX(tmigr_mutex);
+
+static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
+
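+/*
+ * Enqueue an event into the group queue and update the group event,
+ * i.e. the first event to expire in the group, if necessary.
+ */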
+static void tmigr_add_evt(struct tmigr_group *group, struct tmigr_event *evt)
+{
+ /*
+ * Can be called with @evt == NULL, an already queued @evt or an
+ * event that does not need to be queued (expires == KTIME_MAX).
+ */
+ if (!evt || !RB_EMPTY_NODE(&evt->nextevt.node) ||
+ evt->nextevt.expires == KTIME_MAX)
+ return;
+
+ /* The group event of @group must not be queued in the parent group */
+ DBG_BUG_ON(!RB_EMPTY_NODE(&group->groupevt.nextevt.node));
+
+ /* If this is the new first to expire event, update group event */
+ if (timerqueue_add(&group->events, &evt->nextevt)) {
+ group->groupevt.nextevt.expires = evt->nextevt.expires;
+ group->groupevt.cpu = evt->cpu;
+ }
+}
+
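+/*
+ * Remove an event from the group queue and update the group event if
+ * the removed event was the first to expire.
+ */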
+static void tmigr_remove_evt(struct tmigr_group *group, struct tmigr_event *evt)
+{
+ struct timerqueue_node *next;
+ struct tmigr_event *nextevt;
+ bool first;
+
+ /*
+ * It's safe to modify the group event of this group, because it is
+ * not queued in the parent group.
+ */
+ DBG_BUG_ON(!RB_EMPTY_NODE(&group->groupevt.nextevt.node));
+
+ /* Remove the child event, if pending */
+ if (!evt || RB_EMPTY_NODE(&evt->nextevt.node))
+ return;
+ /*
+ * If this was the last queued event in the group, clear
+ * the group event. If this was the first event to expire,
+ * update the group event.
+ */
+ first = (timerqueue_getnext(&group->events) == &evt->nextevt);
+
+ if (!timerqueue_del(&group->events, &evt->nextevt)) {
+ group->groupevt.nextevt.expires = KTIME_MAX;
+ group->groupevt.cpu = TMIGR_NONE;
+ } else if (first) {
+ next = timerqueue_getnext(&group->events);
+ nextevt = container_of(next, struct tmigr_event, nextevt);
+ group->groupevt.nextevt.expires = nextevt->nextevt.expires;
+ group->groupevt.cpu = nextevt->cpu;
+ }
+}
+
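+/*
+ * Re-evaluate and requeue the next global event of a remote CPU after
+ * its expired timers have been handled by the migrator.
+ */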
+static void tmigr_update_remote(unsigned int cpu, u64 now, unsigned long jif)
+{
+ struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
+ struct tmigr_group *group = tmc->tmgroup;
+ u64 next_local, next_global;
+
+ /*
+ * Here the migrator CPU races with the target CPU. The migrator
+ * removed @tmc->cpuevt from the group's queue, but it then dropped
+ * the group lock. Concurrently the target CPU might have serviced
+ * an interrupt and therefore have called tmigr_cpu_activate() and
+ * possibly tmigr_cpu_idle(), which requeued the @tmc event into
+ * @group.
+ *
+ * @tmc->lock must be held to change @tmc->cpuevt and @group->lock
+ * to protect the timer queue of @group.
+ */
+ raw_spin_lock_irq(&tmc->lock);
+ raw_spin_lock(&group->lock);
+
+ /*
+ * If the cpu went offline or marked itself active again, nothing
+ * more to do.
+ */
+ if (!tmc->online || cpumask_test_cpu(cpu, group->cpus))
+ goto done;
+
+ /*
+ * Although __tmigr_handle_remote() just dequeued the event, the
+ * target CPU might have requeued it after the lock was dropped.
+ * If it is queued, the group queue is up to date.
+ */
+ if (!RB_EMPTY_NODE(&tmc->cpuevt.nextevt.node))
+ goto done;
+
+ /*
+ * Recalculate next event. Needs to be calculated while holding the
+ * lock because the first expiring global timer could have been
+ * removed since the last evaluation.
+ */
+ next_local = get_next_timer_interrupt(jif, now, &next_global);
+
+ /*
+ * If next_global is after next_local, the event does not need
+ * to be queued.
+ */
+ if (next_global >= next_local)
+ next_global = KTIME_MAX;
+
+ tmc->cpuevt.nextevt.expires = next_global;
+
+ /* Queue the @cpu event (not queued if expires == KTIME_MAX) */
+ tmigr_add_evt(group, &tmc->cpuevt);
+
+done:
+ raw_spin_unlock(&group->lock);
+ raw_spin_unlock_irq(&tmc->lock);
+}
+
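+/*
+ * Expire the pending events of @group and, if @walkup is set, walk the
+ * hierarchy up to handle the parent groups for which @cpu is the
+ * migrator as well.
+ */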
+static void __tmigr_handle_remote(struct tmigr_group *group, unsigned int cpu,
+ u64 now, unsigned long jif, bool walkup)
+{
+ struct timerqueue_node *tmr;
+ struct tmigr_group *parent;
+ struct tmigr_event *evt;
+
+again:
+ raw_spin_lock_irq(&group->lock);
+ /*
+ * Handle the group only if @cpu is the migrator or if the group
+ * has no migrator. Otherwise the group is active and is handled by
+ * its own migrator.
+ */
+ if (group->migrator != cpu && group->migrator != TMIGR_NONE) {
+ raw_spin_unlock_irq(&group->lock);
+ return;
+ }
+
+ tmr = timerqueue_getnext(&group->events);
+ if (tmr && now >= tmr->expires) {
+ /*
+ * Remove the expired entry from the queue and handle
+ * it. If this is a leaf group, call the timer poll
+ * function for the given cpu. Otherwise handle the group
+ * itself. Drop the group lock here in both cases to avoid
+ * lock ordering inversions.
+ */
+ evt = container_of(tmr, struct tmigr_event, nextevt);
+ tmigr_remove_evt(group, evt);
+
+ raw_spin_unlock_irq(&group->lock);
+
+ /*
+ * If the event is a group event walk down the hierarchy of
+ * that group to the CPU leaves. If not, handle the expired
+ * timer from the remote CPU.
+ */
+ if (evt->group) {
+ __tmigr_handle_remote(evt->group, cpu, now, jif, false);
+ } else {
+ timer_expire_remote(evt->cpu);
+ tmigr_update_remote(evt->cpu, now, jif);
+ }
+ goto again;
+ }
+
+ /*
+ * If @group is not active, queue the next event in the parent
+ * group. This is required, because the next event of @group
+ * could have been changed by tmigr_update_remote() above.
+ */
+ parent = group->parent;
+ if (parent && !group->active) {
+ raw_spin_lock_nested(&parent->lock, parent->level);
+ tmigr_add_evt(parent, &group->groupevt);
+ raw_spin_unlock(&parent->lock);
+ }
+ raw_spin_unlock_irq(&group->lock);
+
+ /* Walk the hierarchy up? */
+ if (!walkup || !parent)
+ return;
+
+ /* Racy lockless check: See comment in tmigr_handle_remote() */
+ if (parent->migrator == cpu)
+ __tmigr_handle_remote(parent, cpu, now, jif, true);
+}
+
+/**
+ * tmigr_handle_remote - Handle migratable timers on remote idle CPUs
+ *
+ * Called from the timer soft interrupt with interrupts enabled.
+ */
+void tmigr_handle_remote(void)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ int cpu = smp_processor_id();
+ unsigned long basej;
+ ktime_t now;
+
+ if (!tmigr_enabled)
+ return;
+
+ /*
+ * Check whether this CPU is responsible for handling the global
+ * timers of other CPUs. Do a racy lockless check to avoid lock
+ * contention for the busy case where timer soft interrupts happen
+ * in parallel. It's not an issue, if the CPU misses a concurrent
+ * update of the migrator role for its base group. It's not more
+ * racy than doing this check under the lock, if the update happens
+ * right after the lock is dropped. There is no damage in such a
+ * case other than potentially expiring a global timer one tick
+ * late.
+ */
+ if (tmc->tmgroup->migrator != cpu)
+ return;
+
+ now = get_jiffies_update(&basej);
+ __tmigr_handle_remote(tmc->tmgroup, cpu, now, basej, true);
+}
+
+/**
+ * tmigr_set_cpu_inactive - Set a CPU inactive in the group
+ * @group: The group from which @cpu is removed
+ * @child: The child group which was updated before
+ * @evt: The event to queue in @group
+ * @cpu: The CPU which becomes inactive
+ *
+ * Remove @cpu from @group and propagate it through the hierarchy if
+ * @cpu was the migrator of @group.
+ *
+ * Returns KTIME_MAX if @cpu is not the last outgoing CPU in the
+ * hierarchy. Otherwise it returns the first expiring global event.
+ */
+static u64 tmigr_set_cpu_inactive(struct tmigr_group *group,
+ struct tmigr_group *child,
+ struct tmigr_event *evt,
+ unsigned int cpu)
+{
+ struct tmigr_group *parent;
+ u64 nextevt = KTIME_MAX;
+
+ raw_spin_lock_nested(&group->lock, group->level);
+
+ DBG_BUG_ON(!group->active);
+
+ cpumask_clear_cpu(cpu, group->cpus);
+ group->active--;
+
+ /*
+ * If @child is not NULL, then this is a recursive invocation to
+ * propagate the deactivation of @cpu. If @child has a new
+ * migrator, set it active in @group.
+ */
+ if (child && child->migrator != TMIGR_NONE) {
+ cpumask_set_cpu(child->migrator, group->cpus);
+ group->active++;
+ }
+
+ /* Add @evt to @group */
+ tmigr_add_evt(group, evt);
+
+ /* If @cpu is not the active migrator, everything is up to date */
+ if (group->migrator != cpu)
+ goto done;
+
+ /* Update the migrator. */
+ if (!group->active)
+ group->migrator = TMIGR_NONE;
+ else
+ group->migrator = cpumask_first(group->cpus);
+
+ parent = group->parent;
+ if (parent) {
+ /*
+ * @cpu was the migrator in @group, so it is marked as
+ * active in its parent group(s) as well. Propagate the
+ * migrator change.
+ */
+ evt = group->active ? NULL : &group->groupevt;
+ nextevt = tmigr_set_cpu_inactive(parent, group, evt, cpu);
+ } else {
+ /*
+ * This is the top level of the hierarchy. If @cpu is about
+ * to go offline wake up some random other cpu so it will
+ * take over the migrator duty and program its timer
+ * proper. Ideally wake the cpu with the closest expiry
+ * time, but that's overkill to figure out.
+ */
+ if (!per_cpu(tmigr_cpu, cpu).online) {
+ cpu = cpumask_any_but(cpu_online_mask, cpu);
+ smp_send_reschedule(cpu);
+ }
+ /*
+ * Return the earliest event of the top level group to make
+ * sure that it's handled.
+ *
+ * This could be optimized by keeping track of the last
+ * global scheduled event and only arming it on @cpu if the
+ * new event is earlier. Not sure if it's worth the
+ * complexity.
+ */
+ nextevt = group->groupevt.nextevt.expires;
+ }
+done:
+ raw_spin_unlock(&group->lock);
+ return nextevt;
+}
+
+/**
+ * tmigr_cpu_idle - Put current CPU into idle state
+ * @nextevt: The next timer event set in the current CPU
+ *
+ * Returns either the next event of the current CPU or the next event from
+ * the hierarchy if this CPU is the top level migrator.
+ *
+ * Must be called with interrupts disabled.
+ */
+u64 tmigr_cpu_idle(u64 nextevt)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ struct tmigr_group *group = tmc->tmgroup;
+ int cpu = smp_processor_id();
+
+ if (!tmc->online)
+ return nextevt;
+
+ raw_spin_lock(&tmc->lock);
+ tmc->cpuevt.nextevt.expires = nextevt;
+ nextevt = tmigr_set_cpu_inactive(group, NULL, &tmc->cpuevt, cpu);
+ raw_spin_unlock(&tmc->lock);
+ return nextevt;
+}
+
+/*
+ * tmigr_set_cpu_active - Propagate the activation of a CPU
+ * @group: The group in which the CPU is activated
+ * @evt: The event which is removed from @group
+ * @cpu: The CPU which is activated
+ */
+static void tmigr_set_cpu_active(struct tmigr_group *group,
+ struct tmigr_event *evt,
+ unsigned int cpu)
+{
+ raw_spin_lock_nested(&group->lock, group->level);
+
+ if (WARN_ON(group->active == group->num_childs)) {
+ raw_spin_unlock(&group->lock);
+ return;
+ }
+
+ cpumask_set_cpu(cpu, group->cpus);
+ group->active++;
+
+ /* The first active cpu in a group takes the migrator role */
+ if (group->active == 1) {
+ struct tmigr_group *parent = group->parent;
+
+ group->migrator = cpu;
+ /* Propagate through the hierarchy */
+ if (parent)
+ tmigr_set_cpu_active(parent, &group->groupevt, cpu);
+ }
+ /*
+ * Update the group event and dequeue @evt. This must happen after
+ * the parent groups have been updated above, so @group->groupevt
+ * is no longer queued in the parent group.
+ */
+ tmigr_remove_evt(group, evt);
+ raw_spin_unlock(&group->lock);
+}
+
+/**
+ * tmigr_cpu_activate - Activate current CPU
+ *
+ * Called from the NOHZ and cpu online code.
+ */
+void tmigr_cpu_activate(void)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ struct tmigr_group *group = tmc->tmgroup;
+ int cpu = smp_processor_id();
+ unsigned long flags;
+
+ if (!tmc->online || !group)
+ return;
+
+ local_irq_save(flags);
+ tmigr_set_cpu_active(group, &tmc->cpuevt, cpu);
+ local_irq_restore(flags);
+}
+
+static void tmigr_free_group(struct tmigr_group *group)
+{
+ if (group->parent) {
+ group->parent->num_childs--;
+ if (!group->parent->num_childs)
+ tmigr_free_group(group->parent);
+ }
+ list_del(&group->list);
+ free_cpumask_var(group->cpus);
+ kfree(group);
+}
+
+static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
+ unsigned int node)
+{
+ raw_spin_lock_init(&group->lock);
+ group->level = lvl;
+ group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE;
+ group->migrator = TMIGR_NONE;
+ timerqueue_init_head(&group->events);
+ timerqueue_init(&group->groupevt.nextevt);
+ group->groupevt.group = group;
+ group->groupevt.nextevt.expires = KTIME_MAX;
+ group->groupevt.cpu = TMIGR_NONE;
+ group->num_childs = 1;
+}
+
+static struct tmigr_group *tmigr_get_group(unsigned int node, unsigned int lvl)
+{
+ struct tmigr_group *group;
+
+ /* Try to attach to an existing group first */
+ list_for_each_entry(group, &tmigr_level_list[lvl], list) {
+ /*
+ * If @lvl is below the cross numa node level, check
+ * whether this group belongs to the same numa node.
+ */
+ if (lvl < tmigr_crossnode_level && group->numa_node != node)
+ continue;
+ /* If the group has capacity, use it */
+ if (group->num_childs < tmigr_childs_per_group) {
+ group->num_childs++;
+ return group;
+ }
+ }
+ /* Allocate and set up a new group */
+ group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ if (!zalloc_cpumask_var_node(&group->cpus, GFP_KERNEL, node)) {
+ kfree(group);
+ return ERR_PTR(-ENOMEM);
+ }
+ tmigr_init_group(group, lvl, node);
+ /* Setup successful. Add it to the hierarchy */
+ list_add(&group->list, &tmigr_level_list[lvl]);
+ return group;
+}
+
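+/*
+ * Connect the groups of @lvl to parent groups. Returns 1 if groups were
+ * connected to a parent level, 0 if the top of the hierarchy is reached
+ * and a negative error code if a group allocation fails.
+ */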
+static int tmigr_setup_parents(unsigned int lvl)
+{
+ struct list_head *lvllist = &tmigr_level_list[lvl];
+ struct tmigr_group *group, *parent;
+ int ret = 0;
+
+ /* End of hierarchy reached? */
+ if (list_is_singular(lvllist))
+ return 0;
+
+ DBG_BUG_ON(lvl == tmigr_hierarchy_levels);
+
+ list_for_each_entry(group, lvllist, list) {
+ if (group->parent)
+ continue;
+ parent = tmigr_get_group(group->numa_node, lvl + 1);
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+
+ raw_spin_lock_irq(&group->lock);
+ group->parent = parent;
+ if (group->active)
+ tmigr_set_cpu_active(parent, NULL, group->migrator);
+ raw_spin_unlock_irq(&group->lock);
+ ret = 1;
+ }
+ return ret;
+}
+
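+/*
+ * Walk the hierarchy from the bottom up and connect newly created
+ * groups to parents until the single top level group is reached.
+ */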
+static int tmigr_check_hierarchy(void)
+{
+ int lvl, ret = 0;
+
+ for (lvl = 0; lvl < tmigr_hierarchy_levels; lvl++) {
+ ret = tmigr_setup_parents(lvl);
+ if (ret != 1)
+ break;
+ }
+ return ret;
+}
+
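+/*
+ * Attach @cpu to a level 0 group of its node and, if that group was
+ * newly allocated, link it up in the hierarchy.
+ */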
+static struct tmigr_group *tmigr_add_cpu(unsigned int cpu)
+{
+ unsigned int node = cpu_to_node(cpu);
+ struct tmigr_group *group;
+
+ mutex_lock(&tmigr_mutex);
+ group = tmigr_get_group(node, 0);
+ if (IS_ERR(group))
+ goto out;
+ /*
+ * If the group was newly allocated, connect it
+ * to parent group(s) if necessary.
+ */
+ if (group->num_childs == 1) {
+ int ret = tmigr_check_hierarchy();
+
+ if (ret < 0) {
+ tmigr_free_group(group);
+ group = ERR_PTR(ret);
+ }
+ }
+out:
+ mutex_unlock(&tmigr_mutex);
+ return group;
+}
+
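+/*
+ * CPU hotplug callback: set up the per CPU data on the first online
+ * attempt and activate the CPU in the timer migration hierarchy.
+ */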
+static int tmigr_cpu_online(unsigned int cpu)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ struct tmigr_group *group;
+
+ /* First online attempt? Initialize cpu data */
+ if (!tmc->tmgroup) {
+ raw_spin_lock_init(&tmc->lock);
+ timerqueue_init(&tmc->cpuevt.nextevt);
+ tmc->cpuevt.group = NULL;
+ tmc->cpuevt.cpu = cpu;
+ group = tmigr_add_cpu(cpu);
+ if (IS_ERR(group))
+ return PTR_ERR(group);
+ tmc->tmgroup = group;
+ }
+ tmc->online = true;
+ tmigr_cpu_activate();
+ return 0;
+}
+
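+/*
+ * CPU hotplug callback: deactivate the CPU in the timer migration
+ * hierarchy. If the CPU was the top level migrator, another online CPU
+ * is kicked to take over.
+ */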
+static int tmigr_cpu_offline(unsigned int cpu)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ struct tmigr_group *group = tmc->tmgroup;
+
+ local_irq_disable();
+ tmc->online = false;
+ tmigr_set_cpu_inactive(group, NULL, NULL, cpu);
+ local_irq_enable();
+
+ return 0;
+}
+
+void __init tmigr_init(void)
+{
+ unsigned int cpulvl, nodelvl, cpus_per_node, i;
+ unsigned int nnodes = num_possible_nodes();
+ unsigned int ncpus = num_possible_cpus();
+ struct tmigr_group *group, *tmp;
+ size_t sz;
+
+ /*
+ * Calculate the required hierarchy levels. Unfortunately there is
+ * no reliable information available, unless all possible CPUs have
+ * been brought up and all numa nodes are populated.
+ *
+ * Estimate the number of levels with the number of possible nodes and
+ * the number of possible cpus. Assume CPUs are spread evenly across
+ * nodes.
+ */
+ cpus_per_node = DIV_ROUND_UP(ncpus, nnodes);
+ /* Calc the hierarchy levels required to hold the CPUs of a node */
+ cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node),
+ ilog2(tmigr_childs_per_group));
+ /* Calculate the extra levels to connect all nodes */
+ nodelvl = DIV_ROUND_UP(order_base_2(nnodes),
+ ilog2(tmigr_childs_per_group));
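+ /*
+ * Example: 144 possible CPUs spread over 4 nodes result in
+ * cpus_per_node = 36, cpulvl = 2 and nodelvl = 1, i.e. a three
+ * level hierarchy.
+ */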
+
+ tmigr_hierarchy_levels = cpulvl + nodelvl;
+ /*
+ * If a numa node spans more than one CPU level group then the
+ * next level(s) of the hierarchy contain groups which handle all
+ * CPU groups of the same numa node. The level above goes across
+ * numa nodes. Store this information for the setup code to decide
+ * when node matching is no longer required.
+ */
+ tmigr_crossnode_level = cpulvl;
+
+ sz = sizeof(struct list_head) * tmigr_hierarchy_levels;
+ tmigr_level_list = kzalloc(sz, GFP_KERNEL);
+ if (!tmigr_level_list)
+ goto err;
+
+ for (i = 0; i < tmigr_hierarchy_levels; i++)
+ INIT_LIST_HEAD(&tmigr_level_list[i]);
+
+ if (cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
+ tmigr_cpu_online, tmigr_cpu_offline))
+ goto hp_err;
+
+ tmigr_enabled = true;
+ pr_info("Timer migration: %d hierarchy levels\n", tmigr_hierarchy_levels);
+ return;
+
+hp_err:
+ /*
+ * Walk the levels and free the already allocated groups. The safe
+ * iterator is required because tmigr_free_group() removes the
+ * group from its level list.
+ */
+ for (i = 0; i < tmigr_hierarchy_levels; i++) {
+ list_for_each_entry_safe(group, tmp, &tmigr_level_list[i], list)
+ tmigr_free_group(group);
+ }
+ kfree(tmigr_level_list);
+err:
+ pr_err("Timer migration setup failed\n");
+}
@@ -0,0 +1,89 @@
+/*
+ * Infrastructure for migratable timers
+ *
+ * Copyright(C) 2016 linutronix GmbH
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#ifndef _KERNEL_TIME_MIGRATION_H
+#define _KERNEL_TIME_MIGRATION_H
+
+#ifdef CONFIG_NO_HZ_COMMON
+extern bool tmigr_enabled;
+void tmigr_init(void);
+#else
+static inline void tmigr_init(void) { }
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void tmigr_handle_remote(void);
+extern u64 tmigr_cpu_idle(u64 nextevt);
+extern void tmigr_cpu_activate(void);
+extern void timer_expire_remote(unsigned int cpu);
+#else
+static inline void tmigr_handle_remote(void) { }
+static inline u64 tmigr_cpu_idle(u64 nextevt) { return nextevt; }
+static inline void tmigr_cpu_activate(void) { }
+#endif
+
+#define TMIGR_NONE (~0U)
+
+/**
+ * struct tmigr_event - A timer event associated to a CPU or a group
+ * @nextevt: The node to enqueue an event in the group queue
+ * @group: The group to which this event belongs (NULL if a cpu event)
+ * @cpu: The cpu to which this event belongs (TMIGR_NONE if a group
+ * event)
+ */
+struct tmigr_event {
+ struct timerqueue_node nextevt;
+ struct tmigr_group *group;
+ unsigned int cpu;
+};
+
+/**
+ * struct tmigr_group - The hierarchical group structure for timer migration
+ * @lock: Group serialization. Must be taken with interrupts disabled.
+ * @active: Specifies the number of active (not offline and not idle)
+ * children of the group
+ * @migrator: CPU id of the migrator for this group or TMIGR_NONE
+ * @events: timerqueue head of all events of the group
+ * @groupevt: Next event of the group
+ * @parent: Pointer to the parent group. Null if top level group
+ * @cpus: CPU mask to track the active CPUs
+ * @list: List head to queue in the global group level lists
+ * @level: Specifies the hierarchy level of the group
+ * @numa_node: Specifies the numa node of the group. NUMA_NO_NODE if the
+ * group spans multiple numa nodes
+ * @num_childs: Number of children connected to the group
+ */
+struct tmigr_group {
+ raw_spinlock_t lock;
+ unsigned int active;
+ unsigned int migrator;
+ struct timerqueue_head events;
+ struct tmigr_event groupevt;
+ struct tmigr_group *parent;
+ cpumask_var_t cpus;
+ struct list_head list;
+ unsigned int level;
+ unsigned int numa_node;
+ unsigned int num_childs;
+};
+
+/**
+ * struct tmigr_cpu - Per CPU entry connected to a leaf group in the hierarchy
+ * @lock: Protection for the per cpu data
+ * @online: Indicator whether the CPU is online as seen by the timer migration code
+ * @cpuevt: Specifies the next timer event of the CPU
+ * @tmgroup: Pointer to the leaf group to which this CPU belongs
+ */
+struct tmigr_cpu {
+ raw_spinlock_t lock;
+ bool online;
+ struct tmigr_event cpuevt;
+ struct tmigr_group *tmgroup;
+};
+
+#endif