@@ -159,6 +159,25 @@ static void idle_loop(void)
}
}
+/*
+ * Idle loop for siblings in active schedule units.
+ * We don't do any standard idle work like tasklets or livepatching.
+ */
+static void guest_idle_loop(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ for ( ; ; )
+ {
+ ASSERT(!cpu_is_offline(cpu));
+
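+ /*
+ * Scrubbing free pages can take a while, so test softirq_pending()
+ * again afterwards rather than entering the idle routine with a
+ * softirq already pending.
+ */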
+ if ( !softirq_pending(cpu) && !scrub_free_pages() &&
+ !softirq_pending(cpu))
+ sched_guest_idle(pm_idle, cpu);
+ do_softirq();
+ }
+}
+
void startup_cpu_idle_loop(void)
{
struct vcpu *v = current;
@@ -172,6 +191,10 @@ void startup_cpu_idle_loop(void)
static void noreturn continue_idle_domain(struct vcpu *v)
{
+ /* Idle vcpus might be attached to non-idle units! */
+ if ( !is_idle_domain(v->sched_unit->domain) )
+ reset_stack_and_jump_nolp(guest_idle_loop);
+
reset_stack_and_jump(idle_loop);
}
@@ -143,10 +143,21 @@ static struct scheduler sched_idle_ops = {
.switch_sched = sched_idle_switch_sched,
};
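+/*
+ * Return the vcpu of this unit that would run on the given cpu: the
+ * domain's vcpu with id unit_id plus the cpu's index within the
+ * scheduling resource, or NULL if the domain has no such vcpu.
+ */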
+static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
+ unsigned int cpu)
+{
+ unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
+ const struct domain *d = unit->domain;
+
+ return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
+}
+
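+/*
+ * Return the vcpu to be run on the given cpu for this unit: the unit's
+ * vcpu for that cpu if it is about to run, the cpu's idle vcpu otherwise.
+ */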
static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
unsigned int cpu)
{
- return unit->domain->vcpu[unit->unit_id + per_cpu(sched_res_idx, cpu)];
+ struct vcpu *v = unit2vcpu_cpu(unit, cpu);
+
+ return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
}
static inline struct scheduler *dom_scheduler(const struct domain *d)
@@ -266,8 +277,11 @@ static inline void vcpu_runstate_change(
trace_runstate_change(v, new_state);
- unit->runstate_cnt[v->runstate.state]--;
- unit->runstate_cnt[new_state]++;
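+ /*
+ * Idle vcpus are not accounted for here: their sched_unit might
+ * temporarily point at a guest unit, and the idle unit's counts are
+ * maintained wholesale in sched_switch_units().
+ */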
+ if ( !is_idle_vcpu(v) )
+ {
+ unit->runstate_cnt[v->runstate.state]--;
+ unit->runstate_cnt[new_state]++;
+ }
delta = new_entry_time - v->runstate.state_entry_time;
if ( delta > 0 )
@@ -279,21 +293,11 @@ static inline void vcpu_runstate_change(
v->runstate.state = new_state;
}
-static inline void sched_unit_runstate_change(struct sched_unit *unit,
- bool running, s_time_t new_entry_time)
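+/*
+ * Execute the given (arch specific) idle routine on a cpu whose sched
+ * unit is active but which has no guest vcpu to run. The cpu's urgent
+ * count is raised for the duration, which is used e.g. to avoid deep
+ * sleep states.
+ */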
+void sched_guest_idle(void (*idle) (void), unsigned int cpu)
{
- struct vcpu *v;
-
- for_each_sched_unit_vcpu ( unit, v )
- {
- if ( running )
- vcpu_runstate_change(v, v->new_state, new_entry_time);
- else
- vcpu_runstate_change(v,
- ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
- (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
- new_entry_time);
- }
+ atomic_inc(&per_cpu(sched_urgent_count, cpu));
+ idle();
+ atomic_dec(&per_cpu(sched_urgent_count, cpu));
}
void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
@@ -543,6 +547,7 @@ int sched_init_vcpu(struct vcpu *v)
if ( is_idle_domain(d) )
{
get_sched_res(v->processor)->curr = unit;
+ get_sched_res(v->processor)->sched_unit_idle = unit;
v->is_running = 1;
unit->is_running = true;
unit->state_entry_time = NOW();
@@ -875,7 +880,7 @@ static void sched_unit_move_locked(struct sched_unit *unit,
*
* sched_unit_migrate_finish() will do the work now if it can, or simply
* return if it can't (because unit is still running); in that case
- * sched_unit_migrate_finish() will be called by context_saved().
+ * sched_unit_migrate_finish() will be called by unit_context_saved().
*/
static void sched_unit_migrate_start(struct sched_unit *unit)
{
@@ -898,7 +903,7 @@ static void sched_unit_migrate_finish(struct sched_unit *unit)
/*
* If the unit is currently running, this will be handled by
- * context_saved(); and in any case, if the bit is cleared, then
+ * unit_context_saved(); and in any case, if the bit is cleared, then
* someone else has already done the work so we don't need to.
*/
if ( unit->is_running )
@@ -1783,33 +1788,66 @@ static void sched_switch_units(struct sched_resource *sr,
struct sched_unit *next, struct sched_unit *prev,
s_time_t now)
{
- sr->curr = next;
-
- TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, prev->unit_id,
- now - prev->state_entry_time);
- TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, next->unit_id,
- (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
- (now - next->state_entry_time) : 0, prev->next_time);
+ unsigned int cpu;
ASSERT(unit_running(prev));
- TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
- next->domain->domain_id, next->unit_id);
+ if ( prev != next )
+ {
+ sr->curr = next;
+ sr->prev = prev;
- sched_unit_runstate_change(prev, false, now);
+ TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id,
+ prev->unit_id, now - prev->state_entry_time);
+ TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id,
+ next->unit_id,
+ (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
+ (now - next->state_entry_time) : 0, prev->next_time);
+ TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
+ next->domain->domain_id, next->unit_id);
- ASSERT(!unit_running(next));
- sched_unit_runstate_change(next, true, now);
+ ASSERT(!unit_running(next));
- /*
- * NB. Don't add any trace records from here until the actual context
- * switch, else lost_records resume will not work properly.
- */
+ /*
+ * NB. Don't add any trace records from here until the actual context
+ * switch, else lost_records resume will not work properly.
+ */
+
+ ASSERT(!next->is_running);
+ next->is_running = true;
+ next->state_entry_time = now;
+
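+ /*
+ * vcpu_runstate_change() no longer adjusts runstate_cnt for idle
+ * vcpus, so account for the whole idle unit here: all vcpus count
+ * as running while the unit is scheduled, as runnable otherwise.
+ */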
+ if ( is_idle_unit(prev) )
+ {
+ prev->runstate_cnt[RUNSTATE_running] = 0;
+ prev->runstate_cnt[RUNSTATE_runnable] = sched_granularity;
+ }
+ if ( is_idle_unit(next) )
+ {
+ next->runstate_cnt[RUNSTATE_running] = sched_granularity;
+ next->runstate_cnt[RUNSTATE_runnable] = 0;
+ }
+ }
+
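+ /*
+ * Update per-vcpu state on all cpus of the scheduling resource:
+ * switch the runstates of outgoing and incoming vcpus and let idle
+ * vcpus that are going to run point at the unit they run in.
+ */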
+ for_each_cpu ( cpu, sr->cpus )
+ {
+ struct vcpu *vprev = get_cpu_current(cpu);
+ struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu);
+
+ if ( vprev != vnext || vprev->runstate.state != vnext->new_state )
+ {
+ vcpu_runstate_change(vprev,
+ ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
+ (vcpu_runnable(vprev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+ now);
+ vcpu_runstate_change(vnext, vnext->new_state, now);
+ }
+
+ vnext->is_running = 1;
- ASSERT(!next->is_running);
- next->vcpu_list->is_running = 1;
- next->is_running = true;
- next->state_entry_time = now;
+ if ( is_idle_vcpu(vnext) )
+ vnext->sched_unit = next;
+ }
}
static bool sched_tasklet_check_cpu(unsigned int cpu)
@@ -1865,29 +1903,39 @@ static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
if ( prev->next_time >= 0 ) /* -ve means no limit */
set_timer(&sr->s_timer, now + prev->next_time);
- if ( likely(prev != next) )
- sched_switch_units(sr, next, prev, now);
+ sched_switch_units(sr, next, prev, now);
return next;
}
-static void context_saved(struct vcpu *prev)
+static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext)
{
- struct sched_unit *unit = prev->sched_unit;
-
/* Clear running flag /after/ writing context to memory. */
smp_wmb();
- prev->is_running = 0;
+ if ( vprev != vnext )
+ vprev->is_running = 0;
+}
+
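+/*
+ * Per-unit part of the former context_saved(): done once per scheduling
+ * resource, by the last cpu finishing its context switch.
+ */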
+static void unit_context_saved(struct sched_resource *sr)
+{
+ struct sched_unit *unit = sr->prev;
+
+ if ( !unit )
+ return;
+
unit->is_running = false;
unit->state_entry_time = NOW();
+ sr->prev = NULL;
/* Check for migration request /after/ clearing running flag. */
smp_mb();
- sched_context_saved(vcpu_scheduler(prev), unit);
+ sched_context_saved(unit_scheduler(unit), unit);
- sched_unit_migrate_finish(unit);
+ /* Idle never migrates and idle vcpus might belong to other units. */
+ if ( !is_idle_unit(unit) )
+ sched_unit_migrate_finish(unit);
}
/*
@@ -1897,35 +1945,44 @@ static void context_saved(struct vcpu *prev)
* The counter will be 0 in case no rendezvous is needed. For the rendezvous
* case it is initialised to the number of cpus to rendezvous plus 1. Each
* member entering decrements the counter. The last one will decrement it to
- * 1 and perform the final needed action in that case (call of context_saved()
- * if vcpu was switched), and then set the counter to zero. The other members
+ * 1 and perform the final needed action in that case (call of
+ * unit_context_saved()), and then set the counter to zero. The other members
* will wait until the counter becomes zero before they proceed.
*/
void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
{
struct sched_unit *next = vnext->sched_unit;
+ struct sched_resource *sr = get_sched_res(smp_processor_id());
if ( atomic_read(&next->rendezvous_out_cnt) )
{
int cnt = atomic_dec_return(&next->rendezvous_out_cnt);
- /* Call context_saved() before releasing other waiters. */
+ vcpu_context_saved(vprev, vnext);
+
+ /* Call unit_context_saved() before releasing other waiters. */
if ( cnt == 1 )
{
- if ( vprev != vnext )
- context_saved(vprev);
+ unit_context_saved(sr);
atomic_set(&next->rendezvous_out_cnt, 0);
}
else
while ( atomic_read(&next->rendezvous_out_cnt) )
cpu_relax();
}
- else if ( vprev != vnext && sched_granularity == 1 )
- context_saved(vprev);
+ else
+ {
+ vcpu_context_saved(vprev, vnext);
+ if ( sched_granularity == 1 )
+ unit_context_saved(sr);
+ }
+
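+ /*
+ * If the previous vcpu was an idle vcpu lent to a guest unit, let it
+ * point back to the cpu's real idle unit now.
+ */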
+ if ( is_idle_vcpu(vprev) && vprev != vnext )
+ vprev->sched_unit = sr->sched_unit_idle;
}
static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
- s_time_t now)
+ bool reset_idle_unit, s_time_t now)
{
if ( unlikely(vprev == vnext) )
{
@@ -1934,6 +1991,17 @@ static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
now - vprev->runstate.state_entry_time,
vprev->sched_unit->next_time);
sched_context_switched(vprev, vnext);
+
+ /*
+ * We are switching from a non-idle to an idle unit.
+ * A vcpu of the idle unit might have been running before because the
+ * guest vcpu it replaces was blocked. Adjust that idle vcpu's unit,
+ * which might still point to the guest's unit.
+ */
+ if ( reset_idle_unit )
+ vnext->sched_unit =
+ get_sched_res(smp_processor_id())->sched_unit_idle;
+
trace_continue_running(vnext);
return continue_running(vprev);
}
@@ -1992,7 +2060,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
pcpu_schedule_unlock_irq(*lock, cpu);
raise_softirq(SCHED_SLAVE_SOFTIRQ);
- sched_context_switch(vprev, vprev, now);
+ sched_context_switch(vprev, vprev, false, now);
return NULL; /* ARM only. */
}
@@ -2035,7 +2103,8 @@ static void sched_slave(void)
pcpu_schedule_unlock_irq(lock, cpu);
- sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), now);
+ sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
+ is_idle_unit(next) && !is_idle_unit(prev), now);
}
/*
@@ -2097,7 +2166,8 @@ static void schedule(void)
pcpu_schedule_unlock_irq(lock, cpu);
vnext = sched_unit2vcpu_cpu(next, cpu);
- sched_context_switch(vprev, vnext, now);
+ sched_context_switch(vprev, vnext,
+ !is_idle_unit(prev) && is_idle_unit(next), now);
}
/* The scheduler timer: force a run through the scheduler */
@@ -2168,6 +2238,7 @@ static int cpu_schedule_up(unsigned int cpu)
*/
sr->curr = idle_vcpu[cpu]->sched_unit;
+ sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit;
sr->sched_priv = NULL;
@@ -2337,6 +2408,7 @@ void __init scheduler_init(void)
if ( vcpu_create(idle_domain, 0) == NULL )
BUG();
get_sched_res(0)->curr = idle_vcpu[0]->sched_unit;
+ get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit;
}
/*
@@ -18,6 +18,7 @@ DECLARE_PER_CPU(struct vcpu *, curr_vcpu);
#define current (this_cpu(curr_vcpu))
#define set_current(vcpu) do { current = (vcpu); } while (0)
+#define get_cpu_current(cpu) (per_cpu(curr_vcpu, cpu))
/* Per-VCPU state that lives at the top of the stack */
struct cpu_info {
@@ -77,6 +77,11 @@ struct cpu_info {
/* get_stack_bottom() must be 16-byte aligned */
};
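+/* Return the cpu_info at the top of the stack sp is pointing into. */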
+static inline struct cpu_info *get_cpu_info_from_stack(unsigned long sp)
+{
+ return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
+}
+
static inline struct cpu_info *get_cpu_info(void)
{
#ifdef __clang__
@@ -87,7 +92,7 @@ static inline struct cpu_info *get_cpu_info(void)
register unsigned long sp asm("rsp");
#endif
- return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
+ return get_cpu_info_from_stack(sp);
}
#define get_current() (get_cpu_info()->current_vcpu)
@@ -124,16 +129,22 @@ unsigned long get_stack_dump_bottom (unsigned long sp);
# define CHECK_FOR_LIVEPATCH_WORK ""
#endif
-#define reset_stack_and_jump(__fn) \
+#define switch_stack_and_jump(fn, instr) \
({ \
__asm__ __volatile__ ( \
"mov %0,%%"__OP"sp;" \
- CHECK_FOR_LIVEPATCH_WORK \
+ instr \
"jmp %c1" \
- : : "r" (guest_cpu_user_regs()), "i" (__fn) : "memory" ); \
+ : : "r" (guest_cpu_user_regs()), "i" (fn) : "memory" ); \
unreachable(); \
})
+#define reset_stack_and_jump(fn) \
+ switch_stack_and_jump(fn, CHECK_FOR_LIVEPATCH_WORK)
+
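+/*
+ * The "nolp" flavour skips the livepatch work check: used for entering
+ * guest_idle_loop(), which is not meant to do livepatching.
+ */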
+#define reset_stack_and_jump_nolp(fn) \
+ switch_stack_and_jump(fn, "")
+
/*
* Which VCPU's state is currently running on each CPU?
* This is not necessarily the same as 'current' as a CPU may be
@@ -77,6 +77,13 @@ void set_nr_sockets(void);
/* Representing HT and core siblings in each socket. */
extern cpumask_t **socket_cpumask;
+/*
+ * To be used only while no context switch can occur on the cpu, i.e.
+ * by certain pieces of scheduling code.
+ */
+#define get_cpu_current(cpu) \
+ (get_cpu_info_from_stack((unsigned long)stack_base[cpu])->current_vcpu)
+
#endif /* !__ASSEMBLY__ */
#endif
@@ -39,6 +39,8 @@ struct sched_resource {
spinlock_t *schedule_lock,
_lock;
struct sched_unit *curr;
+ struct sched_unit *sched_unit_idle;
+ struct sched_unit *prev;
void *sched_priv;
struct timer s_timer; /* scheduling timer */
@@ -194,7 +196,7 @@ static inline void sched_clear_pause_flags_atomic(struct sched_unit *unit,
static inline struct sched_unit *sched_idle_unit(unsigned int cpu)
{
- return idle_vcpu[cpu]->sched_unit;
+ return get_sched_res(cpu)->sched_unit_idle;
}
static inline unsigned int sched_get_resource_cpu(unsigned int cpu)
@@ -929,6 +929,7 @@ void restore_vcpu_affinity(struct domain *d);
void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
uint64_t get_cpu_idle_time(unsigned int cpu);
+void sched_guest_idle(void (*idle) (void), unsigned int cpu);
/*
* Used by idle loop to decide whether there is work to do: