--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -59,6 +59,18 @@ static void do_idle(void)
sched_tick_resume();
}
+void guest_idle_loop(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ for ( ; ; )
+ {
+ if ( !softirq_pending(cpu) )
+ do_idle();
+ do_softirq();
+ }
+}
+
void idle_loop(void)
{
unsigned int cpu = smp_processor_id();
@@ -329,6 +341,8 @@ static void continue_new_vcpu(struct vcpu *prev)
if ( is_idle_vcpu(current) )
reset_stack_and_jump(idle_loop);
+ else if ( !vcpu_runnable(current) )
+ sched_vcpu_idle(current);
else if ( is_32bit_domain(current->domain) )
/* check_wakeup_from_wait(); */
reset_stack_and_jump(return_to_new_vcpu32);
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -126,6 +126,18 @@ static void play_dead(void)
(*dead_idle)();
}
+void guest_idle_loop(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ for ( ; ; )
+ {
+ if ( !softirq_pending(cpu) )
+ pm_idle();
+ do_softirq();
+ }
+}
+
static void idle_loop(void)
{
unsigned int cpu = smp_processor_id();
@@ -1702,7 +1714,7 @@ static void __context_switch(void)
gdt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) :
per_cpu(compat_gdt_table, cpu);
- need_full_gdt_n = need_full_gdt(nd);
+ need_full_gdt_n = need_full_gdt(nd) && is_vcpu_online(n);
if ( need_full_gdt_n )
write_full_gdt_ptes(gdt, n);
@@ -1855,6 +1867,9 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
/* Ensure that the vcpu has an up-to-date time base. */
update_vcpu_system_time(next);
+ if ( !vcpu_runnable(next) )
+ sched_vcpu_idle(next);
+
/*
* Schedule tail *should* be a terminal function pointer, but leave a
* bug frame around just in case it returns, to save going back into the
@@ -1868,6 +1883,9 @@ void continue_running(struct vcpu *same)
{
context_wait_rendezvous_out(same->sched_item, NULL);
+ if ( !vcpu_runnable(same) )
+ sched_vcpu_idle(same);
+
/* See the comment above. */
same->domain->arch.ctxt_switch->tail(same);
BUG();
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1541,6 +1541,8 @@ int hvm_vcpu_initialise(struct vcpu *v)
hvm_set_guest_tsc(v, 0);
}
+ paging_update_paging_modes(v);
+
return 0;
fail6:
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3016,9 +3016,15 @@ int vcpu_destroy_pagetables(struct vcpu *v)
{
unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
struct page_info *page;
- l4_pgentry_t *l4tab = NULL;
+ l4_pgentry_t *l4tab = v->domain->arch.pv.l4tab_idle;
int rc = put_old_guest_table(v);
+ if ( l4tab && mfn == __virt_to_mfn(l4tab) )
+ {
+ v->arch.guest_table = pagetable_null();
+ mfn = 0;
+ }
+
if ( rc )
return rc;
@@ -3027,6 +3033,8 @@ int vcpu_destroy_pagetables(struct vcpu *v)
l4tab = map_domain_page(_mfn(mfn));
mfn = l4e_get_pfn(*l4tab);
}
+ else
+ l4tab = NULL;
if ( mfn )
{
--- a/xen/arch/x86/pv/descriptor-tables.c
+++ b/xen/arch/x86/pv/descriptor-tables.c
@@ -43,7 +43,7 @@ bool pv_destroy_ldt(struct vcpu *v)
if ( v->arch.pv.shadow_ldt_mapcnt == 0 )
goto out;
#else
- ASSERT(v == current || !vcpu_cpu_dirty(v));
+ ASSERT(v == current || !vcpu_cpu_dirty(v) || (v->pause_flags & VPF_down));
#endif
pl1e = pv_ldt_ptes(v);
@@ -80,7 +80,7 @@ void pv_destroy_gdt(struct vcpu *v)
l1_pgentry_t zero_l1e = l1e_from_mfn(zero_mfn, __PAGE_HYPERVISOR_RO);
unsigned int i;
- ASSERT(v == current || !vcpu_cpu_dirty(v));
+ ASSERT(v == current || !vcpu_cpu_dirty(v) || (v->pause_flags & VPF_down));
v->arch.pv.gdt_ents = 0;
for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
@@ -102,7 +102,7 @@ long pv_set_gdt(struct vcpu *v, unsigned long *frames, unsigned int entries)
l1_pgentry_t *pl1e;
unsigned int i, nr_frames = DIV_ROUND_UP(entries, 512);
- ASSERT(v == current || !vcpu_cpu_dirty(v));
+ ASSERT(v == current || !vcpu_cpu_dirty(v) || (v->pause_flags & VPF_down));
if ( entries > FIRST_RESERVED_GDT_ENTRY )
return -EINVAL;
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -259,6 +259,12 @@ int pv_vcpu_initialise(struct vcpu *v)
goto done;
}
+ if ( d->arch.pv.l4tab_idle )
+ {
+ v->arch.guest_table = pagetable_from_paddr(__pa(d->arch.pv.l4tab_idle));
+ update_cr3(v);
+ }
+
done:
if ( rc )
pv_vcpu_destroy(v);
@@ -275,6 +281,7 @@ void pv_domain_destroy(struct domain *d)
XFREE(d->arch.pv.cpuidmasks);
FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
+ FREE_XENHEAP_PAGE(d->arch.pv.l4tab_idle);
}
@@ -307,6 +314,18 @@ int pv_domain_initialise(struct domain *d)
d->arch.ctxt_switch = &pv_csw;
+ if ( sched_granularity > 1 )
+ {
+ l4_pgentry_t *l4;
+
+ l4 = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
+ if ( !l4 )
+ goto fail;
+ clear_page(l4);
+ init_xen_l4_slots(l4, _mfn(virt_to_mfn(l4)), d, INVALID_MFN, true);
+ d->arch.pv.l4tab_idle = l4;
+ }
+
/* 64-bit PV guest by default. */
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -56,7 +56,7 @@ int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
integer_param("sched_ratelimit_us", sched_ratelimit_us);
/* Number of vcpus per struct sched_item. */
-static unsigned int sched_granularity = 1;
+unsigned int sched_granularity = 1;
/* Various timer handlers. */
static void s_timer_fn(void *unused);
@@ -1124,6 +1124,17 @@ int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
return vcpu_set_affinity(v, affinity, v->sched_item->cpu_soft_affinity);
}
+void sched_vcpu_idle(struct vcpu *v)
+{
+ if ( !v->is_urgent )
+ {
+ v->is_urgent = 1;
+ atomic_inc(&per_cpu(sched_res, v->processor)->urgent_count);
+ }
+
+ reset_stack_and_jump(guest_idle_loop);
+}
+
/* Block the currently-executing domain until a pertinent event occurs. */
void vcpu_block(void)
{
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -254,6 +254,9 @@ struct pv_domain
atomic_t nr_l4_pages;
+ /* L4 tab for offline vcpus with scheduling granularity > 1. */
+ l4_pgentry_t *l4tab_idle;
+
/* XPTI active? */
bool xpti;
/* Use PCID feature? */
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -201,6 +201,9 @@ static inline unsigned int sched_get_resource_cpu(unsigned int cpu)
return per_cpu(sched_res, cpu)->processor;
}
+void sched_vcpu_idle(struct vcpu *v);
+void guest_idle_loop(void);
+
/*
* Scratch space, for avoiding having too many cpumask_t on the stack.
* Within each scheduler, when using the scratch mask of one pCPU:
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -488,6 +488,8 @@ extern struct vcpu *idle_vcpu[NR_CPUS];
#define is_idle_domain(d) ((d)->domain_id == DOMID_IDLE)
#define is_idle_vcpu(v) (is_idle_domain((v)->domain))
+extern unsigned int sched_granularity;
+
static inline bool is_system_domain(const struct domain *d)
{
return d->domain_id >= DOMID_FIRST_RESERVED;
With core scheduling active a single vcpu might need to idle while other vcpus of the same sched item are running. In order to avoid having to mix vcpus from different sched items on the same sched resource, we need a new idle mode for an active guest vcpu. This idle is similar to the idle_loop() of the idle vcpus, but without any tasklet work, memory scrubbing or live patch work. Deep sleep states are avoided by marking the vcpu as "urgent".

As the guest idle vcpu should still appear active from the hypervisor's point of view, a valid cr3 value needs to be in place even if the vcpu has not been initialized yet. For this purpose allocate an L4 page table for PV domains, or allocate the monitor table early for HVM domains.

Some assertions need to be relaxed to accept that an offline vcpu may now appear to be running.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/arch/arm/domain.c               | 14 ++++++++++++++
 xen/arch/x86/domain.c               | 20 +++++++++++++++++++-
 xen/arch/x86/hvm/hvm.c              |  2 ++
 xen/arch/x86/mm.c                   | 10 +++++++++-
 xen/arch/x86/pv/descriptor-tables.c |  6 +++---
 xen/arch/x86/pv/domain.c            | 19 +++++++++++++++++++
 xen/common/schedule.c               | 13 ++++++++++++-
 xen/include/asm-x86/domain.h        |  3 +++
 xen/include/xen/sched-if.h          |  3 +++
 xen/include/xen/sched.h             |  2 ++
 10 files changed, 86 insertions(+), 6 deletions(-)
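Note (illustration only, not part of the patch): the condensed sketch below summarises the control flow the hunks above introduce. schedule_tail_sketch() is a hypothetical helper standing in for the checks added to context_switch(), continue_running() and continue_new_vcpu(); all other identifiers are the ones used in the patch.

static void schedule_tail_sketch(struct vcpu *next)
{
    if ( is_idle_vcpu(next) )
        /* Real idle vcpu: use the normal hypervisor idle loop. */
        reset_stack_and_jump(idle_loop);

    if ( !vcpu_runnable(next) )
        /*
         * Offline or blocked vcpu whose sched item stays scheduled:
         * sched_vcpu_idle() marks it "urgent" (keeping cpuidle away from
         * deep sleep states), then jumps to guest_idle_loop(), which only
         * services softirqs and calls pm_idle()/do_idle(). It does not
         * return.
         */
        sched_vcpu_idle(next);

    /* Otherwise enter the guest as before. */
}

The point of the extra branch is that a sched resource never has to run vcpus of a different sched item while part of the current item is idle, which is the invariant core scheduling relies on.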