[RFC,42/49] xen/sched: add support for guest vcpu idle

Message ID: 20190329150934.17694-43-jgross@suse.com
State: Superseded
Series: xen: add core scheduling support

Commit Message

Jürgen Groß March 29, 2019, 3:09 p.m. UTC
With core scheduling active a single vcpu might need to idle while
other vcpus of its sched item are running. In order to avoid mixing
vcpus from different sched items on the same sched resource we need a
new idle mode for an otherwise active guest vcpu.
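
As an illustration only (not part of this patch, all names made up),
the constraint can be modeled like this: a sched resource always runs
the vcpus of exactly one sched item, and a vcpu of that item which is
not runnable idles in place instead of being replaced by another
item's vcpu:

    /* Toy model, not Xen code. */
    #include <stdbool.h>
    #include <stdio.h>

    #define VCPUS_PER_ITEM 2   /* e.g. two hyperthreads per core */

    struct vcpu { int id; bool runnable; };
    struct sched_item { struct vcpu vcpus[VCPUS_PER_ITEM]; };

    static void run_item_on_core(const struct sched_item *item)
    {
        for ( unsigned int t = 0; t < VCPUS_PER_ITEM; t++ )
        {
            const struct vcpu *v = &item->vcpus[t];

            if ( v->runnable )
                printf("thread %u: run vcpu %d\n", t, v->id);
            else
                printf("thread %u: vcpu %d idles in place\n", t, v->id);
        }
    }

    int main(void)
    {
        struct sched_item item = {
            .vcpus = { { .id = 0, .runnable = true },
                       { .id = 1, .runnable = false } }, /* offline vcpu */
        };

        run_item_on_core(&item);
        return 0;
    }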

This idle loop is similar to the idle_loop() of the idle vcpus, but
without any tasklet work, memory scrubbing or live patch work. We
avoid deep sleep states by marking the vcpu as "urgent".
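
Purely as an illustration (not Xen code, hypothetical names): the per
sched resource urgent count incremented for such a vcpu is meant to be
consulted by the cpuidle side, so that only shallow sleep states are
chosen while an urgent vcpu is present:

    /* Simplified, self-contained sketch of the urgent-count idea. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct sched_resource { atomic_int urgent_count; };
    struct vcpu { bool is_urgent; struct sched_resource *res; };

    /* Roughly what sched_vcpu_idle() below does before entering the loop. */
    static void mark_vcpu_urgent(struct vcpu *v)
    {
        if ( !v->is_urgent )
        {
            v->is_urgent = true;
            atomic_fetch_add(&v->res->urgent_count, 1);
        }
    }

    /* Idle handler side: stay shallow while urgent vcpus are present. */
    static const char *pick_sleep_state(struct sched_resource *res)
    {
        return atomic_load(&res->urgent_count) ? "C1" : "C3";
    }

    int main(void)
    {
        struct sched_resource res = { 0 };
        struct vcpu v = { .is_urgent = false, .res = &res };

        printf("%s\n", pick_sleep_state(&res));  /* C3: deep sleep allowed */
        mark_vcpu_urgent(&v);
        printf("%s\n", pick_sleep_state(&res));  /* C1: stay shallow */
        return 0;
    }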

As the guest idle vcpu should still be active from the hypervisor's
point of view, we need a valid cr3 value in place even if the vcpu has
not been initialized yet. For this purpose allocate an L4 page table
for PV domains, or allocate the monitor table early for HVM domains.

Some assertions need to be relaxed to accept an offline vcpu that now
appears to be running.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/arch/arm/domain.c               | 14 ++++++++++++++
 xen/arch/x86/domain.c               | 20 +++++++++++++++++++-
 xen/arch/x86/hvm/hvm.c              |  2 ++
 xen/arch/x86/mm.c                   | 10 +++++++++-
 xen/arch/x86/pv/descriptor-tables.c |  6 +++---
 xen/arch/x86/pv/domain.c            | 19 +++++++++++++++++++
 xen/common/schedule.c               | 13 ++++++++++++-
 xen/include/asm-x86/domain.h        |  3 +++
 xen/include/xen/sched-if.h          |  3 +++
 xen/include/xen/sched.h             |  2 ++
 10 files changed, 86 insertions(+), 6 deletions(-)

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 6dc633ed50..881523d87f 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -59,6 +59,18 @@  static void do_idle(void)
     sched_tick_resume();
 }
 
+void guest_idle_loop(void)
+{
+    unsigned int cpu = smp_processor_id();
+
+    for ( ; ; )
+    {
+        if ( !softirq_pending(cpu) )
+            do_idle();
+        do_softirq();
+    }
+}
+
 void idle_loop(void)
 {
     unsigned int cpu = smp_processor_id();
@@ -329,6 +341,8 @@  static void continue_new_vcpu(struct vcpu *prev)
 
     if ( is_idle_vcpu(current) )
         reset_stack_and_jump(idle_loop);
+    else if ( !vcpu_runnable(current) )
+        sched_vcpu_idle(current);
     else if ( is_32bit_domain(current->domain) )
         /* check_wakeup_from_wait(); */
         reset_stack_and_jump(return_to_new_vcpu32);
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 5e764d8a54..9acf2e9792 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -126,6 +126,18 @@  static void play_dead(void)
     (*dead_idle)();
 }
 
+void guest_idle_loop(void)
+{
+    unsigned int cpu = smp_processor_id();
+
+    for ( ; ; )
+    {
+        if ( !softirq_pending(cpu) )
+            pm_idle();
+        do_softirq();
+    }
+}
+
 static void idle_loop(void)
 {
     unsigned int cpu = smp_processor_id();
@@ -1702,7 +1714,7 @@  static void __context_switch(void)
     gdt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) :
                                     per_cpu(compat_gdt_table, cpu);
 
-    need_full_gdt_n = need_full_gdt(nd);
+    need_full_gdt_n = need_full_gdt(nd) && is_vcpu_online(n);
 
     if ( need_full_gdt_n )
         write_full_gdt_ptes(gdt, n);
@@ -1855,6 +1867,9 @@  void context_switch(struct vcpu *prev, struct vcpu *next)
     /* Ensure that the vcpu has an up-to-date time base. */
     update_vcpu_system_time(next);
 
+    if ( !vcpu_runnable(next) )
+        sched_vcpu_idle(next);
+
     /*
      * Schedule tail *should* be a terminal function pointer, but leave a
      * bug frame around just in case it returns, to save going back into the
@@ -1868,6 +1883,9 @@  void continue_running(struct vcpu *same)
 {
     context_wait_rendezvous_out(same->sched_item, NULL);
 
+    if ( !vcpu_runnable(same) )
+        sched_vcpu_idle(same);
+
     /* See the comment above. */
     same->domain->arch.ctxt_switch->tail(same);
     BUG();
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index f184136f81..6668df9f3b 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1541,6 +1541,8 @@  int hvm_vcpu_initialise(struct vcpu *v)
         hvm_set_guest_tsc(v, 0);
     }
 
+    paging_update_paging_modes(v);
+
     return 0;
 
  fail6:
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index dbec130da0..a3d97adfca 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3016,9 +3016,15 @@  int vcpu_destroy_pagetables(struct vcpu *v)
 {
     unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
     struct page_info *page;
-    l4_pgentry_t *l4tab = NULL;
+    l4_pgentry_t *l4tab = v->domain->arch.pv.l4tab_idle;
     int rc = put_old_guest_table(v);
 
+    if ( l4tab && mfn == __virt_to_mfn(l4tab) )
+    {
+        v->arch.guest_table = pagetable_null();
+        mfn = 0;
+    }
+
     if ( rc )
         return rc;
 
@@ -3027,6 +3033,8 @@  int vcpu_destroy_pagetables(struct vcpu *v)
         l4tab = map_domain_page(_mfn(mfn));
         mfn = l4e_get_pfn(*l4tab);
     }
+    else
+        l4tab = NULL;
 
     if ( mfn )
     {
diff --git a/xen/arch/x86/pv/descriptor-tables.c b/xen/arch/x86/pv/descriptor-tables.c
index 940804b18a..1bcb1c2dd6 100644
--- a/xen/arch/x86/pv/descriptor-tables.c
+++ b/xen/arch/x86/pv/descriptor-tables.c
@@ -43,7 +43,7 @@  bool pv_destroy_ldt(struct vcpu *v)
     if ( v->arch.pv.shadow_ldt_mapcnt == 0 )
         goto out;
 #else
-    ASSERT(v == current || !vcpu_cpu_dirty(v));
+    ASSERT(v == current || !vcpu_cpu_dirty(v) || (v->pause_flags & VPF_down));
 #endif
 
     pl1e = pv_ldt_ptes(v);
@@ -80,7 +80,7 @@  void pv_destroy_gdt(struct vcpu *v)
     l1_pgentry_t zero_l1e = l1e_from_mfn(zero_mfn, __PAGE_HYPERVISOR_RO);
     unsigned int i;
 
-    ASSERT(v == current || !vcpu_cpu_dirty(v));
+    ASSERT(v == current || !vcpu_cpu_dirty(v) || (v->pause_flags & VPF_down));
 
     v->arch.pv.gdt_ents = 0;
     for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
@@ -102,7 +102,7 @@  long pv_set_gdt(struct vcpu *v, unsigned long *frames, unsigned int entries)
     l1_pgentry_t *pl1e;
     unsigned int i, nr_frames = DIV_ROUND_UP(entries, 512);
 
-    ASSERT(v == current || !vcpu_cpu_dirty(v));
+    ASSERT(v == current || !vcpu_cpu_dirty(v) || (v->pause_flags & VPF_down));
 
     if ( entries > FIRST_RESERVED_GDT_ENTRY )
         return -EINVAL;
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 4b6f48dea2..3ecdb96e8e 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -259,6 +259,12 @@  int pv_vcpu_initialise(struct vcpu *v)
             goto done;
     }
 
+    if ( d->arch.pv.l4tab_idle )
+    {
+        v->arch.guest_table = pagetable_from_paddr(__pa(d->arch.pv.l4tab_idle));
+        update_cr3(v);
+    }
+
  done:
     if ( rc )
         pv_vcpu_destroy(v);
@@ -275,6 +281,7 @@  void pv_domain_destroy(struct domain *d)
     XFREE(d->arch.pv.cpuidmasks);
 
     FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
+    FREE_XENHEAP_PAGE(d->arch.pv.l4tab_idle);
 }
 
 
@@ -307,6 +314,18 @@  int pv_domain_initialise(struct domain *d)
 
     d->arch.ctxt_switch = &pv_csw;
 
+    if ( sched_granularity > 1 )
+    {
+        l4_pgentry_t *l4;
+
+        l4 = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
+        if ( !l4 )
+            goto fail;
+        clear_page(l4);
+        init_xen_l4_slots(l4, _mfn(virt_to_mfn(l4)), d, INVALID_MFN, true);
+        d->arch.pv.l4tab_idle = l4;
+    }
+
     /* 64-bit PV guest by default. */
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index d33efbcdc5..d2a02aea34 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -56,7 +56,7 @@  int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
 integer_param("sched_ratelimit_us", sched_ratelimit_us);
 
 /* Number of vcpus per struct sched_item. */
-static unsigned int sched_granularity = 1;
+unsigned int sched_granularity = 1;
 
 /* Various timer handlers. */
 static void s_timer_fn(void *unused);
@@ -1124,6 +1124,17 @@  int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
     return vcpu_set_affinity(v, affinity, v->sched_item->cpu_soft_affinity);
 }
 
+void sched_vcpu_idle(struct vcpu *v)
+{
+    if ( !v->is_urgent )
+    {
+        v->is_urgent = 1;
+        atomic_inc(&per_cpu(sched_res, v->processor)->urgent_count);
+    }
+
+    reset_stack_and_jump(guest_idle_loop);
+}
+
 /* Block the currently-executing domain until a pertinent event occurs. */
 void vcpu_block(void)
 {
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 214e44ce1c..695292456b 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -254,6 +254,9 @@  struct pv_domain
 
     atomic_t nr_l4_pages;
 
+    /* L4 tab for offline vcpus with scheduling granularity > 1. */
+    l4_pgentry_t *l4tab_idle;
+
     /* XPTI active? */
     bool xpti;
     /* Use PCID feature? */
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 49724aafd0..4a3fb092c2 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -201,6 +201,9 @@  static inline unsigned int sched_get_resource_cpu(unsigned int cpu)
     return per_cpu(sched_res, cpu)->processor;
 }
 
+void sched_vcpu_idle(struct vcpu *v);
+void guest_idle_loop(void);
+
 /*
  * Scratch space, for avoiding having too many cpumask_t on the stack.
  * Within each scheduler, when using the scratch mask of one pCPU:
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 873a903977..52a1abfca9 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -488,6 +488,8 @@  extern struct vcpu *idle_vcpu[NR_CPUS];
 #define is_idle_domain(d) ((d)->domain_id == DOMID_IDLE)
 #define is_idle_vcpu(v)   (is_idle_domain((v)->domain))
 
+extern unsigned int sched_granularity;
+
 static inline bool is_system_domain(const struct domain *d)
 {
     return d->domain_id >= DOMID_FIRST_RESERVED;