diff mbox series

[4/8] x86/paging: move and conditionalize flush_tlb() hook

Message ID 584a986e-08ea-d064-9447-ed23c6e39721@suse.com (mailing list archive)
State Superseded
Headers show
Series x86/mm: address aspects noticed during XSA-410 work | expand

Commit Message

Jan Beulich Dec. 21, 2022, 1:26 p.m. UTC
The hook isn't mode dependent, hence it's misplaced in struct
paging_mode. (Or alternatively I see no reason why the alloc_page() and
free_page() hooks don't also live there.) Move it to struct
paging_domain.

The hook is also used for HVM guests only, so make the respective pieces
conditional upon CONFIG_HVM.

While there, also add __must_check to the hook declaration, as it's
imperative that callers deal with getting back "false".

While moving the shadow implementation, introduce a "curr" local
variable.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

Comments

Andrew Cooper Dec. 21, 2022, 6:43 p.m. UTC | #1
On 21/12/2022 1:26 pm, Jan Beulich wrote:
> The hook isn't mode dependent, hence it's misplaced in struct
> paging_mode. (Or alternatively I see no reason why the alloc_page() and
> free_page() hooks don't also live there.) Move it to struct
> paging_domain.

How you flush the TLBs has absolutely nothing to do with what mode the
guest is in.

But this hook too confuses p2m flushes with vcpu flushes.

> The hook also is used for HVM guests only, so make respective pieces
> conditional upon CONFIG_HVM.
>
> While there also add __must_check to the hook declaration, as it's
> imperative that callers deal with getting back "false".
>
> While moving the shadow implementation, introduce a "curr" local
> variable.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>, with two
observations.

> --- a/xen/arch/x86/include/asm/paging.h
> +++ b/xen/arch/x86/include/asm/paging.h
> @@ -300,6 +299,12 @@ static inline unsigned long paging_ga_to
>          page_order);
>  }
>  
> +/* Flush selected vCPUs TLBs.  NULL for all. */
> +static inline bool paging_flush_tlb(const unsigned long *vcpu_bitmap)
> +{
> +    return current->domain->arch.paging.flush_tlb(vcpu_bitmap);

Not for this patch, but for cases like this, we should probably drop the
function pointer.

There are only two options, and they're invariant for the context, so

if ( hap )
    hap_flush_tlb(...);
else
    shadow_flush_tlb(...);

will almost certainly be faster on any CPU that Xen is liable to run
on, especially as HAP is probably the ~100% common case.

> --- a/xen/arch/x86/mm/shadow/hvm.c
> +++ b/xen/arch/x86/mm/shadow/hvm.c
> @@ -688,6 +688,66 @@ static void sh_emulate_unmap_dest(struct
>      atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
>  }
>  
> +static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap)
> +{
> +    return !vcpu_bitmap || test_bit(v->vcpu_id, vcpu_bitmap);
> +}
> +
> +/* Flush TLB of selected vCPUs.  NULL for all. */
> +bool cf_check shadow_flush_tlb(const unsigned long *vcpu_bitmap)
> +{
> +    static DEFINE_PER_CPU(cpumask_t, flush_cpumask);

The hap and shadow variants both have a static percpu mask like this.

However, this is an irqs-on region with no nested locking, so I suspect
this path can share one of the scheduler percpu variables too.

~Andrew
diff mbox series

Patch

--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -237,6 +237,11 @@  struct paging_domain {
     void (*free_page)(struct domain *d, struct page_info *pg);
 
     void (*update_paging_mode)(struct vcpu *v);
+
+#ifdef CONFIG_HVM
+    /* Flush selected vCPUs TLBs.  NULL for all. */
+    bool __must_check (*flush_tlb)(const unsigned long *vcpu_bitmap);
+#endif
 };
 
 struct paging_vcpu {
--- a/xen/arch/x86/include/asm/paging.h
+++ b/xen/arch/x86/include/asm/paging.h
@@ -140,7 +140,6 @@  struct paging_mode {
 #endif
     void          (*update_cr3            )(struct vcpu *v, int do_locking,
                                             bool noflush);
-    bool          (*flush_tlb             )(const unsigned long *vcpu_bitmap);
 
     unsigned int guest_levels;
 
@@ -300,6 +299,12 @@  static inline unsigned long paging_ga_to
         page_order);
 }
 
+/* Flush selected vCPUs TLBs.  NULL for all. */
+static inline bool paging_flush_tlb(const unsigned long *vcpu_bitmap)
+{
+    return current->domain->arch.paging.flush_tlb(vcpu_bitmap);
+}
+
 #endif /* CONFIG_HVM */
 
 /* Update all the things that are derived from the guest's CR3.
@@ -408,12 +413,6 @@  static always_inline unsigned int paging
     return bits;
 }
 
-/* Flush selected vCPUs TLBs.  NULL for all. */
-static inline bool paging_flush_tlb(const unsigned long *vcpu_bitmap)
-{
-    return paging_get_hostmode(current)->flush_tlb(vcpu_bitmap);
-}
-
 #endif /* XEN_PAGING_H */
 
 /*
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -445,6 +445,7 @@  static void hap_destroy_monitor_table(st
 /************************************************/
 
 static void cf_check hap_update_paging_mode(struct vcpu *v);
+static bool cf_check flush_tlb(const unsigned long *vcpu_bitmap);
 
 void hap_domain_init(struct domain *d)
 {
@@ -458,6 +459,7 @@  void hap_domain_init(struct domain *d)
     paging_log_dirty_init(d, &hap_ops);
 
     d->arch.paging.update_paging_mode = hap_update_paging_mode;
+    d->arch.paging.flush_tlb          = flush_tlb;
 }
 
 /* return 0 for success, -errno for failure */
@@ -847,7 +849,6 @@  static const struct paging_mode hap_pagi
     .gva_to_gfn             = hap_gva_to_gfn_real_mode,
     .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_real_mode,
     .update_cr3             = hap_update_cr3,
-    .flush_tlb              = flush_tlb,
     .guest_levels           = 1
 };
 
@@ -857,7 +858,6 @@  static const struct paging_mode hap_pagi
     .gva_to_gfn             = hap_gva_to_gfn_2_levels,
     .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_2_levels,
     .update_cr3             = hap_update_cr3,
-    .flush_tlb              = flush_tlb,
     .guest_levels           = 2
 };
 
@@ -867,7 +867,6 @@  static const struct paging_mode hap_pagi
     .gva_to_gfn             = hap_gva_to_gfn_3_levels,
     .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_3_levels,
     .update_cr3             = hap_update_cr3,
-    .flush_tlb              = flush_tlb,
     .guest_levels           = 3
 };
 
@@ -877,7 +876,6 @@  static const struct paging_mode hap_pagi
     .gva_to_gfn             = hap_gva_to_gfn_4_levels,
     .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_4_levels,
     .update_cr3             = hap_update_cr3,
-    .flush_tlb              = flush_tlb,
     .guest_levels           = 4
 };
 
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -68,6 +68,7 @@  int shadow_domain_init(struct domain *d)
     d->arch.paging.shadow.oos_active = 0;
 #endif
 #ifdef CONFIG_HVM
+    d->arch.paging.flush_tlb = shadow_flush_tlb;
     d->arch.paging.shadow.pagetable_dying_op = 0;
 #endif
 
@@ -3134,66 +3135,6 @@  static void cf_check sh_clean_dirty_bitm
     paging_unlock(d);
 }
 
-
-static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap)
-{
-    return !vcpu_bitmap || test_bit(v->vcpu_id, vcpu_bitmap);
-}
-
-/* Flush TLB of selected vCPUs.  NULL for all. */
-bool cf_check shadow_flush_tlb(const unsigned long *vcpu_bitmap)
-{
-    static DEFINE_PER_CPU(cpumask_t, flush_cpumask);
-    cpumask_t *mask = &this_cpu(flush_cpumask);
-    struct domain *d = current->domain;
-    struct vcpu *v;
-
-    /* Avoid deadlock if more than one vcpu tries this at the same time. */
-    if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
-        return false;
-
-    /* Pause all other vcpus. */
-    for_each_vcpu ( d, v )
-        if ( v != current && flush_vcpu(v, vcpu_bitmap) )
-            vcpu_pause_nosync(v);
-
-    /* Now that all VCPUs are signalled to deschedule, we wait... */
-    for_each_vcpu ( d, v )
-        if ( v != current && flush_vcpu(v, vcpu_bitmap) )
-            while ( !vcpu_runnable(v) && v->is_running )
-                cpu_relax();
-
-    /* All other vcpus are paused, safe to unlock now. */
-    spin_unlock(&d->hypercall_deadlock_mutex);
-
-    cpumask_clear(mask);
-
-    /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
-    for_each_vcpu ( d, v )
-    {
-        unsigned int cpu;
-
-        if ( !flush_vcpu(v, vcpu_bitmap) )
-            continue;
-
-        paging_update_cr3(v, false);
-
-        cpu = read_atomic(&v->dirty_cpu);
-        if ( is_vcpu_dirty_cpu(cpu) )
-            __cpumask_set_cpu(cpu, mask);
-    }
-
-    /* Flush TLBs on all CPUs with dirty vcpu state. */
-    guest_flush_tlb_mask(d, mask);
-
-    /* Done. */
-    for_each_vcpu ( d, v )
-        if ( v != current && flush_vcpu(v, vcpu_bitmap) )
-            vcpu_unpause(v);
-
-    return true;
-}
-
 /**************************************************************************/
 /* Shadow-control XEN_DOMCTL dispatcher */
 
--- a/xen/arch/x86/mm/shadow/hvm.c
+++ b/xen/arch/x86/mm/shadow/hvm.c
@@ -688,6 +688,66 @@  static void sh_emulate_unmap_dest(struct
     atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
 }
 
+static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap)
+{
+    return !vcpu_bitmap || test_bit(v->vcpu_id, vcpu_bitmap);
+}
+
+/* Flush TLB of selected vCPUs.  NULL for all. */
+bool cf_check shadow_flush_tlb(const unsigned long *vcpu_bitmap)
+{
+    static DEFINE_PER_CPU(cpumask_t, flush_cpumask);
+    cpumask_t *mask = &this_cpu(flush_cpumask);
+    const struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    struct vcpu *v;
+
+    /* Avoid deadlock if more than one vcpu tries this at the same time. */
+    if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
+        return false;
+
+    /* Pause all other vcpus. */
+    for_each_vcpu ( d, v )
+        if ( v != curr && flush_vcpu(v, vcpu_bitmap) )
+            vcpu_pause_nosync(v);
+
+    /* Now that all VCPUs are signalled to deschedule, we wait... */
+    for_each_vcpu ( d, v )
+        if ( v != curr && flush_vcpu(v, vcpu_bitmap) )
+            while ( !vcpu_runnable(v) && v->is_running )
+                cpu_relax();
+
+    /* All other vcpus are paused, safe to unlock now. */
+    spin_unlock(&d->hypercall_deadlock_mutex);
+
+    cpumask_clear(mask);
+
+    /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
+    for_each_vcpu ( d, v )
+    {
+        unsigned int cpu;
+
+        if ( !flush_vcpu(v, vcpu_bitmap) )
+            continue;
+
+        paging_update_cr3(v, false);
+
+        cpu = read_atomic(&v->dirty_cpu);
+        if ( is_vcpu_dirty_cpu(cpu) )
+            __cpumask_set_cpu(cpu, mask);
+    }
+
+    /* Flush TLBs on all CPUs with dirty vcpu state. */
+    guest_flush_tlb_mask(d, mask);
+
+    /* Done. */
+    for_each_vcpu ( d, v )
+        if ( v != curr && flush_vcpu(v, vcpu_bitmap) )
+            vcpu_unpause(v);
+
+    return true;
+}
+
 mfn_t sh_make_monitor_table(const struct vcpu *v, unsigned int shadow_levels)
 {
     struct domain *d = v->domain;
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -4198,7 +4198,6 @@  const struct paging_mode sh_paging_mode
     .gva_to_gfn                    = sh_gva_to_gfn,
 #endif
     .update_cr3                    = sh_update_cr3,
-    .flush_tlb                     = shadow_flush_tlb,
     .guest_levels                  = GUEST_PAGING_LEVELS,
     .shadow.detach_old_tables      = sh_detach_old_tables,
 #ifdef CONFIG_PV