@@ -45,6 +45,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
+#include <asm/fixmap.h>
#include <asm/i387.h>
#include <asm/xstate.h>
#include <asm/cpuidle.h>
@@ -2110,11 +2111,47 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
local_irq_disable();
+ if ( is_pv_domain(prevd) && prevd->arch.asi )
+ {
+ /*
+ * Don't leak the L4 shadow mapping in the per-CPU area. Can't be done
+ * in paravirt_ctxt_switch_from() because the lazy idle vCPU context
+ * switch would otherwise enter an infinite loop in
+ * mapcache_current_vcpu() with sync_local_execstate().
+ *
+ * Note that clearing the fixmap must strictly be done ahead of changing the
+ * current vCPU and with interrupts disabled, so there's no window
+ * where current->domain->arch.asi == true and PCPU_FIX_PV_L4SHADOW is
+ * not mapped.
+ */
+ percpu_clear_fixmap(PCPU_FIX_PV_L4SHADOW);
+ get_cpu_info()->root_pgt_changed = false;
+ }
+
set_current(next);
if ( (per_cpu(curr_vcpu, cpu) == next) ||
(is_idle_domain(nextd) && cpu_online(cpu)) )
{
+ if ( is_pv_domain(nextd) && nextd->arch.asi )
+ {
+ /* Signal that the fixmap entry must be mapped. */
+ get_cpu_info()->new_cr3 = true;
+ if ( get_cpu_info()->root_pgt_changed )
+ {
+ /*
+ * Map and update the shadow L4 in case we received any
+ * FLUSH_ROOT_PGTBL request while running on the idle vCPU.
+ *
+ * Do it before enabling interrupts so that no flush IPI can be
+ * delivered without having PCPU_FIX_PV_L4SHADOW correctly
+ * mapped.
+ */
+ pv_update_shadow_l4(next, true);
+ get_cpu_info()->root_pgt_changed = false;
+ }
+ }
+
local_irq_enable();
}
else
@@ -17,6 +17,7 @@
#include <asm/nops.h>
#include <asm/page.h>
#include <asm/pv/domain.h>
+#include <asm/pv/mm.h>
#include <asm/spec_ctrl.h>
/* Debug builds: Wrap frequently to stress-test the wrap logic. */
@@ -192,7 +193,15 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
unsigned int order = (flags - 1) & FLUSH_ORDER_MASK;
if ( flags & FLUSH_ROOT_PGTBL )
+ {
+ const struct vcpu *curr = current;
+ const struct domain *curr_d = curr->domain;
+
get_cpu_info()->root_pgt_changed = true;
+ if ( is_pv_domain(curr_d) && curr_d->arch.asi )
+ /* Update the shadow root page-table ahead of doing the TLB flush. */
+ pv_update_shadow_l4(curr, false);
+ }
if ( flags & (FLUSH_TLB|FLUSH_TLB_GLOBAL) )
{
@@ -60,10 +60,14 @@ struct cpu_info {
uint8_t scf; /* SCF_* */
/*
- * The following field controls copying of the L4 page table of 64-bit
- * PV guests to the per-cpu root page table on entering the guest context.
- * If set the L4 page table is being copied to the root page table and
- * the field will be reset.
+ * For XPTI the following field controls copying of the L4 page table of
+ * 64-bit PV guests to the per-cpu root page table on entering the guest
+ * context. If set the L4 page table is being copied to the root page
+ * table and the field will be reset.
+ *
+ * For ASI the field is used to record whether a FLUSH_ROOT_PGTBL request
+ * has been received while running the idle vCPU on PV guest page-tables
+ * (i.e. after a lazy context switch to the idle vCPU).
*/
bool root_pgt_changed;
@@ -74,6 +78,9 @@ struct cpu_info {
*/
bool use_pv_cr3;
+ /* For ASI: per-CPU fixmap of guest L4 is possibly out of sync. */
+ bool new_cr3;
+
/* get_stack_bottom() must be 16-byte aligned */
};
@@ -120,6 +120,7 @@ extern void __set_fixmap_x(
/* per-CPU fixmap area. */
enum percpu_fixed_addresses {
+ PCPU_FIX_PV_L4SHADOW,
__end_of_percpu_fixed_addresses
};
@@ -23,6 +23,9 @@ bool pv_destroy_ldt(struct vcpu *v);
int validate_segdesc_page(struct page_info *page);
+void pv_clear_l4_guest_entries(root_pgentry_t *root_pgt);
+void pv_update_shadow_l4(const struct vcpu *v, bool flush);
+
#else
#include <xen/errno.h>
@@ -44,6 +47,11 @@ static inline bool pv_map_ldt_shadow_page(unsigned int off) { return false; }
static inline bool pv_destroy_ldt(struct vcpu *v)
{ ASSERT_UNREACHABLE(); return false; }
+static inline void pv_clear_l4_guest_entries(root_pgentry_t *root_pgt)
+{ ASSERT_UNREACHABLE(); }
+static inline void pv_update_shadow_l4(const struct vcpu *v, bool flush)
+{ ASSERT_UNREACHABLE(); }
+
#endif
#endif /* __X86_PV_MM_H__ */
@@ -513,6 +513,8 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT;
if ( is_pv_domain(d) && d->arch.pv.pcid )
v->arch.cr3 |= get_pcid_bits(v, false);
+ if ( is_pv_domain(d) && d->arch.asi )
+ get_cpu_info()->new_cr3 = true;
}
void write_ptbase(struct vcpu *v)
@@ -532,6 +534,40 @@ void write_ptbase(struct vcpu *v)
cpu_info->pv_cr3 |= get_pcid_bits(v, true);
switch_cr3_cr4(v->arch.cr3, new_cr4);
}
+ else if ( is_pv_domain(d) && d->arch.asi )
+ {
+ root_pgentry_t *root_pgt = this_cpu(root_pgt);
+ unsigned long cr3 = __pa(root_pgt);
+
+ /*
+ * XPTI and ASI cannot be simultaneously used even by different
+ * domains at runtime.
+ */
+ ASSERT(!cpu_info->use_pv_cr3 && !cpu_info->xen_cr3 &&
+ !cpu_info->pv_cr3);
+
+ if ( new_cr4 & X86_CR4_PCIDE )
+ cr3 |= get_pcid_bits(v, false);
+
+ /*
+ * Zap guest L4 entries ahead of flushing the TLB, so that the CPU
+ * cannot speculatively populate the TLB with stale mappings.
+ */
+ pv_clear_l4_guest_entries(root_pgt);
+
+ /*
+ * Switch to the shadow L4 with just the Xen slots populated; the guest
+ * slots will be populated by pv_update_shadow_l4() once running on the
+ * shadow L4.
+ *
+ * The reason for switching to the per-CPU shadow L4 before updating
+ * the guest slots is that pv_update_shadow_l4() uses per-CPU mappings,
+ * and the page-table in use prior to the switch_cr3_cr4() call might
+ * not support per-CPU mappings.
+ */
+ switch_cr3_cr4(cr3, new_cr4);
+ pv_update_shadow_l4(v, false);
+ }
else
{
ASSERT(!is_hvm_domain(d) || !d->arch.asi
@@ -6505,6 +6541,17 @@ void setup_perdomain_slot(const struct vcpu *v, root_pgentry_t *root_pgt)
ASSERT(l3);
populate_perdomain(d, root_pgt, l3);
+
+ if ( is_pv_domain(d) )
+ {
+ /*
+ * Abuse the fact that this function is called on vCPU context
+ * switch and clear the previous guest-controlled slots from the
+ * shadow L4.
+ */
+ pv_clear_l4_guest_entries(root_pgt);
+ get_cpu_info()->new_cr3 = true;
+ }
}
else if ( is_hvm_domain(d) || d->arch.pv.xpti )
l4e_write(&root_pgt[root_table_offset(PERDOMAIN_VIRT_START)],
@@ -15,6 +15,7 @@
#include <asm/invpcid.h>
#include <asm/spec_ctrl.h>
#include <asm/pv/domain.h>
+#include <asm/pv/mm.h>
#include <asm/shadow.h>
#ifdef CONFIG_PV32
@@ -384,7 +385,7 @@ int pv_domain_initialise(struct domain *d)
d->arch.ctxt_switch = &pv_csw;
- d->arch.pv.flush_root_pt = d->arch.pv.xpti;
+ d->arch.pv.flush_root_pt = d->arch.pv.xpti || d->arch.asi;
if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
switch ( ACCESS_ONCE(opt_pcid) )
@@ -446,7 +447,27 @@ static void _toggle_guest_pt(struct vcpu *v)
* to release). Switch to the idle page tables in such an event; the
* guest will have been crashed already.
*/
- cr3 = v->arch.cr3;
+ if ( v->domain->arch.asi )
+ {
+ /*
+ * _toggle_guest_pt() might switch between user and kernel page tables,
+ * but doesn't use write_ptbase(), and hence needs an explicit call to
+ * sync the shadow L4.
+ */
+ cr3 = __pa(this_cpu(root_pgt));
+ if ( v->domain->arch.pv.pcid )
+ cr3 |= get_pcid_bits(v, false);
+ /*
+ * Ensure the current root page table is already the shadow L4, as
+ * guest user/kernel switches can only happen once the guest is
+ * running.
+ */
+ ASSERT(read_cr3() == cr3);
+ pv_update_shadow_l4(v, false);
+ }
+ else
+ cr3 = v->arch.cr3;
+
if ( shadow_mode_enabled(v->domain) )
{
cr3 &= ~X86_CR3_NOFLUSH;
@@ -11,6 +11,7 @@
#include <xen/guest_access.h>
#include <asm/current.h>
+#include <asm/fixmap.h>
#include <asm/p2m.h>
#include "mm.h"
@@ -103,6 +104,57 @@ void init_xen_pae_l2_slots(l2_pgentry_t *l2t, const struct domain *d)
}
#endif
+void pv_clear_l4_guest_entries(root_pgentry_t *root_pgt)
+{
+ unsigned int i;
+
+ for ( i = 0; i < ROOT_PAGETABLE_FIRST_XEN_SLOT; i++ )
+ l4e_write(&root_pgt[i], l4e_empty());
+ for ( i = ROOT_PAGETABLE_LAST_XEN_SLOT + 1; i < L4_PAGETABLE_ENTRIES; i++ )
+ l4e_write(&root_pgt[i], l4e_empty());
+}
+
+void pv_update_shadow_l4(const struct vcpu *v, bool flush)
+{
+ const root_pgentry_t *guest_pgt = percpu_fix_to_virt(PCPU_FIX_PV_L4SHADOW);
+ root_pgentry_t *shadow_pgt = this_cpu(root_pgt);
+
+ ASSERT(!v->domain->arch.pv.xpti);
+ ASSERT(is_pv_vcpu(v));
+ ASSERT(!is_idle_vcpu(v));
+
+ if ( get_cpu_info()->new_cr3 )
+ {
+ percpu_set_fixmap(PCPU_FIX_PV_L4SHADOW, maddr_to_mfn(v->arch.cr3),
+ __PAGE_HYPERVISOR_RO);
+ get_cpu_info()->new_cr3 = false;
+ }
+
+ if ( is_pv_32bit_vcpu(v) )
+ {
+ l4e_write(&shadow_pgt[0], guest_pgt[0]);
+ l4e_write(&shadow_pgt[root_table_offset(PERDOMAIN_ALT_VIRT_START)],
+ shadow_pgt[root_table_offset(PERDOMAIN_VIRT_START)]);
+ }
+ else
+ {
+ unsigned int i;
+
+ for ( i = 0; i < ROOT_PAGETABLE_FIRST_XEN_SLOT; i++ )
+ l4e_write(&shadow_pgt[i], guest_pgt[i]);
+ for ( i = ROOT_PAGETABLE_LAST_XEN_SLOT + 1;
+ i < L4_PAGETABLE_ENTRIES; i++ )
+ l4e_write(&shadow_pgt[i], guest_pgt[i]);
+
+ /* The presence of this Xen slot is selected by the guest. */
+ l4e_write(&shadow_pgt[l4_table_offset(RO_MPT_VIRT_START)],
+ guest_pgt[l4_table_offset(RO_MPT_VIRT_START)]);
+ }
+
+ if ( flush )
+ flush_local(FLUSH_TLB_GLOBAL);
+}
+
/*
* Local variables:
* mode: C
@@ -829,7 +829,7 @@ int setup_cpu_root_pgt(unsigned int cpu)
unsigned int off;
int rc;
- if ( !opt_xpti_hwdom && !opt_xpti_domu )
+ if ( !opt_xpti_hwdom && !opt_xpti_domu && !opt_asi_pv )
return 0;
rpt = alloc_xenheap_page();
@@ -839,6 +839,18 @@ int setup_cpu_root_pgt(unsigned int cpu)
clear_page(rpt);
per_cpu(root_pgt, cpu) = rpt;
+ if ( opt_asi_pv )
+ {
+ /*
+ * Populate the Xen slots; the guest ones will be copied from the guest
+ * root page-table.
+ */
+ init_xen_l4_slots(rpt, _mfn(virt_to_mfn(rpt)), INVALID_MFN, NULL,
+ false, false, true);
+
+ return 0;
+ }
+
rpt[root_table_offset(RO_MPT_VIRT_START)] =
idle_pg_table[root_table_offset(RO_MPT_VIRT_START)];
/* SH_LINEAR_PT inserted together with guest mappings. */
@@ -892,6 +904,12 @@ static void cleanup_cpu_root_pgt(unsigned int cpu)
per_cpu(root_pgt, cpu) = NULL;
+ if ( opt_asi_pv )
+ {
+ free_xenheap_page(rpt);
+ return;
+ }
+
for ( r = root_table_offset(DIRECTMAP_VIRT_START);
r < root_table_offset(HYPERVISOR_VIRT_END); ++r )
{
When running PV guests it's possible for the guest to use the same root page
table (L4) for all vCPUs, which in turn will result in Xen also using the same
root page table on all pCPUs that are running any domain vCPU.

When using XPTI Xen switches to a per-CPU shadow L4 when running in guest
context, switching to the fully populated L4 when in Xen context.

Take advantage of this existing shadowing and force the usage of a per-CPU L4
that shadows the guest-selected L4 when Address Space Isolation is requested
for PV guests.

The mapping of the guest L4 is done with a per-CPU fixmap entry, which however
requires that the currently loaded L4 has the per-CPU slot set up.  In order
to ensure this, switch to the shadow per-CPU L4 with just the Xen slots
populated, and then map the guest L4 and copy the contents of the
guest-controlled slots.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/domain.c              | 37 +++++++++++++++++++++
 xen/arch/x86/flushtlb.c            |  9 ++++++
 xen/arch/x86/include/asm/current.h | 15 ++++++---
 xen/arch/x86/include/asm/fixmap.h  |  1 +
 xen/arch/x86/include/asm/pv/mm.h   |  8 +++++
 xen/arch/x86/mm.c                  | 47 +++++++++++++++++++++++++++
 xen/arch/x86/pv/domain.c           | 25 ++++++++++++--
 xen/arch/x86/pv/mm.c               | 52 ++++++++++++++++++++++++++++++
 xen/arch/x86/smpboot.c             | 20 +++++++++++-
 9 files changed, 207 insertions(+), 7 deletions(-)
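For illustration only, below is a minimal, self-contained C sketch (not Xen
code) of the ordering constraint described above: the per-CPU fixmap entry can
only be installed once the per-CPU slot is present in the active root page
table, so the shadow L4 (Xen slots only) must be loaded before the guest L4
can be mapped and its slots copied.  All names in the sketch
(switch_to_shadow_l4(), map_guest_l4_fixmap(), copy_guest_slots()) are
hypothetical stand-ins for switch_cr3_cr4(), percpu_set_fixmap() and
pv_update_shadow_l4() respectively.

/*
 * Standalone sketch of the shadow L4 switch ordering; builds and runs on
 * its own, it does not use any Xen APIs.
 */
#include <stdbool.h>
#include <stdio.h>

static bool shadow_l4_loaded;   /* per-CPU slot reachable iff true */
static bool guest_l4_mapped;    /* per-CPU fixmap entry populated */

static void switch_to_shadow_l4(void)
{
    /* Stand-in for loading the per-CPU shadow L4 (Xen slots only). */
    shadow_l4_loaded = true;
}

static bool map_guest_l4_fixmap(void)
{
    /*
     * Stand-in for mapping the guest L4 through the per-CPU fixmap: it can
     * only succeed once the shadow L4 (with the per-CPU slot) is active.
     */
    if ( !shadow_l4_loaded )
        return false;
    guest_l4_mapped = true;
    return true;
}

static void copy_guest_slots(void)
{
    /* Stand-in for copying the guest-controlled slots into the shadow L4. */
    if ( guest_l4_mapped )
        printf("guest slots copied into shadow L4\n");
}

int main(void)
{
    /* Wrong order: the mapping fails as the per-CPU slot isn't present. */
    if ( !map_guest_l4_fixmap() )
        printf("cannot map guest L4 before loading the shadow L4\n");

    /* Order used by the patch: shadow L4 first, then map and copy. */
    switch_to_shadow_l4();
    if ( map_guest_l4_fixmap() )
        copy_guest_slots();

    return 0;
}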