@@ -929,6 +929,18 @@ int arch_domain_create(struct domain *d,
d->arch.msr_relaxed = config->arch.misc_flags & XEN_X86_MSR_RELAXED;
+ if ( !d->arch.asi && (opt_asi_hvm || opt_asi_pv) )
+ {
+ /*
+ * This domain is not using ASI, but other domains on the system
+ * might be, hence the CPU stacks are mapped in the per-CPU page-table
+ * region. Add an L3 entry that maps all the stacks.
+ */
+ rc = map_all_stacks(d);
+ if ( rc )
+ goto fail;
+ }
+
return 0;
fail:
@@ -24,6 +24,11 @@
* 0 - IST Shadow Stacks (4x 1k, read-only)
*/
+static inline bool is_shstk_slot(unsigned int i)
+{
+ return (i == 0 || i == PRIMARY_SHSTK_SLOT);
+}
+
/*
* Identify which stack page the stack pointer is on. Returns an index
* as per the comment above.
@@ -120,6 +120,11 @@ extern void __set_fixmap_x(
/* per-CPU fixmap area. */
enum percpu_fixed_addresses {
+ /* For alignment reasons the per-CPU stacks must come first. */
+ PCPU_STACK_START,
+ PCPU_STACK_END = PCPU_STACK_START + NR_CPUS * (1U << STACK_ORDER) - 1,
+#define PERCPU_STACK_IDX(c) (PCPU_STACK_START + (c) * (1U << STACK_ORDER))
+#define PERCPU_STACK_ADDR(c) percpu_fix_to_virt(PERCPU_STACK_IDX(c))
PCPU_FIX_PV_L4SHADOW,
__end_of_percpu_fixed_addresses
};
@@ -521,7 +521,7 @@ extern struct rangeset *mmio_ro_ranges;
#define compat_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define compat_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-void memguard_guard_stack(void *p);
+void memguard_guard_stack(void *p, unsigned int cpu);
void memguard_unguard_stack(void *p);
struct mmio_ro_emulate_ctxt {
@@ -652,4 +652,8 @@ static inline int destroy_xen_mappings_cpu(unsigned long s, unsigned long e,
return modify_xen_mappings_cpu(s, e, _PAGE_NONE, cpu);
}
+/* Set up a per-domain slot that maps all pCPU stacks. */
+int map_all_stacks(struct domain *d);
+int add_stack(const void *stack, unsigned int cpu);
+
#endif /* __ASM_X86_MM_H__ */
@@ -79,6 +79,18 @@ extern bool unaccounted_cpus;
void *cpu_alloc_stack(unsigned int cpu);
+/*
+ * Set up the per-CPU area stack mappings.
+ *
+ * @dest_cpu: CPU where the mappings are to appear.
+ * @stack_cpu: CPU whose stacks should be mapped.
+ */
+void cpu_set_stack_mappings(unsigned int dest_cpu, unsigned int stack_cpu);
+
+#define HAS_ARCH_SMP_CALLFUNC
+void arch_smp_pre_callfunc(unsigned int cpu);
+void arch_smp_post_callfunc(unsigned int cpu);
+
#endif /* !__ASSEMBLY__ */
#endif
@@ -87,6 +87,7 @@
* doing the final put_page(), and remove it from the iommu if so.
*/
+#include <xen/cpu.h>
#include <xen/init.h>
#include <xen/ioreq.h>
#include <xen/kernel.h>
@@ -6352,31 +6353,40 @@ void free_perdomain_mappings(struct domain *d)
d->arch.perdomain_l3_pg = NULL;
}
-static void write_sss_token(unsigned long *ptr)
+static void write_sss_token(unsigned long *ptr, unsigned long va)
{
/*
* A supervisor shadow stack token is its own linear address, with the
* busy bit (0) clear.
*/
- *ptr = (unsigned long)ptr;
+ *ptr = va;
}
-void memguard_guard_stack(void *p)
+void memguard_guard_stack(void *p, unsigned int cpu)
{
+ unsigned long va =
+ (opt_asi_hvm || opt_asi_pv) ? (unsigned long)PERCPU_STACK_ADDR(cpu)
+ : (unsigned long)p;
+
/* IST Shadow stacks. 4x 1k in stack page 0. */
if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
{
- write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8);
- write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8);
- write_sss_token(p + (IST_DB * IST_SHSTK_SIZE) - 8);
- write_sss_token(p + (IST_DF * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8,
+ va + (IST_MCE * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8,
+ va + (IST_NMI * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_DB * IST_SHSTK_SIZE) - 8,
+ va + (IST_DB * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_DF * IST_SHSTK_SIZE) - 8,
+ va + (IST_DF * IST_SHSTK_SIZE) - 8);
}
map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
/* Primary Shadow Stack. 1x 4k in stack page 5. */
p += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
+ va += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
- write_sss_token(p + PAGE_SIZE - 8);
+ write_sss_token(p + PAGE_SIZE - 8, va + PAGE_SIZE - 8);
map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
}
@@ -6567,6 +6577,105 @@ void setup_perdomain_slot(const struct vcpu *v, root_pgentry_t *root_pgt)
root_pgt[root_table_offset(PERDOMAIN_VIRT_START)]);
}
+static struct page_info *l2_all_stacks;
+
+int add_stack(const void *stack, unsigned int cpu)
+{
+ unsigned long va = (unsigned long)PERCPU_STACK_ADDR(cpu);
+ struct page_info *pg;
+ l2_pgentry_t *l2tab = NULL;
+ l1_pgentry_t *l1tab = NULL;
+ unsigned int nr;
+ int rc = 0;
+
+ /*
+ * Assume CPU stack allocation is always serialized, either because it's
+ * done on the BSP during boot, or in case of hotplug, in stop machine
+ * context.
+ */
+ ASSERT(system_state < SYS_STATE_active || cpu_in_hotplug_context());
+
+ if ( !opt_asi_hvm && !opt_asi_pv )
+ return 0;
+
+ if ( !l2_all_stacks )
+ {
+ l2_all_stacks = alloc_domheap_page(NULL, MEMF_no_owner);
+ if ( !l2_all_stacks )
+ return -ENOMEM;
+ l2tab = __map_domain_page(l2_all_stacks);
+ clear_page(l2tab);
+ }
+ else
+ l2tab = __map_domain_page(l2_all_stacks);
+
+ /* Code assumes all the stacks can be mapped with a single L2. */
+ ASSERT(l3_table_offset((unsigned long)percpu_fix_to_virt(PCPU_STACK_END)) ==
+ l3_table_offset((unsigned long)percpu_fix_to_virt(PCPU_STACK_START)));
+ for ( nr = 0; nr < (1U << STACK_ORDER); nr++ )
+ {
+ l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
+
+ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
+ {
+ pg = alloc_domheap_page(NULL, MEMF_no_owner);
+ if ( !pg )
+ {
+ rc = -ENOMEM;
+ break;
+ }
+ l1tab = __map_domain_page(pg);
+ clear_page(l1tab);
+ l2e_write(pl2e, l2e_from_page(pg, __PAGE_HYPERVISOR_RW));
+ }
+ else if ( !l1tab )
+ l1tab = map_l1t_from_l2e(*pl2e);
+
+ l1e_write(&l1tab[l1_table_offset(va)],
+ l1e_from_mfn(virt_to_mfn(stack),
+ is_shstk_slot(nr) ? __PAGE_HYPERVISOR_SHSTK
+ : __PAGE_HYPERVISOR_RW));
+
+ va += PAGE_SIZE;
+ stack += PAGE_SIZE;
+
+ if ( !l1_table_offset(va) )
+ {
+ unmap_domain_page(l1tab);
+ l1tab = NULL;
+ }
+ }
+
+ unmap_domain_page(l1tab);
+ unmap_domain_page(l2tab);
+ /*
+ * Don't bother freeing the intermediate page-tables on failure: they can
+ * still be used to map other stacks.
+ */
+
+ return rc;
+}
+
+int map_all_stacks(struct domain *d)
+{
+ /*
+ * Create the per-domain L3. PERDOMAIN_VIRT_START is passed as a dummy
+ * address: with nr == 0 only the per-domain L3 itself is allocated.
+ */
+ int rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
+ l3_pgentry_t *l3tab;
+
+ if ( rc )
+ return rc;
+
+ l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
+ l3tab[l3_table_offset((unsigned long)percpu_fix_to_virt(PCPU_STACK_START))]
+ = l3e_from_page(l2_all_stacks, __PAGE_HYPERVISOR_RW);
+ unmap_domain_page(l3tab);
+
+ return 0;
+}
+
static void __init __maybe_unused build_assertions(void)
{
/*
@@ -808,8 +808,6 @@ static void __init noreturn reinit_bsp_stack(void)
/* Update SYSCALL trampolines */
percpu_traps_init();
- stack_base[0] = stack;
-
rc = setup_cpu_root_pgt(0);
if ( rc )
panic("Error %d setting up PV root page table\n", rc);
@@ -1771,10 +1769,6 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
system_state = SYS_STATE_boot;
- bsp_stack = cpu_alloc_stack(0);
- if ( !bsp_stack )
- panic("No memory for BSP stack\n");
-
console_init_ring();
vesa_init();
@@ -1961,6 +1955,16 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
alternative_branches();
+ /*
+ * Allocate the BSP stack closer to the point where the AP ones also get
+ * allocated, and after the speculation mitigations have been initialized:
+ * in order to set up the shadow stack token correctly Xen needs to know
+ * whether per-CPU mapped stacks are being used.
+ */
+ bsp_stack = cpu_alloc_stack(0);
+ if ( !bsp_stack )
+ panic("No memory for BSP stack\n");
+
/*
* Setup the local per-domain L3 for the BSP also, so it matches the state
* of the APs.
@@ -2065,8 +2069,17 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
info->last_spec_ctrl = default_xen_spec_ctrl;
}
+ stack_base[0] = bsp_stack;
+
/* Copy the cpu info block, and move onto the BSP stack. */
- bsp_info = get_cpu_info_from_stack((unsigned long)bsp_stack);
+ if ( opt_asi_hvm || opt_asi_pv )
+ {
+ cpu_set_stack_mappings(0, 0);
+ bsp_info = get_cpu_info_from_stack((unsigned long)PERCPU_STACK_ADDR(0));
+ }
+ else
+ bsp_info = get_cpu_info_from_stack((unsigned long)bsp_stack);
+
*bsp_info = *info;
asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
@@ -22,6 +22,7 @@
#include <asm/hardirq.h>
#include <asm/hpet.h>
#include <asm/setup.h>
+#include <asm/spec_ctrl.h>
#include <irq_vectors.h>
#include <mach_apic.h>
@@ -433,3 +434,31 @@ long cf_check cpu_down_helper(void *data)
ret = cpu_down(cpu);
return ret;
}
+
+void arch_smp_pre_callfunc(unsigned int cpu)
+{
+ if ( (!opt_asi_pv && !opt_asi_hvm) || cpu == smp_processor_id() ||
+ (!current->domain->arch.asi && !is_idle_vcpu(current)) ||
+ /*
+ * CPU#0 still runs on the .init stack when the APs are started, don't
+ * attempt to map such stack.
+ */
+ (!cpu && system_state < SYS_STATE_active) )
+ return;
+
+ cpu_set_stack_mappings(smp_processor_id(), cpu);
+}
+
+void arch_smp_post_callfunc(unsigned int cpu)
+{
+ unsigned int i;
+
+ if ( (!opt_asi_pv && !opt_asi_hvm) || cpu == smp_processor_id() ||
+ (!current->domain->arch.asi && !is_idle_vcpu(current)) )
+ return;
+
+ for ( i = 0; i < (1U << STACK_ORDER); i++ )
+ percpu_clear_fixmap(PERCPU_STACK_IDX(cpu) + i);
+
+ flush_area_local(PERCPU_STACK_ADDR(cpu), FLUSH_ORDER(STACK_ORDER));
+}
@@ -579,7 +579,20 @@ static int do_boot_cpu(int apicid, int cpu)
printk("Booting processor %d/%d eip %lx\n",
cpu, apicid, start_eip);
- stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info);
+ if ( opt_asi_hvm || opt_asi_pv )
+ {
+ /*
+ * Uniformly run with the stack mapping of the per-CPU area (including
+ * the idle vCPU) if ASI is enabled for any domain type.
+ */
+ cpu_set_stack_mappings(cpu, cpu);
+
+ ASSERT(IS_ALIGNED((unsigned long)PERCPU_STACK_ADDR(cpu), STACK_SIZE));
+
+ stack_start = PERCPU_STACK_ADDR(cpu) + STACK_SIZE - sizeof(struct cpu_info);
+ }
+ else
+ stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info);
/*
* If per-CPU idle root page table has been allocated, switch to it as
@@ -1053,11 +1066,41 @@ void *cpu_alloc_stack(unsigned int cpu)
stack = alloc_xenheap_pages(STACK_ORDER, memflags);
if ( stack )
- memguard_guard_stack(stack);
+ {
+ int rc = add_stack(stack, cpu);
+
+ if ( rc )
+ {
+ printk(XENLOG_ERR "unable to map stack for CPU %u: %d\n", cpu, rc);
+ free_xenheap_pages(stack, STACK_ORDER);
+ return NULL;
+ }
+ memguard_guard_stack(stack, cpu);
+ }
return stack;
}
+void cpu_set_stack_mappings(unsigned int dest_cpu, unsigned int stack_cpu)
+{
+ unsigned int i;
+
+ for ( i = 0; i < (1U << STACK_ORDER); i++ )
+ {
+ unsigned int flags = (is_shstk_slot(i) ? __PAGE_HYPERVISOR_SHSTK
+ : __PAGE_HYPERVISOR_RW) |
+ (dest_cpu == stack_cpu ? _PAGE_GLOBAL : 0);
+
+ if ( is_shstk_slot(i) && dest_cpu != stack_cpu )
+ continue;
+
+ percpu_set_fixmap_remote(dest_cpu, PERCPU_STACK_IDX(stack_cpu) + i,
+ _mfn(virt_to_mfn(stack_base[stack_cpu] +
+ i * PAGE_SIZE)),
+ flags);
+ }
+}
+
static int cpu_smpboot_alloc(unsigned int cpu)
{
struct cpu_info *info;
@@ -609,10 +609,12 @@ void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs)
unsigned long esp = regs->rsp;
unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1);
unsigned long esp_top, esp_bottom;
+ const void *stack = (opt_asi_hvm || opt_asi_pv) ? PERCPU_STACK_ADDR(cpu)
+ : stack_base[cpu];
- if ( _p(curr_stack_base) != stack_base[cpu] )
+ if ( _p(curr_stack_base) != stack )
printk("Current stack base %p differs from expected %p\n",
- _p(curr_stack_base), stack_base[cpu]);
+ _p(curr_stack_base), stack);
esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
esp_top = esp_bottom - PRIMARY_STACK_SIZE;
@@ -32,6 +32,7 @@ void efi_rs_leave(struct efi_rs_state *state);
#ifndef CONFIG_ARM
# include <asm/i387.h>
+# include <asm/spec_ctrl.h>
# include <asm/xstate.h>
# include <public/platform.h>
#endif
@@ -85,6 +86,7 @@ struct efi_rs_state efi_rs_enter(void)
static const u16 fcw = FCW_DEFAULT;
static const u32 mxcsr = MXCSR_DEFAULT;
struct efi_rs_state state = { .cr3 = 0 };
+ root_pgentry_t *efi_pgt, *idle_pgt;
if ( mfn_eq(efi_l4_mfn, INVALID_MFN) )
return state;
@@ -98,6 +100,16 @@ struct efi_rs_state efi_rs_enter(void)
efi_rs_on_cpu = smp_processor_id();
+ if ( opt_asi_pv || opt_asi_hvm )
+ {
+ /* Insert the idle per-domain slot for the stack mapping. */
+ efi_pgt = map_domain_page(efi_l4_mfn);
+ idle_pgt = maddr_to_virt(idle_vcpu[efi_rs_on_cpu]->arch.cr3);
+ efi_pgt[root_table_offset(PERDOMAIN_VIRT_START)].l4 =
+ idle_pgt[root_table_offset(PERDOMAIN_VIRT_START)].l4;
+ unmap_domain_page(efi_pgt);
+ }
+
/* prevent fixup_page_fault() from doing anything */
irq_enter();
@@ -29,6 +29,7 @@ static struct call_data_struct {
void (*func) (void *info);
void *info;
int wait;
+ unsigned int caller;
cpumask_t selected;
} call_data;
@@ -63,6 +64,7 @@ void on_selected_cpus(
call_data.func = func;
call_data.info = info;
call_data.wait = wait;
+ call_data.caller = smp_processor_id();
smp_send_call_function_mask(&call_data.selected);
@@ -82,6 +84,12 @@ void smp_call_function_interrupt(void)
if ( !cpumask_test_cpu(cpu, &call_data.selected) )
return;
+ /*
+ * TODO: use bounce buffers to pass callfunc data, so that when using ASI
+ * there's no need to map remote CPU stacks.
+ */
+ arch_smp_pre_callfunc(call_data.caller);
+
irq_enter();
if ( unlikely(!func) )
@@ -102,6 +110,8 @@ void smp_call_function_interrupt(void)
}
irq_exit();
+
+ arch_smp_post_callfunc(call_data.caller);
}
/*
@@ -76,4 +76,9 @@ extern void *stack_base[NR_CPUS];
void initialize_cpu_data(unsigned int cpu);
int setup_cpu_root_pgt(unsigned int cpu);
+#ifndef HAS_ARCH_SMP_CALLFUNC
+static inline void arch_smp_pre_callfunc(unsigned int cpu) {}
+static inline void arch_smp_post_callfunc(unsigned int cpu) {}
+#endif
+
#endif /* __XEN_SMP_H__ */
When using ASI the CPU stack is mapped using a range of fixmap entries in
the per-CPU region.  This ensures the stack is only accessible by the
current CPU.

Note, however, that further work is required in order to allocate the
stack from the domheap instead of the xenheap, and to ensure the stack is
not part of the direct map.

For domains not running with ASI enabled all the CPU stacks are mapped in
the per-domain L3, so that the stack is always at the same linear address,
regardless of whether ASI is enabled or not for the domain.

When calling UEFI runtime methods the current per-domain slot needs to be
added to the EFI L4, so that the stack is available in UEFI.

Finally, some users of callfunc IPIs pass parameters from the stack, so
when handling a callfunc IPI the stack of the caller CPU is mapped into
the address space of the CPU handling the IPI.  This needs further work
to use a bounce buffer in order to avoid having to map remote CPU stacks.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
There's also further work required in order to avoid mapping remote CPU
stacks when handling callfunc IPIs.
---
 xen/arch/x86/domain.c              |  12 +++
 xen/arch/x86/include/asm/current.h |   5 ++
 xen/arch/x86/include/asm/fixmap.h  |   5 ++
 xen/arch/x86/include/asm/mm.h      |   6 +-
 xen/arch/x86/include/asm/smp.h     |  12 +++
 xen/arch/x86/mm.c                  | 125 +++++++++++++++++++++++++++--
 xen/arch/x86/setup.c               |  27 +++++--
 xen/arch/x86/smp.c                 |  29 +++++++
 xen/arch/x86/smpboot.c             |  47 ++++++++++-
 xen/arch/x86/traps.c               |   6 +-
 xen/common/efi/runtime.c           |  12 +++
 xen/common/smp.c                   |  10 +++
 xen/include/xen/smp.h              |   5 ++
 13 files changed, 281 insertions(+), 20 deletions(-)
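
As an aside for reviewers, the standalone sketch below (not part of the
patch) illustrates the fixmap arithmetic the series relies on: how
PERCPU_STACK_IDX() and PERCPU_STACK_ADDR() resolve each CPU's stack to a
contiguous, STACK_SIZE-aligned window in the per-CPU region, which is what
the ASSERT() in do_boot_cpu() checks.  PERCPU_FIXMAP_BASE, the NR_CPUS and
STACK_ORDER values, and the upward-growing percpu_fix_to_virt() stub are
placeholder assumptions rather than Xen's real definitions; only the
index/address arithmetic mirrors the patch.

/* Standalone illustration of the per-CPU stack fixmap index math. */
#include <stdio.h>

#define PAGE_SHIFT          12
#define PAGE_SIZE           (1UL << PAGE_SHIFT)
#define STACK_ORDER         3              /* placeholder: 8 pages per stack */
#define STACK_SIZE          (PAGE_SIZE << STACK_ORDER)
#define NR_CPUS             4              /* small value, for the demo only */

#define PCPU_STACK_START    0
#define PERCPU_STACK_IDX(c) (PCPU_STACK_START + (c) * (1U << STACK_ORDER))

/* Hypothetical, suitably aligned base of the per-CPU fixmap region. */
#define PERCPU_FIXMAP_BASE  0xffff82d000000000UL

static unsigned long percpu_fix_to_virt(unsigned int idx)
{
    /* Model the per-CPU fixmap as growing upwards, one page per index. */
    return PERCPU_FIXMAP_BASE + (unsigned long)idx * PAGE_SIZE;
}

int main(void)
{
    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
    {
        unsigned long va = percpu_fix_to_virt(PERCPU_STACK_IDX(cpu));

        /* Each CPU gets its own contiguous, STACK_SIZE-aligned window. */
        printf("CPU%u stack: %#lx-%#lx aligned=%s\n",
               cpu, va, va + STACK_SIZE - 1,
               (va & (STACK_SIZE - 1)) ? "no" : "yes");
    }

    return 0;
}

The same arithmetic is what lets cpu_set_stack_mappings() install page i of
stack_base[stack_cpu] at fixmap index PERCPU_STACK_IDX(stack_cpu) + i and
have the pages appear contiguously at PERCPU_STACK_ADDR(stack_cpu).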