@@ -2531,7 +2531,7 @@ config MITIGATION_ADDRESS_SPACE_ISOLATION
The !PARAVIRT dependency is only because of lack of testing; in theory
the code is written to work under paravirtualization. In practice
there are likely to be unhandled cases, in particular concerning TLB
- flushes.
+ flushes and CR3 manipulation.
config ADDRESS_SPACE_ISOLATION_DEFAULT_ON
@@ -11,6 +11,16 @@
/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+/*
+ * CR3 access helpers (e.g. write_cr3()) will call asi_exit() to exit the
+ * restricted address space first. We cannot call the version defined in
+ * arch/x86/mm/asi.c here, so make sure we always call the no-op version in
+ * asm-generic/asi.h instead. This is harmless because asi_exit() would be a
+ * no-op this early in boot anyway. The alternative would be spamming the
+ * code with the *_raw() variants of the CR3 helpers.
+ */
+#undef CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION
+
#include "error.h"
#include "misc.h"
@@ -1,4 +1,15 @@
// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * CR3 access helpers (e.g. write_cr3()) will call asi_exit() to exit the
+ * restricted address space first. We cannot call the version defined in
+ * arch/x86/mm/asi.c here, so make sure we always call the no-op version in
+ * asm-generic/asi.h instead. This is harmless because asi_exit() would be a
+ * no-op this early in boot anyway. The alternative would be spamming the
+ * code with the *_raw() variants of the CR3 helpers.
+ */
+#undef CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION
+
#include "misc.h"
#include <asm/bootparam.h>
#include <asm/e820/types.h>
@@ -226,6 +226,11 @@ static __always_inline unsigned long read_cr3_pa(void)
return __read_cr3() & CR3_ADDR_MASK;
}
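+/* Like read_cr3_pa(), but reads CR3 as-is, without the implicit asi_exit(). */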
+static __always_inline unsigned long read_cr3_pa_raw(void)
+{
+ return __read_cr3_raw() & CR3_ADDR_MASK;
+}
+
static inline unsigned long native_read_cr3_pa(void)
{
return __native_read_cr3() & CR3_ADDR_MASK;
@@ -5,6 +5,7 @@
#ifdef __KERNEL__
#include <asm/nops.h>
#include <asm/processor-flags.h>
+#include <asm-generic/asi.h>
#include <linux/errno.h>
#include <linux/irqflags.h>
@@ -42,18 +43,32 @@ static __always_inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
}
-static __always_inline unsigned long __native_read_cr3(void)
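+/*
+ * The real implementation lives in arch/x86/mm/asi.c; declare it here for
+ * the ASI-aware CR3 helpers below.
+ */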
+void asi_exit(void);
+
+static __always_inline unsigned long __native_read_cr3_raw(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
return val;
}
-static __always_inline void native_write_cr3(unsigned long val)
+static __always_inline unsigned long __native_read_cr3(void)
+{
+ asi_exit();
+ return __native_read_cr3_raw();
+}
+
+static __always_inline void native_write_cr3_raw(unsigned long val)
{
asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
}
+static __always_inline void native_write_cr3(unsigned long val)
+{
+ asi_exit();
+ native_write_cr3_raw(val);
+}
+
static inline unsigned long native_read_cr4(void)
{
unsigned long val;
@@ -152,17 +167,39 @@ static __always_inline void write_cr2(unsigned long x)
/*
* Careful! CR3 contains more than just an address. You probably want
* read_cr3_pa() instead.
+ *
+ * The implementation interacts with ASI to ensure that the returned value is
+ * stable as long as preemption is disabled.
*/
static __always_inline unsigned long __read_cr3(void)
{
return __native_read_cr3();
}
+/*
+ * The return value of this is unstable under ASI, even with preemption off.
+ * Call __read_cr3 instead unless you have a good reason not to.
+ */
+static __always_inline unsigned long __read_cr3_raw(void)
+{
+ return __native_read_cr3_raw();
+}
+
+/* This interacts with ASI like __read_cr3. */
static __always_inline void write_cr3(unsigned long x)
{
native_write_cr3(x);
}
+/*
+ * Like __read_cr3_raw, this doesn't interact with ASI. It should rarely, if
+ * ever, need to be called from outside ASI-specific code.
+ */
+static __always_inline void write_cr3_raw(unsigned long x)
+{
+ native_write_cr3_raw(x);
+}
+
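+/*
+ * In short: the plain helpers above implicitly exit ASI, so they always
+ * observe/install the unrestricted CR3, which then stays stable while
+ * preemption is disabled. The _raw variants access CR3 as-is; under ASI the
+ * value they see may be the restricted CR3 and can change whenever ASI is
+ * entered or exited.
+ */
+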
static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
@@ -79,7 +79,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = __read_cr3();
+ cr3 = __read_cr3_raw();
cr4 = __read_cr4();
printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
log_lvl, cr0, cr2, cr3, cr4);
@@ -113,7 +113,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = __read_cr3();
+ cr3 = __read_cr3_raw();
cr4 = __read_cr4();
printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -3214,6 +3214,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
*/
vmcs_writel(GUEST_RFLAGS, 0);
+ /*
+ * Stabilize CR3 to ensure the VM Exit returns to the correct address
+	 * space. This is costly; we wouldn't do this in hot-path code.
+ */
+ asi_exit();
+
cr3 = __get_current_cr3_fast();
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
vmcs_writel(HOST_CR3, cr3);
@@ -4323,8 +4323,14 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
/*
* Save the most likely value for this task's CR3 in the VMCS.
* We can't use __get_current_cr3_fast() because we're not atomic.
+ *
+	 * Use __read_cr3_raw() to avoid exiting ASI if we are in the restricted
+ * address space. Preemption is enabled, so rescheduling could make us
+ * re-enter ASI anyway. It's okay to avoid exiting ASI here because
+ * vmx_vcpu_enter_exit() and nested_vmx_check_vmentry_hw() will
+ * explicitly enter or exit ASI and update CR3 in the VMCS if needed.
*/
- cr3 = __read_cr3();
+ cr3 = __read_cr3_raw();
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
vmx->loaded_vmcs->host_state.cr3 = cr3;
@@ -488,7 +488,7 @@ noinstr void __asi_enter(void)
pcid = asi_pcid(target, this_cpu_read(cpu_tlbstate.loaded_mm_asid));
asi_cr3 = build_cr3_pcid_noinstr(target->pgd, pcid, tlbstate_lam_cr3_mask(), false);
- write_cr3(asi_cr3);
+ write_cr3_raw(asi_cr3);
maybe_flush_data(target);
/*
@@ -559,7 +559,7 @@ noinstr void asi_exit(void)
/* Tainting first makes reentrancy easier to reason about. */
this_cpu_or(asi_taints, ASI_TAINT_KERNEL_DATA);
- write_cr3(unrestricted_cr3);
+ write_cr3_raw(unrestricted_cr3);
/*
* Must not update curr_asi until after CR3 write, otherwise a
* re-entrant call might not enter this branch. (This means we
@@ -295,7 +295,7 @@ static bool low_pfn(unsigned long pfn)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3_pa());
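+	/* Dump the page tables that are actually live, even under ASI. */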
+ pgd_t *base = __va(read_cr3_pa_raw());
pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d;
pud_t *pud;
@@ -351,7 +351,7 @@ static int bad_address(void *p)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3_pa());
+ pgd_t *base = __va(read_cr3_pa_raw());
pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d;
pud_t *pud;
@@ -519,7 +519,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
pgd_t *pgd;
pte_t *pte;
- pgd = __va(read_cr3_pa());
+ pgd = __va(read_cr3_pa_raw());
pgd += pgd_index(address);
pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);
@@ -1578,7 +1578,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
* be losing some stats here. However for now this keeps ASI
* page faults nice and fast.
*/
- pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
+ pgd = (pgd_t *)__va(read_cr3_pa_raw()) + pgd_index(address);
if (!user_mode(regs) && kernel_access_ok(error_code, address, pgd)) {
warn_if_bad_asi_pf(error_code, address);
return;
@@ -331,8 +331,14 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
* Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask writes.
+ *
+	 * The context switching code will explicitly exit ASI when needed, so
+	 * do not use write_cr3(), which has an implicit ASI exit. Calling
+	 * asi_exit() here, where loaded_mm == LOADED_MM_SWITCHING, would cause
+	 * the VM_BUG_ON() in asi_exit() to fire spuriously even though
+	 * loaded_mm is never actually accessed.
*/
- write_cr3(new_mm_cr3);
+ write_cr3_raw(new_mm_cr3);
}
void leave_mm(void)
@@ -559,11 +565,11 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
* without going through leave_mm() / switch_mm_irqs_off() or that
* does something like write_cr3(read_cr3_pa()).
*
- * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
+ * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3_raw()
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
+ if (WARN_ON_ONCE(__read_cr3_raw() != build_cr3(prev->pgd, prev_asid,
tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
@@ -1173,7 +1179,7 @@ noinstr unsigned long __get_current_cr3_fast(void)
*/
VM_WARN_ON_ONCE(asi && asi_in_critical_section());
- VM_BUG_ON(cr3 != __read_cr3());
+ VM_BUG_ON(cr3 != __read_cr3_raw());
return cr3;
}
EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
@@ -1373,7 +1379,7 @@ static inline bool cr3_matches_current_mm(void)
* find a current ASI domain.
*/
barrier();
- pgd_cr3 = __va(read_cr3_pa());
+ pgd_cr3 = __va(read_cr3_pa_raw());
return pgd_cr3 == current->mm->pgd || pgd_cr3 == pgd_asi;
}
@@ -379,7 +379,7 @@ void snp_dump_hva_rmpentry(unsigned long hva)
pgd_t *pgd;
pte_t *pte;
- pgd = __va(read_cr3_pa());
+ pgd = __va(read_cr3_pa_raw());
pgd += pgd_index(hva);
pte = lookup_address_in_pgd(pgd, hva, &level);
@@ -66,7 +66,7 @@ void efi_5level_switch(void)
bool have_la57 = native_read_cr4() & X86_CR4_LA57;
bool need_toggle = want_la57 ^ have_la57;
u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
- u64 *cr3 = (u64 *)__native_read_cr3();
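+	/* This runs early in boot, before ASI can be active; read CR3 directly. */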
+ u64 *cr3 = (u64 *)__native_read_cr3_raw();
u64 *new_cr3;
if (!la57_toggle || !need_toggle)
@@ -71,6 +71,7 @@ static inline pgd_t *asi_pgd(struct asi *asi) { return NULL; }
static inline void asi_handle_switch_mm(void) { }
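+/* Forward-declare to avoid a "declared inside parameter list" warning below. */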
+struct thread_struct;
static inline void asi_init_thread_state(struct thread_struct *thread) { }
static inline void asi_intr_enter(void) { }