@@ -140,19 +140,23 @@ DECLARE_PER_CPU_ALIGNED(struct asi *, curr_asi);
void asi_check_boottime_disable(void);
-void asi_init_mm_state(struct mm_struct *mm);
+int asi_init_mm_state(struct mm_struct *mm);
int asi_init_class(enum asi_class_id class_id, struct asi_taint_policy *taint_policy);
+void asi_init_userspace_class(void);
void asi_uninit_class(enum asi_class_id class_id);
const char *asi_class_name(enum asi_class_id class_id);
int asi_init(struct mm_struct *mm, enum asi_class_id class_id, struct asi **out_asi);
void asi_destroy(struct asi *asi);
+void asi_destroy_userspace(struct mm_struct *mm);
void asi_clone_user_pgtbl(struct mm_struct *mm, pgd_t *pgdp);
/* Enter an ASI domain (restricted address space) and begin the critical section. */
void asi_enter(struct asi *asi);
+void asi_enter_userspace(void);
+
/*
* Leave the "tense" state if we are in it, i.e. end the critical section. We
* will stay relaxed until the next asi_enter.
@@ -294,7 +298,7 @@ void asi_handle_switch_mm(void);
*/
static inline bool asi_maps_user_addr(enum asi_class_id class_id)
{
- return false;
+ return class_id == ASI_CLASS_USERSPACE;
}
#endif /* CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION */
@@ -25,6 +25,7 @@ const char *asi_class_names[] = {
#if IS_ENABLED(CONFIG_KVM)
[ASI_CLASS_KVM] = "KVM",
#endif
+ [ASI_CLASS_USERSPACE] = "userspace",
};
DEFINE_PER_CPU_ALIGNED(struct asi *, curr_asi);
@@ -67,6 +68,32 @@ int asi_init_class(enum asi_class_id class_id, struct asi_taint_policy *taint_po
}
EXPORT_SYMBOL_GPL(asi_init_class);
+void __init asi_init_userspace_class(void)
+{
+ static struct asi_taint_policy policy = {
+ /*
+ * Prevent going to userspace with sensitive data potentially
+ * left in side channels by code running in the unrestricted
+ * address space, or another MM. Note we don't check for guest
+ * data here. This reflects the assumption that the guest trusts
+ * its VMM (absent fancy HW features, which are orthogonal).
+ */
+ .protect_data = ASI_TAINT_KERNEL_DATA | ASI_TAINT_OTHER_MM_DATA,
+ /*
+ * Don't go into userspace with control flow state controlled by
+ * other processes, or any KVM guest the process is running.
+ * Note this bit is about protecting userspace from other parts
+ * of the system, while data_taints is about protecting other
+ * parts of the system from the guest.
+ */
+ .prevent_control = ASI_TAINT_GUEST_CONTROL | ASI_TAINT_OTHER_MM_CONTROL,
+ .set = ASI_TAINT_USER_CONTROL | ASI_TAINT_USER_DATA,
+ };
+ int err = asi_init_class(ASI_CLASS_USERSPACE, &policy);
+
+ WARN_ON(err);
+}
+
void asi_uninit_class(enum asi_class_id class_id)
{
if (!boot_cpu_has(X86_FEATURE_ASI))
@@ -385,7 +412,8 @@ int asi_init(struct mm_struct *mm, enum asi_class_id class_id, struct asi **out_
int err = 0;
uint i;
- *out_asi = NULL;
+ if (out_asi)
+ *out_asi = NULL;
if (!boot_cpu_has(X86_FEATURE_ASI))
return 0;
@@ -424,7 +452,7 @@ int asi_init(struct mm_struct *mm, enum asi_class_id class_id, struct asi **out_
exit_unlock:
if (err)
__asi_destroy(asi);
- else
+ else if (out_asi)
*out_asi = asi;
__asi_init_user_pgds(mm, asi);
@@ -515,6 +543,12 @@ static __always_inline void maybe_flush_data(struct asi *next_asi)
this_cpu_and(asi_taints, ~ASI_TAINTS_DATA_MASK);
}
+void asi_destroy_userspace(struct mm_struct *mm)
+{
+ VM_BUG_ON(!asi_class_initialized(ASI_CLASS_USERSPACE));
+ asi_destroy(&mm->asi[ASI_CLASS_USERSPACE]);
+}
+
noinstr void __asi_enter(void)
{
u64 asi_cr3;
@@ -584,6 +618,11 @@ noinstr void asi_enter(struct asi *asi)
}
EXPORT_SYMBOL_GPL(asi_enter);
+noinstr void asi_enter_userspace(void)
+{
+ asi_enter(&current->mm->asi[ASI_CLASS_USERSPACE]);
+}
+
noinstr void asi_relax(void)
{
if (static_asi_enabled()) {
@@ -633,13 +672,15 @@ noinstr void asi_exit(void)
}
EXPORT_SYMBOL_GPL(asi_exit);
-void asi_init_mm_state(struct mm_struct *mm)
+int asi_init_mm_state(struct mm_struct *mm)
{
if (!boot_cpu_has(X86_FEATURE_ASI))
- return;
+ return 0;
memset(mm->asi, 0, sizeof(mm->asi));
mutex_init(&mm->asi_init_lock);
+
+ return asi_init(mm, ASI_CLASS_USERSPACE, NULL);
}
void asi_handle_switch_mm(void)
@@ -15,6 +15,7 @@ enum asi_class_id {
#if IS_ENABLED(CONFIG_KVM)
ASI_CLASS_KVM,
#endif
+ ASI_CLASS_USERSPACE,
ASI_MAX_NUM_CLASSES,
};
static_assert(order_base_2(X86_CR3_ASI_PCID_BITS) <= ASI_MAX_NUM_CLASSES);
@@ -37,8 +38,10 @@ int asi_init_class(enum asi_class_id class_id,
static inline void asi_uninit_class(enum asi_class_id class_id) { }
+static inline void asi_init_userspace_class(void) { }
+
struct mm_struct;
-static inline void asi_init_mm_state(struct mm_struct *mm) { }
+static inline int asi_init_mm_state(struct mm_struct *mm) { return 0; }
static inline int asi_init(struct mm_struct *mm, enum asi_class_id class_id,
struct asi **out_asi)
@@ -48,8 +51,12 @@ static inline int asi_init(struct mm_struct *mm, enum asi_class_id class_id,
static inline void asi_destroy(struct asi *asi) { }
+static inline void asi_destroy_userspace(struct mm_struct *mm) { }
+
static inline void asi_enter(struct asi *asi) { }
+static inline void asi_enter_userspace(void) { }
+
static inline void asi_relax(void) { }
static inline bool asi_is_relaxed(void) { return true; }
@@ -191,6 +191,16 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
{
long ret;
+ /*
+ * End the ASI critical section for userspace. Syscalls are the only
+ * place this happens - all other entries from userspace are handled via
+ * ASI's interrupt-tracking. Syscalls are special because they are where
+ * it's possible to switch to another ASI domain within the same task
+ * (i.e. KVM_RUN), so an asi_relax() is required here in case of an
+ * upcoming asi_enter().
+ */
+ asi_relax();
+
enter_from_user_mode(regs);
instrumentation_begin();
@@ -953,6 +953,8 @@ void start_kernel(void)
/* Architectural and non-timekeeping rng init, before allocator init */
random_init_early(command_line);
+ asi_init_userspace_class();
+
/*
* These use large bootmem allocations and must precede
* initalization of page allocator
@@ -218,6 +218,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
__syscall_exit_to_user_mode_work(regs);
instrumentation_end();
exit_to_user_mode();
+ asi_enter_userspace();
}
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
@@ -917,6 +917,7 @@ void __mmdrop(struct mm_struct *mm)
/* Ensure no CPUs are using this as their lazy tlb mm */
cleanup_lazy_tlbs(mm);
+ asi_destroy_userspace(mm);
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
@@ -1297,7 +1298,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_pgd(mm))
goto fail_nopgd;
- asi_init_mm_state(mm);
+ if (asi_init_mm_state(mm))
+ goto fail_nocontext;
if (init_new_context(p, mm))
goto fail_nocontext;
Now userspace gets a restricted address space too. The critical section
begins on exit to userspace and ends when the task makes a system call.
Other entries from userspace just interrupt the critical section via
asi_intr_enter().

The reason why system calls have to actually asi_relax() (i.e. fully
terminate the critical section instead of just interrupting it) is that
system calls are the one type of kernel entry that can lead to a
transition into a _different_ ASI domain, namely the KVM one: it is not
supported to transition into a different domain while a critical section
exists (i.e. while asi_state.target is not NULL), even if it has been
paused by asi_intr_enter() (i.e. even if asi_state.intr_nest_depth is
nonzero) - there must be an asi_relax() between any two asi_enter()s.

The restricted address space for bare-metal tasks naturally contains the
entire userspace address region, although the task's own memory is still
missing from the direct map.

This implementation creates new userspace-specific APIs for asi_init(),
asi_destroy() and asi_enter(), which seems a little ugly; maybe this
suggests a general rework of these APIs given that the "generic" version
only has one caller. For RFC code this seems good enough though.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
 arch/x86/include/asm/asi.h   |  8 ++++++--
 arch/x86/mm/asi.c            | 49 ++++++++++++++++++++++++++++++++++++++++----
 include/asm-generic/asi.h    |  9 +++++++-
 include/linux/entry-common.h | 10 ++++++++++
 init/main.c                  |  2 ++
 kernel/entry/common.c        |  1 +
 kernel/fork.c                |  4 +++-
 7 files changed, 75 insertions(+), 8 deletions(-)
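
Reviewer note (not part of the patch): a minimal userspace sketch of the
critical-section rule described above - an interrupt only pauses the
section via asi_intr_enter(), but there must be an asi_relax() between
any two asi_enter()s. The names mirror the patch; the bodies are toy
stand-ins, assuming nothing beyond what the commit message states.

  #include <assert.h>
  #include <stddef.h>
  #include <stdio.h>

  struct asi { const char *name; };

  static struct {
      struct asi *target;     /* non-NULL: a critical section exists */
      int intr_nest_depth;    /* nonzero: that section is paused */
  } asi_state;

  static void asi_enter(struct asi *asi)
  {
      /* The unsupported case: entering a second domain while a critical
       * section for another one still exists, even a paused one. */
      assert(asi_state.target == NULL);
      asi_state.target = asi;
      printf("entered %s domain\n", asi->name);
  }

  static void asi_relax(void)
  {
      asi_state.target = NULL;
  }

  static void asi_intr_enter(void) { asi_state.intr_nest_depth++; }
  static void asi_intr_exit(void)  { asi_state.intr_nest_depth--; }

  int main(void)
  {
      struct asi user = { .name = "userspace" };
      struct asi kvm = { .name = "KVM" };

      asi_enter(&user);   /* exit to userspace */
      asi_intr_enter();   /* an IRQ merely pauses the section */
      asi_intr_exit();
      asi_relax();        /* syscall entry: section fully ends... */
      asi_enter(&kvm);    /* ...so KVM_RUN may enter its own domain */
      asi_relax();
      return 0;
  }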
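
Similarly, a toy model of how a taint policy like the one in
asi_init_userspace_class() gates transitions. The bit values and flush
actions are illustrative assumptions, not the series' actual
definitions; the one behavior taken from the patch is that taints are
cleared once the corresponding flush has been done (cf. the clearing of
ASI_TAINTS_DATA_MASK in maybe_flush_data()).

  #include <stdint.h>
  #include <stdio.h>

  /* Illustrative bit assignments - not the kernel's actual values. */
  #define ASI_TAINT_KERNEL_DATA      (1u << 0)
  #define ASI_TAINT_OTHER_MM_DATA    (1u << 1)
  #define ASI_TAINT_USER_DATA        (1u << 2)
  #define ASI_TAINT_GUEST_CONTROL    (1u << 3)
  #define ASI_TAINT_OTHER_MM_CONTROL (1u << 4)
  #define ASI_TAINT_USER_CONTROL     (1u << 5)

  struct asi_taint_policy {
      uint32_t protect_data;    /* flush data state if any are set */
      uint32_t prevent_control; /* flush predictors if any are set */
      uint32_t set;             /* what this domain leaves behind */
  };

  static uint32_t cpu_taints;   /* stand-in for the per-CPU asi_taints */

  static void enter_domain(const struct asi_taint_policy *p)
  {
      if (cpu_taints & p->protect_data) {
          puts("flush data state (e.g. L1D)");
          cpu_taints &= ~p->protect_data;
      }
      if (cpu_taints & p->prevent_control) {
          puts("flush control-flow state (e.g. IBPB)");
          cpu_taints &= ~p->prevent_control;
      }
      /* Simplification: record this domain's own taints at entry. */
      cpu_taints |= p->set;
  }

  int main(void)
  {
      const struct asi_taint_policy userspace = {
          .protect_data    = ASI_TAINT_KERNEL_DATA | ASI_TAINT_OTHER_MM_DATA,
          .prevent_control = ASI_TAINT_GUEST_CONTROL | ASI_TAINT_OTHER_MM_CONTROL,
          .set             = ASI_TAINT_USER_CONTROL | ASI_TAINT_USER_DATA,
      };

      cpu_taints = ASI_TAINT_KERNEL_DATA; /* kernel ran since last flush */
      enter_domain(&userspace);           /* -> flushes data state */
      return 0;
  }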