[RFC,v2,21/29] KVM: x86: asi: Restricted address space for VM execution

Message ID: 20250110-asi-rfc-v2-v2-21-8419288bc805@google.com
State: New
Series: Address Space Isolation (ASI)

Commit Message

Brendan Jackman Jan. 10, 2025, 6:40 p.m. UTC
An ASI restricted address space is added for KVM. This protects userspace
from attack by the guest, and the guest from attack by other processes. It
doesn't attempt to protect the guest from attack by the current process.

This change incorporates an extra asi_exit() call at the end of vcpu_run().
We expect later iterations of ASI to drop that call as we gain the ability
to context switch within the ASI domain.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++
 arch/x86/kvm/svm/svm.c          |  2 ++
 arch/x86/kvm/vmx/vmx.c          | 38 ++++++++++++--------
 arch/x86/kvm/x86.c              | 77 ++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 105 insertions(+), 15 deletions(-)
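At a glance, the flow the commit message describes reduces to the sketch
below. This is an illustrative summary only, not part of the patch:
asi_enter(), asi_relax(), asi_exit() and kvm->arch.asi come from this
series, while sketch_run_guest() and sketch_vcpu_ioctl_run() are
hypothetical stand-ins for the vendor vcpu_enter_exit paths and
kvm_arch_vcpu_ioctl_run() touched in the diff.

static noinstr void sketch_run_guest(struct kvm_vcpu *vcpu)
{
	/* Switch to this VM's restricted address space. */
	asi_enter(vcpu->kvm->arch.asi);

	/* VMLAUNCH/VMRESUME (VMX) or VMRUN (SVM) happens here. */

	/*
	 * Leave the ASI critical section; the actual return to the
	 * unrestricted address space can then happen lazily.
	 */
	asi_relax();
}

static int sketch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
	int r = vcpu_run(vcpu);

	/*
	 * ASI can't yet transition straight from the restricted address
	 * space to the user address space, so force a full exit before
	 * returning to the VMM.
	 */
	asi_exit();
	return r;
}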

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d9f763a7bb9d5db422ea5625b2c28420bd14f26..00cda452dd6ca6ec57ff85ca194ee4aeb6af3be7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -37,6 +37,7 @@ 
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/hyperv-tlfs.h>
 #include <asm/reboot.h>
+#include <asm/asi.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
@@ -1535,6 +1536,8 @@  struct kvm_arch {
 	 */
 #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
 	struct kvm_mmu_memory_cache split_desc_cache;
+
+	struct asi *asi;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9df3e1e5ae81a1346409632edd693cb7e0740f72..f2c3154292b4f6c960b490b0773f53bea43897bb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4186,6 +4186,7 @@  static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 	guest_state_enter_irqoff();
 
 	amd_clear_divider();
+	asi_enter(vcpu->kvm->arch.asi);
 
 	if (sev_es_guest(vcpu->kvm))
 		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
@@ -4193,6 +4194,7 @@  static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 	else
 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
 
+	asi_relax();
 	guest_state_exit_irqoff();
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d28618e9277ede83ad2edc1b1778ea44123aa797..181d230b1c057fed33f7b29b7b0e378dbdfeb174 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -49,6 +49,7 @@ 
 #include <asm/mwait.h>
 #include <asm/spec-ctrl.h>
 #include <asm/vmx.h>
+#include <asm/asi.h>
 
 #include <trace/events/ipi.h>
 
@@ -7282,14 +7283,34 @@  static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 					unsigned int flags)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long cr3;
 
 	guest_state_enter_irqoff();
+	asi_enter(vcpu->kvm->arch.asi);
+
+	/*
+	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+	 * it switches back to the current->mm, which can occur in KVM context
+	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
+	 * toggles a static key while handling a VM-Exit.
+	 * Also, this must be done after asi_enter(), as it changes CR3
+	 * when switching address spaces.
+	 */
+	cr3 = __get_current_cr3_fast();
+	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+		vmcs_writel(HOST_CR3, cr3);
+		vmx->loaded_vmcs->host_state.cr3 = cr3;
+	}
 
 	/*
 	 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
 	 * mitigation for MDS is done late in VMentry and is still
 	 * executed in spite of L1D Flush. This is because an extra VERW
 	 * should not matter much after the big hammer L1D Flush.
+	 *
+	 * This is done after asi_enter() only for performance reasons.
+	 * RFC: This also needs to be integrated with ASI's tainting model.
 	 */
 	if (static_branch_unlikely(&vmx_l1d_should_flush))
 		vmx_l1d_flush(vcpu);
@@ -7310,6 +7331,8 @@  static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
 	vmx->idt_vectoring_info = 0;
 
+	asi_relax();
+
 	vmx_enable_fb_clear(vmx);
 
 	if (unlikely(vmx->fail)) {
@@ -7338,7 +7361,7 @@  static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long cr3, cr4;
+	unsigned long cr4;
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!enable_vnmi &&
@@ -7381,19 +7404,6 @@  fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 	vcpu->arch.regs_dirty = 0;
 
-	/*
-	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
-	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
-	 * it switches back to the current->mm, which can occur in KVM context
-	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
-	 * toggles a static key while handling a VM-Exit.
-	 */
-	cr3 = __get_current_cr3_fast();
-	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
-		vmcs_writel(HOST_CR3, cr3);
-		vmx->loaded_vmcs->host_state.cr3 = cr3;
-	}
-
 	cr4 = cr4_read_shadow();
 	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
 		vmcs_writel(HOST_CR4, cr4);
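As an aside on the HOST_CR3 hunk above: VM-Exit reloads CR3 from
vmcs.HOST_CR3, so whichever value is captured into the VMCS determines the
address space the CPU lands in on the next exit. A minimal ordering sketch,
reusing identifiers from the hunk above (illustrative only):

	asi_enter(vcpu->kvm->arch.asi);	/* CR3 now points at the restricted page tables */
	cr3 = __get_current_cr3_fast();	/* sampled only after asi_enter()... */
	vmcs_writel(HOST_CR3, cr3);	/* ...so VM-Exit returns to the restricted space */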
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83fe0a78146fc198115aba0e76ba57ecfb1dd8d9..3e0811eb510650abc601e4adce1ce4189835a730 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -85,6 +85,7 @@ 
 #include <asm/emulate_prefix.h>
 #include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
+#include <asm/asi.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -9674,6 +9675,55 @@  static void kvm_x86_check_cpu_compat(void *ret)
 	*(int *)ret = kvm_x86_check_processor_compatibility();
 }
 
+#ifdef CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION
+static inline int kvm_x86_init_asi_class(void)
+{
+	static struct asi_taint_policy policy = {
+		/*
+		 * Prevent going to the guest with sensitive data potentially
+		 * left in sidechannels by code running in the unrestricted
+		 * address space, or another MM.
+		 */
+		.protect_data =  ASI_TAINT_KERNEL_DATA | ASI_TAINT_OTHER_MM_DATA,
+		/*
+		 * Prevent going to the guest with branch predictor state
+		 * influenced by other processes. Note this bit is about
+		 * protecting the guest from other parts of the system, while
+		 * protect_data is about protecting other parts of the system
+		 * from the guest.
+		 */
+		.prevent_control = ASI_TAINT_OTHER_MM_CONTROL,
+		.set = ASI_TAINT_GUEST_DATA,
+	};
+
+	/*
+	 * Inform ASI that the guest will gain control of the branch predictor,
+	 * unless we're just unconditionally blasting it after VM Exit.
+	 *
+	 * RFC: This is a bit simplified - on some configurations we could avoid
+	 * a duplicated RSB-fill if we had a separate taint specifically for the
+	 * RSB.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) ||
+	    !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+	    !cpu_feature_enabled(X86_FEATURE_RSB_VMEXIT))
+		policy.set |= ASI_TAINT_GUEST_CONTROL;
+
+	/*
+	 * And the same for data left behind by code in the userspace domain
+	 * (i.e. the VMM itself, plus kernel code serving its syscalls etc).
+	 * This should eventually be configurable: users whose VMMs contain
+	 * no secrets can disable it to avoid paying a mitigation cost on
+	 * transition between their guest and userspace.
+	 */
+	policy.protect_data |= ASI_TAINT_USER_DATA;
+
+	return asi_init_class(ASI_CLASS_KVM, &policy);
+}
+#else /* CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION */
+static inline int kvm_x86_init_asi_class(void) { return 0; }
+#endif /* CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION */
+
 int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
 	u64 host_pat;
@@ -9737,6 +9787,10 @@  int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
 	kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
 
+	r = kvm_x86_init_asi_class();
+	if (r < 0)
+		goto out_mmu_exit;
+
 	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
 		kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 		kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
@@ -9754,7 +9808,7 @@  int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 
 	r = ops->hardware_setup();
 	if (r != 0)
-		goto out_mmu_exit;
+		goto out_asi_uninit;
 
 	kvm_ops_update(ops);
 
@@ -9810,6 +9864,8 @@  int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 out_unwind_ops:
 	kvm_x86_ops.enable_virtualization_cpu = NULL;
 	kvm_x86_call(hardware_unsetup)();
+out_asi_uninit:
+	asi_uninit_class(ASI_CLASS_KVM);
 out_mmu_exit:
 	kvm_mmu_vendor_module_exit();
 out_free_percpu:
@@ -9841,6 +9897,7 @@  void kvm_x86_vendor_exit(void)
 	cancel_work_sync(&pvclock_gtod_work);
 #endif
 	kvm_x86_call(hardware_unsetup)();
+	asi_uninit_class(ASI_CLASS_KVM);
 	kvm_mmu_vendor_module_exit();
 	free_percpu(user_return_msrs);
 	kmem_cache_destroy(x86_emulator_cache);
@@ -11574,6 +11631,13 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 	r = vcpu_run(vcpu);
 
+	/*
+	 * At present ASI doesn't have the capability to transition directly
+	 * from the restricted address space to the user address space. So we
+	 * just return to the unrestricted address space in between.
+	 */
+	asi_exit();
+
 out:
 	kvm_put_guest_fpu(vcpu);
 	if (kvm_run->kvm_valid_regs)
@@ -12705,6 +12769,14 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (ret)
 		goto out_uninit_mmu;
 
+	ret = asi_init(kvm->mm, ASI_CLASS_KVM, &kvm->arch.asi);
+	if (ret)
+		goto out_uninit_mmu;
+
+	ret = static_call(kvm_x86_vm_init)(kvm);
+	if (ret)
+		goto out_asi_destroy;
+
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
@@ -12742,6 +12814,8 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	return 0;
 
+out_asi_destroy:
+	asi_destroy(kvm->arch.asi);
 out_uninit_mmu:
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
@@ -12883,6 +12957,7 @@  void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_destroy_vcpus(kvm);
 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
+	asi_destroy(kvm->arch.asi);
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
 	kvm_xen_destroy_vm(kvm);
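Taken together, the lifecycle this patch establishes can be condensed into
the call map below. This is a sketch distilled from the hunks above, with
error handling and unwinding elided; it is not meant to compile as-is, and
each call is annotated with the function that issues it.

/* kvm_x86_vendor_init(): register the KVM taint policy once per module. */
r = asi_init_class(ASI_CLASS_KVM, &policy);

/* kvm_arch_init_vm(): one restricted address space per VM. */
ret = asi_init(kvm->mm, ASI_CLASS_KVM, &kvm->arch.asi);

/* svm_vcpu_enter_exit() / vmx_vcpu_enter_exit(): bracket every guest entry. */
asi_enter(vcpu->kvm->arch.asi);
/* ... VM entry and exit ... */
asi_relax();

/* kvm_arch_vcpu_ioctl_run(): return to the unrestricted address space for now. */
asi_exit();

/* kvm_arch_destroy_vm() / kvm_x86_vendor_exit(): teardown. */
asi_destroy(kvm->arch.asi);
asi_uninit_class(ASI_CLASS_KVM);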