Message ID: 1414142304-6635-1-git-send-email-tiejun.chen@intel.com (mailing list archive)
State: New, archived

On 10/24/2014 11:18 AM, Tiejun Chen wrote:
> Instead of vmx_init(), it actually makes more sense to do anything
> specific to VMX hardware setup in vmx_x86_ops->hardware_setup().
>
> Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>

Please split this patch into multiple parts. It is quite hard to review
this way.

Paolo
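For context, the patch relies on kvm_init() invoking the vendor module's
hardware_setup() callback once at module load, which is why one-time VMX
state can move there out of vmx_init(). A minimal standalone sketch of that
ops-table pattern — all names here are simplified stand-ins, not the
kernel's actual definitions — looks like this:

#include <stdio.h>

/* Simplified stand-in for struct kvm_x86_ops. */
struct x86_ops {
	int (*hardware_setup)(void);	/* one-time, module-load-scope setup */
	void (*hardware_unsetup)(void);	/* mirror image of hardware_setup() */
};

static int vmx_hardware_setup(void)
{
	/* allocate bitmaps, probe the VMCS configuration, etc. (elided) */
	printf("vmx: hardware setup\n");
	return 0;
}

static void vmx_hardware_unsetup(void)
{
	printf("vmx: hardware unsetup\n");
}

static struct x86_ops vmx_ops = {
	.hardware_setup   = vmx_hardware_setup,
	.hardware_unsetup = vmx_hardware_unsetup,
};

/* Stand-in for kvm_init(): the core calls back into the vendor ops. */
static int fake_kvm_init(struct x86_ops *ops)
{
	return ops->hardware_setup();
}

int main(void)
{
	if (fake_kvm_init(&vmx_ops))
		return 1;
	vmx_ops.hardware_unsetup();
	return 0;
}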
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 04fa1b8..9270076 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3106,10 +3106,302 @@ static __init int alloc_kvm_area(void)
 	return 0;
 }
 
+#define MSR_TYPE_R	1
+#define MSR_TYPE_W	2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__clear_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__clear_bit(msr, msr_bitmap + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__clear_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+	}
+}
+
+static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__set_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__set_bit(msr, msr_bitmap + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__set_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__set_bit(msr, msr_bitmap + 0xc00 / f);
+
+	}
+}
+
+static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+{
+	if (!longmode_only)
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+					msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+{
+	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+			msr, MSR_TYPE_R);
+	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+			msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+{
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+			msr, MSR_TYPE_R);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+			msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+{
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+			msr, MSR_TYPE_W);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+			msr, MSR_TYPE_W);
+}
+
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+	return enable_apicv && irqchip_in_kernel(kvm);
+}
+
+static void ept_set_mmio_spte_mask(void)
+{
+	/*
+	 * EPT Misconfigurations can be generated if the value of bits 2:0
+	 * of an EPT paging-structure entry is 110b (write/execute).
+	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
+	 * spte.
+	 */
+	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
+}
+
+static int __grow_ple_window(int val)
+{
+	if (ple_window_grow < 1)
+		return ple_window;
+
+	val = min(val, ple_window_actual_max);
+
+	if (ple_window_grow < ple_window)
+		val *= ple_window_grow;
+	else
+		val += ple_window_grow;
+
+	return val;
+}
+
+static int __shrink_ple_window(int val, int modifier, int minimum)
+{
+	if (modifier < 1)
+		return ple_window;
+
+	if (modifier < ple_window)
+		val /= modifier;
+	else
+		val -= modifier;
+
+	return max(val, minimum);
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int old = vmx->ple_window;
+
+	vmx->ple_window = __grow_ple_window(old);
+
+	if (vmx->ple_window != old)
+		vmx->ple_window_dirty = true;
+
+	trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int old = vmx->ple_window;
+
+	vmx->ple_window = __shrink_ple_window(old,
+					      ple_window_shrink, ple_window);
+
+	if (vmx->ple_window != old)
+		vmx->ple_window_dirty = true;
+
+	trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+/*
+ * ple_window_actual_max is computed to be one grow_ple_window() below
+ * ple_window_max. (See __grow_ple_window for the reason.)
+ * This prevents overflows, because ple_window_max is int.
+ * ple_window_max effectively rounded down to a multiple of ple_window_grow in
+ * this process.
+ * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
+ */
+static void update_ple_window_actual_max(void)
+{
+	ple_window_actual_max =
+			__shrink_ple_window(max(ple_window_max, ple_window),
+					    ple_window_grow, INT_MIN);
+}
+
+
 static __init int hardware_setup(void)
 {
-	if (setup_vmcs_config(&vmcs_config) < 0)
-		return -EIO;
+	int r = -ENOMEM, i, msr;
+
+	rdmsrl_safe(MSR_EFER, &host_efer);
+
+	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
+		kvm_define_shared_msr(i, vmx_msr_index[i]);
+
+	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_io_bitmap_a)
+		return r;
+
+	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_io_bitmap_b)
+		goto out;
+
+	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_legacy)
+		goto out1;
+
+	vmx_msr_bitmap_legacy_x2apic =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_legacy_x2apic)
+		goto out2;
+
+	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_longmode)
+		goto out3;
+
+	vmx_msr_bitmap_longmode_x2apic =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_longmode_x2apic)
+		goto out4;
+	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_vmread_bitmap)
+		goto out5;
+
+	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_vmwrite_bitmap)
+		goto out6;
+
+	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+	/*
+	 * Allow direct access to the PC debug port (it is often used for I/O
+	 * delays, but the vmexits simply slow things down).
+	 */
+	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+	clear_bit(0x80, vmx_io_bitmap_a);
+
+	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
+
+	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+
+	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
+	memcpy(vmx_msr_bitmap_legacy_x2apic,
+			vmx_msr_bitmap_legacy, PAGE_SIZE);
+	memcpy(vmx_msr_bitmap_longmode_x2apic,
+			vmx_msr_bitmap_longmode, PAGE_SIZE);
+
+	if (enable_apicv) {
+		for (msr = 0x800; msr <= 0x8ff; msr++)
+			vmx_disable_intercept_msr_read_x2apic(msr);
+
+		/* According SDM, in x2apic mode, the whole id reg is used.
+		 * But in KVM, it only use the highest eight bits. Need to
+		 * intercept it */
+		vmx_enable_intercept_msr_read_x2apic(0x802);
+		/* TMCCT */
+		vmx_enable_intercept_msr_read_x2apic(0x839);
+		/* TPR */
+		vmx_disable_intercept_msr_write_x2apic(0x808);
+		/* EOI */
+		vmx_disable_intercept_msr_write_x2apic(0x80b);
+		/* SELF-IPI */
+		vmx_disable_intercept_msr_write_x2apic(0x83f);
+	}
+
+	if (enable_ept) {
+		kvm_mmu_set_mask_ptes(0ull,
+			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+			0ull, VMX_EPT_EXECUTABLE_MASK);
+		ept_set_mmio_spte_mask();
+		kvm_enable_tdp();
+	} else
+		kvm_disable_tdp();
+
+	update_ple_window_actual_max();
+
+	if (setup_vmcs_config(&vmcs_config) < 0) {
+		r = -EIO;
+		goto out7;
+	}
 
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
@@ -3169,10 +3461,38 @@ static __init int hardware_setup(void)
 	nested_vmx_setup_ctls_msrs();
 
 	return alloc_kvm_area();
+
+out7:
+	free_page((unsigned long)vmx_vmwrite_bitmap);
+out6:
+	free_page((unsigned long)vmx_vmread_bitmap);
+out5:
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+out4:
+	free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+out2:
+	free_page((unsigned long)vmx_msr_bitmap_legacy);
+out1:
+	free_page((unsigned long)vmx_io_bitmap_b);
+out:
+	free_page((unsigned long)vmx_io_bitmap_a);
+
+	return r;
 }
 
 static __exit void hardware_unsetup(void)
 {
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+	free_page((unsigned long)vmx_msr_bitmap_legacy);
+	free_page((unsigned long)vmx_msr_bitmap_longmode);
+	free_page((unsigned long)vmx_io_bitmap_b);
+	free_page((unsigned long)vmx_io_bitmap_a);
+	free_page((unsigned long)vmx_vmwrite_bitmap);
+	free_page((unsigned long)vmx_vmread_bitmap);
+
 	free_kvm_area();
 }
 
@@ -4057,162 +4377,52 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	kvm->arch.apic_access_page_done = true;
 out:
 	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-static int alloc_identity_pagetable(struct kvm *kvm)
-{
-	/* Called with kvm->slots_lock held. */
-
-	struct kvm_userspace_memory_region kvm_userspace_mem;
-	int r = 0;
-
-	BUG_ON(kvm->arch.ept_identity_pagetable_done);
-
-	kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
-	kvm_userspace_mem.flags = 0;
-	kvm_userspace_mem.guest_phys_addr =
-		kvm->arch.ept_identity_map_addr;
-	kvm_userspace_mem.memory_size = PAGE_SIZE;
-	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
-
-	return r;
-}
-
-static void allocate_vpid(struct vcpu_vmx *vmx)
-{
-	int vpid;
-
-	vmx->vpid = 0;
-	if (!enable_vpid)
-		return;
-	spin_lock(&vmx_vpid_lock);
-	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
-	if (vpid < VMX_NR_VPIDS) {
-		vmx->vpid = vpid;
-		__set_bit(vpid, vmx_vpid_bitmap);
-	}
-	spin_unlock(&vmx_vpid_lock);
-}
-
-static void free_vpid(struct vcpu_vmx *vmx)
-{
-	if (!enable_vpid)
-		return;
-	spin_lock(&vmx_vpid_lock);
-	if (vmx->vpid != 0)
-		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
-	spin_unlock(&vmx_vpid_lock);
-}
-
-#define MSR_TYPE_R	1
-#define MSR_TYPE_W	2
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-						u32 msr, int type)
-{
-	int f = sizeof(unsigned long);
-
-	if (!cpu_has_vmx_msr_bitmap())
-		return;
-
-	/*
-	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-	 * have the write-low and read-high bitmap offsets the wrong way round.
-	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-	 */
-	if (msr <= 0x1fff) {
-		if (type & MSR_TYPE_R)
-			/* read-low */
-			__clear_bit(msr, msr_bitmap + 0x000 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-low */
-			__clear_bit(msr, msr_bitmap + 0x800 / f);
-
-	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-		msr &= 0x1fff;
-		if (type & MSR_TYPE_R)
-			/* read-high */
-			__clear_bit(msr, msr_bitmap + 0x400 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-high */
-			__clear_bit(msr, msr_bitmap + 0xc00 / f);
-
-	}
-}
-
-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
-						u32 msr, int type)
-{
-	int f = sizeof(unsigned long);
-
-	if (!cpu_has_vmx_msr_bitmap())
-		return;
-
-	/*
-	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-	 * have the write-low and read-high bitmap offsets the wrong way round.
-	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-	 */
-	if (msr <= 0x1fff) {
-		if (type & MSR_TYPE_R)
-			/* read-low */
-			__set_bit(msr, msr_bitmap + 0x000 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-low */
-			__set_bit(msr, msr_bitmap + 0x800 / f);
-
-	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-		msr &= 0x1fff;
-		if (type & MSR_TYPE_R)
-			/* read-high */
-			__set_bit(msr, msr_bitmap + 0x400 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-high */
-			__set_bit(msr, msr_bitmap + 0xc00 / f);
-
-	}
-}
-
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
-{
-	if (!longmode_only)
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
-						msr, MSR_TYPE_R | MSR_TYPE_W);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
-					msr, MSR_TYPE_R | MSR_TYPE_W);
-}
-
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
-{
-	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_R);
-	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_R);
+	return r;
 }
 
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static int alloc_identity_pagetable(struct kvm *kvm)
 {
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_R);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_R);
+	/* Called with kvm->slots_lock held. */
+
+	struct kvm_userspace_memory_region kvm_userspace_mem;
+	int r = 0;
+
+	BUG_ON(kvm->arch.ept_identity_pagetable_done);
+
+	kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+	kvm_userspace_mem.flags = 0;
+	kvm_userspace_mem.guest_phys_addr =
+		kvm->arch.ept_identity_map_addr;
+	kvm_userspace_mem.memory_size = PAGE_SIZE;
+	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
+
+	return r;
 }
 
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void allocate_vpid(struct vcpu_vmx *vmx)
 {
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_W);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_W);
+	int vpid;
+
+	vmx->vpid = 0;
+	if (!enable_vpid)
+		return;
+	spin_lock(&vmx_vpid_lock);
+	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+	if (vpid < VMX_NR_VPIDS) {
+		vmx->vpid = vpid;
+		__set_bit(vpid, vmx_vpid_bitmap);
+	}
+	spin_unlock(&vmx_vpid_lock);
 }
 
-static int vmx_vm_has_apicv(struct kvm *kvm)
+static void free_vpid(struct vcpu_vmx *vmx)
 {
-	return enable_apicv && irqchip_in_kernel(kvm);
+	if (!enable_vpid)
+		return;
+	spin_lock(&vmx_vpid_lock);
+	if (vmx->vpid != 0)
+		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
+	spin_unlock(&vmx_vpid_lock);
 }
 
 /*
@@ -4376,17 +4586,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
-static void ept_set_mmio_spte_mask(void)
-{
-	/*
-	 * EPT Misconfigurations can be generated if the value of bits 2:0
-	 * of an EPT paging-structure entry is 110b (write/execute).
-	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
-	 * spte.
-	 */
-	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
-}
-
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -5706,76 +5905,6 @@ out:
 	return ret;
 }
 
-static int __grow_ple_window(int val)
-{
-	if (ple_window_grow < 1)
-		return ple_window;
-
-	val = min(val, ple_window_actual_max);
-
-	if (ple_window_grow < ple_window)
-		val *= ple_window_grow;
-	else
-		val += ple_window_grow;
-
-	return val;
-}
-
-static int __shrink_ple_window(int val, int modifier, int minimum)
-{
-	if (modifier < 1)
-		return ple_window;
-
-	if (modifier < ple_window)
-		val /= modifier;
-	else
-		val -= modifier;
-
-	return max(val, minimum);
-}
-
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int old = vmx->ple_window;
-
-	vmx->ple_window = __grow_ple_window(old);
-
-	if (vmx->ple_window != old)
-		vmx->ple_window_dirty = true;
-
-	trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int old = vmx->ple_window;
-
-	vmx->ple_window = __shrink_ple_window(old,
-					      ple_window_shrink, ple_window);
-
-	if (vmx->ple_window != old)
-		vmx->ple_window_dirty = true;
-
-	trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
-}
-
-/*
- * ple_window_actual_max is computed to be one grow_ple_window() below
- * ple_window_max. (See __grow_ple_window for the reason.)
- * This prevents overflows, because ple_window_max is int.
- * ple_window_max effectively rounded down to a multiple of ple_window_grow in
- * this process.
- * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
- */
-static void update_ple_window_actual_max(void)
-{
-	ple_window_actual_max =
-			__shrink_ple_window(max(ple_window_max, ple_window),
-					    ple_window_grow, INT_MIN);
-}
-
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -9158,150 +9287,23 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-	int r, i, msr;
-
-	rdmsrl_safe(MSR_EFER, &host_efer);
-
-	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
-		kvm_define_shared_msr(i, vmx_msr_index[i]);
-
-	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_io_bitmap_a)
-		return -ENOMEM;
-
-	r = -ENOMEM;
-
-	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_io_bitmap_b)
-		goto out;
-
-	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_legacy)
-		goto out1;
-
-	vmx_msr_bitmap_legacy_x2apic =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_legacy_x2apic)
-		goto out2;
-
-	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_longmode)
-		goto out3;
-
-	vmx_msr_bitmap_longmode_x2apic =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_longmode_x2apic)
-		goto out4;
-	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_vmread_bitmap)
-		goto out5;
-
-	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_vmwrite_bitmap)
-		goto out6;
-
-	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
-	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
-
-	/*
-	 * Allow direct access to the PC debug port (it is often used for I/O
-	 * delays, but the vmexits simply slow things down).
-	 */
-	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-	clear_bit(0x80, vmx_io_bitmap_a);
-
-	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
-	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
-	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
-	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+	int r = -ENOMEM;
 
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
-		goto out7;
+		return r;
 
 #ifdef CONFIG_KEXEC
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
 			   crash_vmclear_local_loaded_vmcss);
 #endif
 
-	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
-	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
-	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
-	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
-	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
-	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
-
-	memcpy(vmx_msr_bitmap_legacy_x2apic,
-			vmx_msr_bitmap_legacy, PAGE_SIZE);
-	memcpy(vmx_msr_bitmap_longmode_x2apic,
-			vmx_msr_bitmap_longmode, PAGE_SIZE);
-
-	if (enable_apicv) {
-		for (msr = 0x800; msr <= 0x8ff; msr++)
-			vmx_disable_intercept_msr_read_x2apic(msr);
-
-		/* According SDM, in x2apic mode, the whole id reg is used.
-		 * But in KVM, it only use the highest eight bits. Need to
-		 * intercept it */
-		vmx_enable_intercept_msr_read_x2apic(0x802);
-		/* TMCCT */
-		vmx_enable_intercept_msr_read_x2apic(0x839);
-		/* TPR */
-		vmx_disable_intercept_msr_write_x2apic(0x808);
-		/* EOI */
-		vmx_disable_intercept_msr_write_x2apic(0x80b);
-		/* SELF-IPI */
-		vmx_disable_intercept_msr_write_x2apic(0x83f);
-	}
-
-	if (enable_ept) {
-		kvm_mmu_set_mask_ptes(0ull,
-			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-			0ull, VMX_EPT_EXECUTABLE_MASK);
-		ept_set_mmio_spte_mask();
-		kvm_enable_tdp();
-	} else
-		kvm_disable_tdp();
-
-	update_ple_window_actual_max();
-
 	return 0;
-
-out7:
-	free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
-	free_page((unsigned long)vmx_vmread_bitmap);
-out5:
-	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
-	free_page((unsigned long)vmx_msr_bitmap_longmode);
-out3:
-	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
-	free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
-	free_page((unsigned long)vmx_io_bitmap_b);
-out:
-	free_page((unsigned long)vmx_io_bitmap_a);
-	return r;
 }
 
 static void __exit vmx_exit(void)
 {
-	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-	free_page((unsigned long)vmx_msr_bitmap_legacy);
-	free_page((unsigned long)vmx_msr_bitmap_longmode);
-	free_page((unsigned long)vmx_io_bitmap_b);
-	free_page((unsigned long)vmx_io_bitmap_a);
-	free_page((unsigned long)vmx_vmwrite_bitmap);
-	free_page((unsigned long)vmx_vmread_bitmap);
-
 #ifdef CONFIG_KEXEC
 	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
 	synchronize_rcu();
Instead of vmx_init(), it actually makes more sense to do anything
specific to VMX hardware setup in vmx_x86_ops->hardware_setup().

Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>
---
 arch/x86/kvm/vmx.c | 720 +++++++++++++++++++++++++++--------------------------
 1 file changed, 361 insertions(+), 359 deletions(-)
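The relocated hardware_setup() above handles its allocation failures with
the kernel's usual goto-unwind idiom: each failure jumps to a label that
frees everything allocated before it, in reverse order, exactly mirroring
the out/out1/.../out7 chain moved over from vmx_init(). A minimal userspace
sketch of the same shape — names are illustrative, not taken from the patch
— for readers unfamiliar with the pattern:

#include <stdlib.h>

/* Stand-ins for the module-scope bitmaps (vmx_io_bitmap_a and friends). */
static void *buf_a, *buf_b, *buf_c;

static int setup_buffers(void)
{
	int r = -1;	/* would be -ENOMEM in the kernel */

	buf_a = malloc(4096);
	if (!buf_a)
		return r;	/* nothing to unwind yet */

	buf_b = malloc(4096);
	if (!buf_b)
		goto out;	/* unwind buf_a only */

	buf_c = malloc(4096);
	if (!buf_c)
		goto out1;	/* unwind buf_b, then fall through to buf_a */

	return 0;

/* Labels free in reverse allocation order; each falls through to the next. */
out1:
	free(buf_b);
out:
	free(buf_a);
	return r;
}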