diff mbox

[v11,2/3] x86, apicv: add virtual x2apic support

Message ID 1358331672-32384-3-git-send-email-yang.z.zhang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhang, Yang Z Jan. 16, 2013, 10:21 a.m. UTC
From: Yang Zhang <yang.z.zhang@Intel.com>

basically to benefit from apicv, we need to enable virtualized x2apic mode.
Currently, we only enable it when guest is really using x2apic.

Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic:
    0x800 - 0x8ff: no read intercept for apicv register virtualization,
                   except APIC ID and TMCCT which need software's assistance to
		   get right value.

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/include/asm/vmx.h      |    1 +
 arch/x86/kvm/lapic.c            |   20 ++--
 arch/x86/kvm/lapic.h            |    5 +
 arch/x86/kvm/svm.c              |    6 +
 arch/x86/kvm/vmx.c              |  204 +++++++++++++++++++++++++++++++++++----
 6 files changed, 209 insertions(+), 28 deletions(-)

Comments

Gleb Natapov Jan. 17, 2013, 1:22 p.m. UTC | #1
On Wed, Jan 16, 2013 at 06:21:11PM +0800, Yang Zhang wrote:
> From: Yang Zhang <yang.z.zhang@Intel.com>
> 
> basically to benefit from apicv, we need to enable virtualized x2apic mode.
> Currently, we only enable it when guest is really using x2apic.
> 
> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic:
>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>                    except APIC ID and TMCCT which need software's assistance to
> 		   get right value.
> 
> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/include/asm/vmx.h      |    1 +
>  arch/x86/kvm/lapic.c            |   20 ++--
>  arch/x86/kvm/lapic.h            |    5 +
>  arch/x86/kvm/svm.c              |    6 +
>  arch/x86/kvm/vmx.c              |  204 +++++++++++++++++++++++++++++++++++----
>  6 files changed, 209 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..35aa8e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -697,6 +697,7 @@ struct kvm_x86_ops {
>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> +	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>  	int (*get_tdp_level)(void);
>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 44c3f7e..0a54df0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -139,6 +139,7 @@
>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
>  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
>  #define SECONDARY_EXEC_RDTSCP			0x00000008
> +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
>  #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
>  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 0664c13..f39aee3 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -140,11 +140,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
>  	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
>  	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
>  
> -static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> -{
> -	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> -}
> -
>  static inline int kvm_apic_id(struct kvm_lapic *apic)
>  {
>  	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> @@ -1323,12 +1318,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>  	if (!kvm_vcpu_is_bsp(apic->vcpu))
>  		value &= ~MSR_IA32_APICBASE_BSP;
>  
> -	vcpu->arch.apic_base = value;
> -	if (apic_x2apic_mode(apic)) {
> -		u32 id = kvm_apic_id(apic);
> -		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> -		kvm_apic_set_ldr(apic, ldr);
> +	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
> +		if (value & X2APIC_ENABLE) {
> +			u32 id = kvm_apic_id(apic);
> +			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> +			kvm_apic_set_ldr(apic, ldr);
> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> +		} else
> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
>  	}
> +
> +	vcpu->arch.apic_base = value;
>  	apic->base_address = apic->vcpu->arch.apic_base &
>  			     MSR_IA32_APICBASE_BASE;
>  
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 9a8ee22..22a5397 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -126,4 +126,9 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
>  	return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
>  }
>  
> +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> +{
> +	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> +}
> +
>  #endif
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d29d3cd..38407e9 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
> +{
> +	return;
> +}
> +
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
>  
>  	.set_tss_addr = svm_set_tss_addr,
>  	.get_tdp_level = get_npt_level,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 0403634..526955d 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -643,6 +643,8 @@ static unsigned long *vmx_io_bitmap_a;
>  static unsigned long *vmx_io_bitmap_b;
>  static unsigned long *vmx_msr_bitmap_legacy;
>  static unsigned long *vmx_msr_bitmap_longmode;
> +static unsigned long *vmx_msr_bitmap_legacy_x2apic;
> +static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>  
>  static bool cpu_has_load_ia32_efer;
>  static bool cpu_has_load_perf_global_ctrl;
> @@ -767,6 +769,12 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>  }
>  
> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
> +{
> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +}
> +
>  static inline bool cpu_has_vmx_apic_register_virt(void)
>  {
>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
> @@ -1830,6 +1838,24 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
>  	vmx->guest_msrs[from] = tmp;
>  }
>  
> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long *msr_bitmap;
> +
> +	if (apic_x2apic_mode(vcpu->arch.apic))
> +		if (is_long_mode(vcpu))
> +			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
> +		else
> +			msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
> +	else
> +		if (is_long_mode(vcpu))
> +			msr_bitmap = vmx_msr_bitmap_longmode;
> +		else
> +			msr_bitmap = vmx_msr_bitmap_legacy;
> +
> +	vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
> +}
> +
>  /*
>   * Set up the vmcs to automatically save and restore system
>   * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
> @@ -1838,7 +1864,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
>  static void setup_msrs(struct vcpu_vmx *vmx)
>  {
>  	int save_nmsrs, index;
> -	unsigned long *msr_bitmap;
>  
>  	save_nmsrs = 0;
>  #ifdef CONFIG_X86_64
> @@ -1870,14 +1895,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)
>  
>  	vmx->save_nmsrs = save_nmsrs;
>  
> -	if (cpu_has_vmx_msr_bitmap()) {
> -		if (is_long_mode(&vmx->vcpu))
> -			msr_bitmap = vmx_msr_bitmap_longmode;
> -		else
> -			msr_bitmap = vmx_msr_bitmap_legacy;
> -
> -		vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
> -	}
> +	if (cpu_has_vmx_msr_bitmap())
> +		vmx_set_msr_bitmap(&vmx->vcpu);
>  }
>  
>  /*
> @@ -2543,6 +2562,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
>  		min2 = 0;
>  		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>  			SECONDARY_EXEC_WBINVD_EXITING |
>  			SECONDARY_EXEC_ENABLE_VPID |
>  			SECONDARY_EXEC_ENABLE_EPT |
> @@ -3724,7 +3744,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
>  	spin_unlock(&vmx_vpid_lock);
>  }
>  
> -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> +#define MSR_TYPE_R	1
> +#define MSR_TYPE_W	2
> +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
> +						u32 msr, int type)
>  {
>  	int f = sizeof(unsigned long);
>  
> @@ -3737,20 +3760,99 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>  	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
>  	 */
>  	if (msr <= 0x1fff) {
> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
> +		if (type & MSR_TYPE_R)
> +			/* read-low */
> +			__clear_bit(msr, msr_bitmap + 0x000 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-low */
> +			__clear_bit(msr, msr_bitmap + 0x800 / f);
> +
>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>  		msr &= 0x1fff;
> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
> +		if (type & MSR_TYPE_R)
> +			/* read-high */
> +			__clear_bit(msr, msr_bitmap + 0x400 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-high */
> +			__clear_bit(msr, msr_bitmap + 0xc00 / f);
> +
> +	}
> +}
> +
> +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
> +						u32 msr, int type)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (!cpu_has_vmx_msr_bitmap())
> +		return;
> +
> +	/*
> +	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
> +	 * have the write-low and read-high bitmap offsets the wrong way round.
> +	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
> +	 */
> +	if (msr <= 0x1fff) {
> +		if (type & MSR_TYPE_R)
> +			/* read-low */
> +			__set_bit(msr, msr_bitmap + 0x000 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-low */
> +			__set_bit(msr, msr_bitmap + 0x800 / f);
> +
> +	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> +		msr &= 0x1fff;
> +		if (type & MSR_TYPE_R)
> +			/* read-high */
> +			__set_bit(msr, msr_bitmap + 0x400 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-high */
> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
> +
>  	}
>  }
>  
>  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
>  {
>  	if (!longmode_only)
> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> +}
> +
> +static void vmx_intercept_for_msr_read_x2apic(u32 msr, bool set)
> +{
> +	if (set) {
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_R);
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_R);
> +	} else {
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_R);
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_R);
> +	}
> +}
> +
> +static void vmx_intercept_for_msr_write_x2apic(u32 msr, bool set)
> +{
> +	if (set) {
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_W);
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_W);
> +	} else {
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_W);
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_W);
> +	}
>  }
>  
>  /*
> @@ -3848,6 +3950,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>  	if (!enable_apicv_reg)
>  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>  	return exec_control;
>  }
>  
> @@ -6103,6 +6206,53 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  	vmcs_write32(TPR_THRESHOLD, irr);
>  }
>  
> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
> +{
> +	u32 exec_control, sec_exec_control;
> +	int msr;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	/* There is not point to enable virtualize x2apic without enable
> +	 * apicv*/
> +	if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
> +		return;
> +
> +	if (set) {
> +		exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +		/* virtualize x2apic mode relies on tpr shadow */
> +		if (!(exec_control & CPU_BASED_TPR_SHADOW))
> +			return;
> +	}
> +
> +	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +
> +	if (set) {
> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +	} else {
> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
> +			sec_exec_control |=
> +					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +	}
> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
> +
> +	if (set) {
> +		for (msr = 0x800; msr <= 0x8ff; msr++)
> +			vmx_intercept_for_msr_read_x2apic(msr, false);
> +
> +		/* According SDM, in x2apic mode, the whole id reg is used.
> +		 * But in KVM, it only use the highest eight bits. Need to
> +		 * intercept it */
> +		vmx_intercept_for_msr_read_x2apic(0x802, true);
> +		/* TMCCT */
> +		vmx_intercept_for_msr_read_x2apic(0x839, true);
> +		/* TPR */
> +		vmx_intercept_for_msr_write_x2apic(0x808, false);
> +	}
Do it during vmx init, not here. Here you only need to call
vmx_set_msr_bitmap().

> +	vmx_set_msr_bitmap(vcpu);
> +}
> +
>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
>  {
>  	u32 exit_intr_info;
> @@ -7366,6 +7516,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
>  
>  	.set_tss_addr = vmx_set_tss_addr,
>  	.get_tdp_level = get_ept_level,
> @@ -7419,11 +7570,19 @@ static int __init vmx_init(void)
>  	if (!vmx_msr_bitmap_legacy)
>  		goto out1;
>  
> +	vmx_msr_bitmap_legacy_x2apic =
> +				(unsigned long *)__get_free_page(GFP_KERNEL);
> +	if (!vmx_msr_bitmap_legacy_x2apic)
> +		goto out2;
>  
>  	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
>  	if (!vmx_msr_bitmap_longmode)
> -		goto out2;
> +		goto out3;
>  
> +	vmx_msr_bitmap_longmode_x2apic =
> +				(unsigned long *)__get_free_page(GFP_KERNEL);
> +	if (!vmx_msr_bitmap_longmode_x2apic)
> +		goto out4;
>  
>  	/*
>  	 * Allow direct access to the PC debug port (it is often used for I/O
> @@ -7456,6 +7615,11 @@ static int __init vmx_init(void)
>  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
>  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
>  
> +	memcpy(vmx_msr_bitmap_legacy_x2apic,
> +			vmx_msr_bitmap_legacy, PAGE_SIZE);
> +	memcpy(vmx_msr_bitmap_longmode_x2apic,
> +			vmx_msr_bitmap_longmode, PAGE_SIZE);
> +
>  	if (enable_ept) {
>  		kvm_mmu_set_mask_ptes(0ull,
>  			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
> @@ -7468,8 +7632,10 @@ static int __init vmx_init(void)
>  
>  	return 0;
>  
> -out3:
> +out4:
>  	free_page((unsigned long)vmx_msr_bitmap_longmode);
> +out3:
> +	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
>  out2:
>  	free_page((unsigned long)vmx_msr_bitmap_legacy);
>  out1:
> @@ -7481,6 +7647,8 @@ out:
>  
>  static void __exit vmx_exit(void)
>  {
> +	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
> +	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>  	free_page((unsigned long)vmx_msr_bitmap_legacy);
>  	free_page((unsigned long)vmx_msr_bitmap_longmode);
>  	free_page((unsigned long)vmx_io_bitmap_b);
> -- 
> 1.7.1

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 18, 2013, 1:49 a.m. UTC | #2
Gleb Natapov wrote on 2013-01-17:
> On Wed, Jan 16, 2013 at 06:21:11PM +0800, Yang Zhang wrote:
>> From: Yang Zhang <yang.z.zhang@Intel.com>
>> @@ -6103,6 +6206,53 @@ static void update_cr8_intercept(struct kvm_vcpu
> *vcpu, int tpr, int irr)
>>  	vmcs_write32(TPR_THRESHOLD, irr);
>>  }
>> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool
>> set) +{ +	u32 exec_control, sec_exec_control; +	int msr; +	struct
>> vcpu_vmx *vmx = to_vmx(vcpu); + +	/* There is not point to enable
>> virtualize x2apic without enable +	 * apicv*/ +	if
>> (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) +		return;
>> + +	if (set) { +		exec_control =
>> vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +		/* virtualize x2apic mode
>> relies on tpr shadow */ +		if (!(exec_control & CPU_BASED_TPR_SHADOW))
>> +			return; +	} + +	sec_exec_control =
>> vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + +	if (set) {
>> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>> +		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +	} else
>> { +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +		if
>> (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) +			sec_exec_control
>> |= +					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +	}
>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + +	if
>> (set) { +		for (msr = 0x800; msr <= 0x8ff; msr++)
>> +			vmx_intercept_for_msr_read_x2apic(msr, false); + +		/* According
>> SDM, in x2apic mode, the whole id reg is used. +		 * But in KVM, it
>> only use the highest eight bits. Need to +		 * intercept it */
>> +		vmx_intercept_for_msr_read_x2apic(0x802, true); +		/* TMCCT */
>> +		vmx_intercept_for_msr_read_x2apic(0x839, true); +		/* TPR */
>> +		vmx_intercept_for_msr_write_x2apic(0x808, false); +	}
> Do it during vmx init, not here. Here you only need to call
> vmx_set_msr_bitmap().
Sure. It is more reasonable. 

>> +	vmx_set_msr_bitmap(vcpu);
>> +}
>> +
>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { 	u32
>>  exit_intr_info; @@ -7366,6 +7516,7 @@ static struct kvm_x86_ops
>>  vmx_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>  update_cr8_intercept,
>> +	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
>> 
>>  	.set_tss_addr = vmx_set_tss_addr, 	.get_tdp_level = get_ept_level, @@
>>  -7419,11 +7570,19 @@ static int __init vmx_init(void) 	if
>>  (!vmx_msr_bitmap_legacy) 		goto out1;
>> +	vmx_msr_bitmap_legacy_x2apic =
>> +				(unsigned long *)__get_free_page(GFP_KERNEL);
>> +	if (!vmx_msr_bitmap_legacy_x2apic)
>> +		goto out2;
>> 
>>  	vmx_msr_bitmap_longmode = (unsigned long
>>  *)__get_free_page(GFP_KERNEL); 	if (!vmx_msr_bitmap_longmode)
>> -		goto out2;
>> +		goto out3;
>> 
>> +	vmx_msr_bitmap_longmode_x2apic =
>> +				(unsigned long *)__get_free_page(GFP_KERNEL);
>> +	if (!vmx_msr_bitmap_longmode_x2apic)
>> +		goto out4;
>> 
>>  	/* 	 * Allow direct access to the PC debug port (it is often used for
>>  I/O @@ -7456,6 +7615,11 @@ static int __init vmx_init(void)
>>  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
>>  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
>> +	memcpy(vmx_msr_bitmap_legacy_x2apic,
>> +			vmx_msr_bitmap_legacy, PAGE_SIZE);
>> +	memcpy(vmx_msr_bitmap_longmode_x2apic,
>> +			vmx_msr_bitmap_longmode, PAGE_SIZE);
>> +
>>  	if (enable_ept) {
>>  		kvm_mmu_set_mask_ptes(0ull,
>>  			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
>> @@ -7468,8 +7632,10 @@ static int __init vmx_init(void)
>> 
>>  	return 0;
>> -out3:
>> +out4:
>>  	free_page((unsigned long)vmx_msr_bitmap_longmode);
>> +out3:
>> +	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
>>  out2:
>>  	free_page((unsigned long)vmx_msr_bitmap_legacy);
>>  out1:
>> @@ -7481,6 +7647,8 @@ out:
>> 
>>  static void __exit vmx_exit(void)
>>  {
>> +	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
>> +	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>>  	free_page((unsigned long)vmx_msr_bitmap_legacy);
>>  	free_page((unsigned long)vmx_msr_bitmap_longmode);
>>  	free_page((unsigned long)vmx_io_bitmap_b);
>> --
>> 1.7.1
> 
> --
> 			Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Jan. 21, 2013, 7:59 p.m. UTC | #3
On Wed, Jan 16, 2013 at 06:21:11PM +0800, Yang Zhang wrote:
> From: Yang Zhang <yang.z.zhang@Intel.com>
> 
> basically to benefit from apicv, we need to enable virtualized x2apic mode.
> Currently, we only enable it when guest is really using x2apic.
> 
> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic:
>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>                    except APIC ID and TMCCT which need software's assistance to
> 		   get right value.
> 
> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/include/asm/vmx.h      |    1 +
>  arch/x86/kvm/lapic.c            |   20 ++--
>  arch/x86/kvm/lapic.h            |    5 +
>  arch/x86/kvm/svm.c              |    6 +
>  arch/x86/kvm/vmx.c              |  204 +++++++++++++++++++++++++++++++++++----
>  6 files changed, 209 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..35aa8e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -697,6 +697,7 @@ struct kvm_x86_ops {
>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> +	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>  	int (*get_tdp_level)(void);
>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 44c3f7e..0a54df0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -139,6 +139,7 @@
>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
>  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
>  #define SECONDARY_EXEC_RDTSCP			0x00000008
> +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
>  #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
>  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 0664c13..f39aee3 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -140,11 +140,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
>  	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
>  	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
>  
> -static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> -{
> -	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> -}
> -
>  static inline int kvm_apic_id(struct kvm_lapic *apic)
>  {
>  	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> @@ -1323,12 +1318,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>  	if (!kvm_vcpu_is_bsp(apic->vcpu))
>  		value &= ~MSR_IA32_APICBASE_BSP;
>  
> -	vcpu->arch.apic_base = value;
> -	if (apic_x2apic_mode(apic)) {
> -		u32 id = kvm_apic_id(apic);
> -		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> -		kvm_apic_set_ldr(apic, ldr);
> +	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
> +		if (value & X2APIC_ENABLE) {
> +			u32 id = kvm_apic_id(apic);
> +			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> +			kvm_apic_set_ldr(apic, ldr);
> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> +		} else
> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
>  	}
> +
> +	vcpu->arch.apic_base = value;

Simpler to have

if (apic_x2apic_mode(apic)) {
	...
	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
} else {
	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
}

Also it must be done after assignment of vcpu->arch.apic_base (this
patch has vcpu->arch.apic_base being read from
->set_virtual_x2apic_mode() path).

> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long *msr_bitmap;
> +
> +	if (apic_x2apic_mode(vcpu->arch.apic))

vcpu->arch.apic can be NULL.

> +static void vmx_intercept_for_msr_read_x2apic(u32 msr, bool set)
> +{
> +	if (set) {
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_R);
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_R);
> +	} else {
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_R);
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_R);
> +	}
> +}

Please retain the enable_intercept/disable_intercept naming in the
function name, instead of a set parameter.

> +static void vmx_intercept_for_msr_write_x2apic(u32 msr, bool set)
> +{
> +	if (set) {
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_W);
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_W);
> +	} else {
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> +				msr, MSR_TYPE_W);
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> +				msr, MSR_TYPE_W);
> +	}
>  }

Same here.

> @@ -3848,6 +3950,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>  	if (!enable_apicv_reg)
>  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>  	return exec_control;

Unconditionally disabling SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE? Its
awkward.

> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	/* There is not point to enable virtualize x2apic without enable
> +	 * apicv*/
> +	if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
> +		return;
> +
> +	if (set) {
> +		exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +		/* virtualize x2apic mode relies on tpr shadow */
> +		if (!(exec_control & CPU_BASED_TPR_SHADOW))
> +			return;
> +	}
> +
> +	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +
> +	if (set) {
> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +	} else {
> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
> +			sec_exec_control |=
> +					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +	}
> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
> +
> +	if (set) {
> +		for (msr = 0x800; msr <= 0x8ff; msr++)
> +			vmx_intercept_for_msr_read_x2apic(msr, false);
> +
> +		/* According SDM, in x2apic mode, the whole id reg is used.
> +		 * But in KVM, it only use the highest eight bits. Need to
> +		 * intercept it */
> +		vmx_intercept_for_msr_read_x2apic(0x802, true);
> +		/* TMCCT */
> +		vmx_intercept_for_msr_read_x2apic(0x839, true);
> +		/* TPR */
> +		vmx_intercept_for_msr_write_x2apic(0x808, false);
> +	}

Why not disable write intercept for all MSRs which represent APIC registers
that are virtualized? Why TPR is special?



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 21, 2013, 8:21 p.m. UTC | #4
On Mon, Jan 21, 2013 at 05:59:07PM -0200, Marcelo Tosatti wrote:
> On Wed, Jan 16, 2013 at 06:21:11PM +0800, Yang Zhang wrote:
> > From: Yang Zhang <yang.z.zhang@Intel.com>
> > 
> > basically to benefit from apicv, we need to enable virtualized x2apic mode.
> > Currently, we only enable it when guest is really using x2apic.
> > 
> > Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic:
> >     0x800 - 0x8ff: no read intercept for apicv register virtualization,
> >                    except APIC ID and TMCCT which need software's assistance to
> > 		   get right value.
> > 
> > Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> > Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |    1 +
> >  arch/x86/include/asm/vmx.h      |    1 +
> >  arch/x86/kvm/lapic.c            |   20 ++--
> >  arch/x86/kvm/lapic.h            |    5 +
> >  arch/x86/kvm/svm.c              |    6 +
> >  arch/x86/kvm/vmx.c              |  204 +++++++++++++++++++++++++++++++++++----
> >  6 files changed, 209 insertions(+), 28 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index c431b33..35aa8e6 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -697,6 +697,7 @@ struct kvm_x86_ops {
> >  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
> >  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
> >  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> > +	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
> >  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> >  	int (*get_tdp_level)(void);
> >  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> > index 44c3f7e..0a54df0 100644
> > --- a/arch/x86/include/asm/vmx.h
> > +++ b/arch/x86/include/asm/vmx.h
> > @@ -139,6 +139,7 @@
> >  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
> >  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
> >  #define SECONDARY_EXEC_RDTSCP			0x00000008
> > +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
> >  #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
> >  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
> >  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 0664c13..f39aee3 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -140,11 +140,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
> >  	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> >  	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
> >  
> > -static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> > -{
> > -	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> > -}
> > -
> >  static inline int kvm_apic_id(struct kvm_lapic *apic)
> >  {
> >  	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> > @@ -1323,12 +1318,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> >  	if (!kvm_vcpu_is_bsp(apic->vcpu))
> >  		value &= ~MSR_IA32_APICBASE_BSP;
> >  
> > -	vcpu->arch.apic_base = value;
> > -	if (apic_x2apic_mode(apic)) {
> > -		u32 id = kvm_apic_id(apic);
> > -		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> > -		kvm_apic_set_ldr(apic, ldr);
> > +	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
> > +		if (value & X2APIC_ENABLE) {
> > +			u32 id = kvm_apic_id(apic);
> > +			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> > +			kvm_apic_set_ldr(apic, ldr);
> > +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> > +		} else
> > +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> >  	}
> > +
> > +	vcpu->arch.apic_base = value;
> 
> Simpler to have
> 
> if (apic_x2apic_mode(apic)) {
> 	...
> 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> } else {
> 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> }
> 
This will not work during cpu init. That was discussed on one of
the previous iterations of the patch. When this code is called during
vcpu init vmcs is not loaded yet so set_virtual_x2apic_mode() cannot
write into it.

> Also it must be done after assignment of vcpu->arch.apic_base (this
> patch has vcpu->arch.apic_base being read from
> ->set_virtual_x2apic_mode() path).
> 
> > +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> > +{
> > +	unsigned long *msr_bitmap;
> > +
> > +	if (apic_x2apic_mode(vcpu->arch.apic))
> 
> vcpu->arch.apic can be NULL.
> 
> > +static void vmx_intercept_for_msr_read_x2apic(u32 msr, bool set)
> > +{
> > +	if (set) {
> > +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> > +				msr, MSR_TYPE_R);
> > +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> > +				msr, MSR_TYPE_R);
> > +	} else {
> > +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> > +				msr, MSR_TYPE_R);
> > +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> > +				msr, MSR_TYPE_R);
> > +	}
> > +}
> 
> Please retain the enable_intercept/disable_intercept naming in the
> function name, instead of a set parameter.
> 
> > +static void vmx_intercept_for_msr_write_x2apic(u32 msr, bool set)
> > +{
> > +	if (set) {
> > +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> > +				msr, MSR_TYPE_W);
> > +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> > +				msr, MSR_TYPE_W);
> > +	} else {
> > +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> > +				msr, MSR_TYPE_W);
> > +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> > +				msr, MSR_TYPE_W);
> > +	}
> >  }
> 
> Same here.
> 
> > @@ -3848,6 +3950,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
> >  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> >  	if (!enable_apicv_reg)
> >  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> > +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >  	return exec_control;
> 
> Unconditionally disabling SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE? Its
> awkward.
> 
That is how vmx_secondary_exec_control works though. It takes what can
be set in secondary exec control and drops what should not be set there.

> > +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +
> > +	/* There is not point to enable virtualize x2apic without enable
> > +	 * apicv*/
> > +	if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
> > +		return;
> > +
> > +	if (set) {
> > +		exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> > +		/* virtualize x2apic mode relies on tpr shadow */
> > +		if (!(exec_control & CPU_BASED_TPR_SHADOW))
> > +			return;
> > +	}
> > +
> > +	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> > +
> > +	if (set) {
> > +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> > +		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> > +	} else {
> > +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> > +		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
> > +			sec_exec_control |=
> > +					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> > +	}
> > +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
> > +
> > +	if (set) {
> > +		for (msr = 0x800; msr <= 0x8ff; msr++)
> > +			vmx_intercept_for_msr_read_x2apic(msr, false);
> > +
> > +		/* According SDM, in x2apic mode, the whole id reg is used.
> > +		 * But in KVM, it only use the highest eight bits. Need to
> > +		 * intercept it */
> > +		vmx_intercept_for_msr_read_x2apic(0x802, true);
> > +		/* TMCCT */
> > +		vmx_intercept_for_msr_read_x2apic(0x839, true);
> > +		/* TPR */
> > +		vmx_intercept_for_msr_write_x2apic(0x808, false);
> > +	}
> 
> Why not disable write intercept for all MSRs which represent APIC registers
> that are virtualized? Why TPR is special?
> 
This patch goes before vid is enabled. At this point only TPR is
vitalized. If APIC_WRITE exit will be generated on unhandled MSR write
then we can disable intercept for all x2apic MSRs here.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Jan. 21, 2013, 9:21 p.m. UTC | #5
On Mon, Jan 21, 2013 at 10:21:14PM +0200, Gleb Natapov wrote:
> > >  	}
> > > +
> > > +	vcpu->arch.apic_base = value;
> > 
> > Simpler to have
> > 
> > if (apic_x2apic_mode(apic)) {
> > 	...
> > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> > } else {
> > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> > }
> > 
> This will not work during cpu init. That was discussed on one of
> the previous iterations of the patch. When this code is called during
> vcpu init vmcs is not loaded yet so set_virtual_x2apic_mode() cannot
> write into it.

Are you saying that the logic to write on bit value change is due to 
ordering with cpu init or that the callback is at the wrong place?

> > 
> > Why not disable write intercept for all MSRs which represent APIC registers
> > that are virtualized? Why TPR is special?
> > 
> This patch goes before vid is enabled. At this point only TPR is
> vitalized. If APIC_WRITE exit will be generated on unhandled MSR write
> then we can disable intercept for all x2apic MSRs here.

-ENOPARSE, please be more verbose. "unhandled MSR write" ?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 21, 2013, 9:34 p.m. UTC | #6
On Mon, Jan 21, 2013 at 07:21:13PM -0200, Marcelo Tosatti wrote:
> On Mon, Jan 21, 2013 at 10:21:14PM +0200, Gleb Natapov wrote:
> > > >  	}
> > > > +
> > > > +	vcpu->arch.apic_base = value;
> > > 
> > > Simpler to have
> > > 
> > > if (apic_x2apic_mode(apic)) {
> > > 	...
> > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> > > } else {
> > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> > > }
> > > 
> > This will not work during cpu init. That was discussed on one of
> > the previous iterations of the patch. When this code is called during
> > vcpu init vmcs is not loaded yet so set_virtual_x2apic_mode() cannot
> > write into it.
> 
> Are you saying that the logic to write on bit value change is due to 
> ordering with cpu init or that the callback is at the wrong place?
> 
The logic is because of ordering with cpu init.

> > > 
> > > Why not disable write intercept for all MSRs which represent APIC registers
> > > that are virtualized? Why TPR is special?
> > > 
> > This patch goes before vid is enabled. At this point only TPR is
> > vitalized. If APIC_WRITE exit will be generated on unhandled MSR write
> > then we can disable intercept for all x2apic MSRs here.
> 
> -ENOPARSE, please be more verbose. "unhandled MSR write" ?
By unhandled I mean unintercepted. Write to x2apic MSR will either
generate MSR write exit if msr bitmap says so and then x2apic MSR will
be handled just like today, or, if we disable intercept, it will
generate APIC_WRITE exit (need to consult SDM here, not sure about it).
One is not really preferable to the other.
 
--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Jan. 21, 2013, 10:16 p.m. UTC | #7
On Mon, Jan 21, 2013 at 11:34:20PM +0200, Gleb Natapov wrote:
> On Mon, Jan 21, 2013 at 07:21:13PM -0200, Marcelo Tosatti wrote:
> > On Mon, Jan 21, 2013 at 10:21:14PM +0200, Gleb Natapov wrote:
> > > > >  	}
> > > > > +
> > > > > +	vcpu->arch.apic_base = value;
> > > > 
> > > > Simpler to have
> > > > 
> > > > if (apic_x2apic_mode(apic)) {
> > > > 	...
> > > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> > > > } else {
> > > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> > > > }
> > > > 
> > > This will not work during cpu init. That was discussed on one of
> > > the previous iterations of the patch. When this code is called during
> > > vcpu init vmcs is not loaded yet so set_virtual_x2apic_mode() cannot
> > > write into it.
> > 
> > Are you saying that the logic to write on bit value change is due to 
> > ordering with cpu init or that the callback is at the wrong place?
> > 
> The logic is because of ordering with cpu init.

OK. Still must move this conditional callback after assignment of apic_base.

> > > > Why not disable write intercept for all MSRs which represent APIC registers
> > > > that are virtualized? Why TPR is special?
> > > > 
> > > This patch goes before vid is enabled. At this point only TPR is
> > > vitalized. If APIC_WRITE exit will be generated on unhandled MSR write
> > > then we can disable intercept for all x2apic MSRs here.
> > 
> > -ENOPARSE, please be more verbose. "unhandled MSR write" ?
> By unhandled I mean unintercepted. Write to x2apic MSR will either
> generate MSR write exit if msr bitmap says so and then x2apic MSR will
> be handled just like today, or, if we disable intercept, it will
> generate APIC_WRITE exit (need to consult SDM here, not sure about it).
> One is not really preferable to the other.

It will generate APIC_WRITE, for example, if due to EOI virtualization
exiting.

The question is, why is intercept for EOI MSR address (0x80B) not being
disabled here, while TPR is? I don't see intercept disabled by other
patches either.

APIC_WRITE exit (conditional, see EOI virtualization page) is 
preferable to MSR write exit (unconditional).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Jan. 22, 2013, 12:33 a.m. UTC | #8
On Mon, Jan 21, 2013 at 08:16:18PM -0200, Marcelo Tosatti wrote:
> On Mon, Jan 21, 2013 at 11:34:20PM +0200, Gleb Natapov wrote:
> > On Mon, Jan 21, 2013 at 07:21:13PM -0200, Marcelo Tosatti wrote:
> > > On Mon, Jan 21, 2013 at 10:21:14PM +0200, Gleb Natapov wrote:
> > > > > >  	}
> > > > > > +
> > > > > > +	vcpu->arch.apic_base = value;
> > > > > 
> > > > > Simpler to have
> > > > > 
> > > > > if (apic_x2apic_mode(apic)) {
> > > > > 	...
> > > > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> > > > > } else {
> > > > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> > > > > }
> > > > > 
> > > > This will not work during cpu init. That was discussed on one of
> > > > the previous iterations of the patch. When this code is called during
> > > > vcpu init vmcs is not loaded yet so set_virtual_x2apic_mode() cannot
> > > > write into it.
> > > 
> > > Are you saying that the logic to write on bit value change is due to 
> > > ordering with cpu init or that the callback is at the wrong place?
> > > 
> > The logic is because of ordering with cpu init.
> 
> OK. Still must move this conditional callback after assignment of apic_base.
> 
> > > > > Why not disable write intercept for all MSRs which represent APIC registers
> > > > > that are virtualized? Why TPR is special?
> > > > > 
> > > > This patch goes before vid is enabled. At this point only TPR is
> > > > vitalized. If APIC_WRITE exit will be generated on unhandled MSR write
> > > > then we can disable intercept for all x2apic MSRs here.
> > > 
> > > -ENOPARSE, please be more verbose. "unhandled MSR write" ?
> > By unhandled I mean unintercepted. Write to x2apic MSR will either
> > generate MSR write exit if msr bitmap says so and then x2apic MSR will
> > be handled just like today, or, if we disable intercept, it will
> > generate APIC_WRITE exit (need to consult SDM here, not sure about it).
> > One is not really preferable to the other.
> 
> It will generate APIC_WRITE, for example, if due to EOI virtualization
> exiting.

Err, no, EOI induced vmexit is due to EOI virtualization.

> The question is, why is intercept for EOI MSR address (0x80B) not being
> disabled here, while TPR is? I don't see intercept disabled by other
> patches either.

Point still valid: why intercept for EOI MSR address not being disabled?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 22, 2013, 3:37 a.m. UTC | #9
Marcelo Tosatti wrote on 2013-01-22:
> On Mon, Jan 21, 2013 at 08:16:18PM -0200, Marcelo Tosatti wrote:
>> On Mon, Jan 21, 2013 at 11:34:20PM +0200, Gleb Natapov wrote:
>>> On Mon, Jan 21, 2013 at 07:21:13PM -0200, Marcelo Tosatti wrote:
>>>> On Mon, Jan 21, 2013 at 10:21:14PM +0200, Gleb Natapov wrote:
>>>>>>>  	}
>>>>>>> +
>>>>>>> +	vcpu->arch.apic_base = value;
>>>>>> 
>>>>>> Simpler to have
>>>>>> 
>>>>>> if (apic_x2apic_mode(apic)) {
>>>>>> 	...
>>>>>> 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
>>>>>> } else {
>>>>>> 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
>>>>>> }
>>>>>> 
>>>>> This will not work during cpu init. That was discussed on one of
>>>>> the previous iterations of the patch. When this code is called during
>>>>> vcpu init vmcs is not loaded yet so set_virtual_x2apic_mode() cannot
>>>>> write into it.
>>>> 
>>>> Are you saying that the logic to write on bit value change is due to
>>>> ordering with cpu init or that the callback is at the wrong place?
>>>> 
>>> The logic is because of ordering with cpu init.
>> 
>> OK. Still must move this conditional callback after assignment of apic_base.
You are correct. will move it in next patch.

>>>>>> Why not disable write intercept for all MSRs which represent APIC
>>>>>> registers that are virtualized? Why TPR is special?
>>>>>> 
>>>>> This patch goes before vid is enabled. At this point only TPR is
>>>>> vitalized. If APIC_WRITE exit will be generated on unhandled MSR write
>>>>> then we can disable intercept for all x2apic MSRs here.
>>>> 
>>>> -ENOPARSE, please be more verbose. "unhandled MSR write" ?
>>> By unhandled I mean unintercepted. Write to x2apic MSR will either
>>> generate MSR write exit if msr bitmap says so and then x2apic MSR will
>>> be handled just like today, or, if we disable intercept, it will
>>> generate APIC_WRITE exit (need to consult SDM here, not sure about it).
>>> One is not really preferable to the other.
>> 
>> It will generate APIC_WRITE, for example, if due to EOI virtualization
>> exiting.
> 
> Err, no, EOI induced vmexit is due to EOI virtualization.
> 
>> The question is, why is intercept for EOI MSR address (0x80B) not being
>> disabled here, while TPR is? I don't see intercept disabled by other
>> patches either.
TPR shadow is required by virtual x2apic mode. So if enabled virtual x2apic mode, that means TPR shadow is also enabled, then we can disable TPR msr intercept.

> Point still valid: why intercept for EOI MSR address not being disabled?
Please see the third patch. Only vid is enabled then we will disable eoi msr intercept and self ipi.

@@ -6249,10 +6286,138 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
                vmx_intercept_for_msr_read_x2apic(0x839, true);
                /* TPR */
                vmx_intercept_for_msr_write_x2apic(0x808, false);
+               /* EOI */
+               vmx_intercept_for_msr_write_x2apic(0x80b, false);
+               /* SELF-IPI */
+               vmx_intercept_for_msr_write_x2apic(0x83f, false);
        }
        vmx_set_msr_bitmap(vcpu);

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 22, 2013, 9:42 a.m. UTC | #10
On Mon, Jan 21, 2013 at 08:16:18PM -0200, Marcelo Tosatti wrote:
> On Mon, Jan 21, 2013 at 11:34:20PM +0200, Gleb Natapov wrote:
> > On Mon, Jan 21, 2013 at 07:21:13PM -0200, Marcelo Tosatti wrote:
> > > On Mon, Jan 21, 2013 at 10:21:14PM +0200, Gleb Natapov wrote:
> > > > > >  	}
> > > > > > +
> > > > > > +	vcpu->arch.apic_base = value;
> > > > > 
> > > > > Simpler to have
> > > > > 
> > > > > if (apic_x2apic_mode(apic)) {
> > > > > 	...
> > > > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> > > > > } else {
> > > > > 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> > > > > }
> > > > > 
> > > > This will not work during cpu init. That was discussed on one of
> > > > the previous iterations of the patch. When this code is called during
> > > > vcpu init vmcs is not loaded yet so set_virtual_x2apic_mode() cannot
> > > > write into it.
> > > 
> > > Are you saying that the logic to write on bit value change is due to 
> > > ordering with cpu init or that the callback is at the wrong place?
> > > 
> > The logic is because of ordering with cpu init.
> 
> OK. Still must move this conditional callback after assignment of apic_base.
> 
Sure.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 22, 2013, 9:45 a.m. UTC | #11
On Mon, Jan 21, 2013 at 10:33:46PM -0200, Marcelo Tosatti wrote:
> > The question is, why is intercept for EOI MSR address (0x80B) not being
> > disabled here, while TPR is? I don't see intercept disabled by other
> > patches either.
> 
> Point still valid: why intercept for EOI MSR address not being disabled?
Yang sent two version of the third patch. Second one disabled intercept
for EOI MSR. Ignore the first one.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 22, 2013, 12:21 p.m. UTC | #12
Marcelo Tosatti wrote on 2013-01-22:
> On Wed, Jan 16, 2013 at 06:21:11PM +0800, Yang Zhang wrote:
>> From: Yang Zhang <yang.z.zhang@Intel.com>
>> 
>> basically to benefit from apicv, we need to enable virtualized x2apic mode.
>> Currently, we only enable it when guest is really using x2apic.
>> 
>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
> x2apic:
>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>                    except APIC ID and TMCCT which need software's
> assistance to
>> 		   get right value.
>> 
>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    1 + arch/x86/include/asm/vmx.h   
>>    |    1 + arch/x86/kvm/lapic.c            |   20 ++--
>>  arch/x86/kvm/lapic.h            |    5 + arch/x86/kvm/svm.c           
>>    |    6 + arch/x86/kvm/vmx.c              |  204
>>  +++++++++++++++++++++++++++++++++++---- 6 files changed, 209
>>  insertions(+), 28 deletions(-)
>> diff --git a/arch/x86/include/asm/kvm_host.h
>> b/arch/x86/include/asm/kvm_host.h index c431b33..35aa8e6 100644 ---
>> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -697,6 +697,7 @@ struct kvm_x86_ops {
>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 	void
>>  (*enable_irq_window)(struct kvm_vcpu *vcpu); 	void
>>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>>  +	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 	int
>>  (*get_tdp_level)(void); 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu,
>>  gfn_t gfn, bool is_mmio);
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 44c3f7e..0a54df0 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -139,6 +139,7 @@
>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
>>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
>>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
>>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 0664c13..f39aee3 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -140,11 +140,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
>>  	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
>>  	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
>> -static inline int apic_x2apic_mode(struct kvm_lapic *apic)
>> -{
>> -	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
>> -}
>> -
>>  static inline int kvm_apic_id(struct kvm_lapic *apic)
>>  {
>>  	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
>> @@ -1323,12 +1318,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu,
> u64 value)
>>  	if (!kvm_vcpu_is_bsp(apic->vcpu))
>>  		value &= ~MSR_IA32_APICBASE_BSP;
>> -	vcpu->arch.apic_base = value;
>> -	if (apic_x2apic_mode(apic)) {
>> -		u32 id = kvm_apic_id(apic);
>> -		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>> -		kvm_apic_set_ldr(apic, ldr);
>> +	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
>> +		if (value & X2APIC_ENABLE) {
>> +			u32 id = kvm_apic_id(apic);
>> +			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>> +			kvm_apic_set_ldr(apic, ldr);
>> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
>> +		} else
>> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
>>  	}
>> +
>> +	vcpu->arch.apic_base = value;
> 
> Simpler to have
> 
> if (apic_x2apic_mode(apic)) {
> 	...
> 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> } else {
> 	kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> }
> 
> Also it must be done after assignment of vcpu->arch.apic_base (this
> patch has vcpu->arch.apic_base being read from
> ->set_virtual_x2apic_mode() path).
> 
>> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long *msr_bitmap;
>> +
>> +	if (apic_x2apic_mode(vcpu->arch.apic))
> 
> vcpu->arch.apic can be NULL.
Actually, call apic_x2apic_mode to check whether use x2apic msr bitmap is wrong.
VCPU uses x2apic but it may not set virtual x2apic mode bit due to TPR shadow not enabled or irqchip not in kernel. Check the virtual x2apic mode bit in vmcs directly should be a better choice. How about the follow code:

static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
{
        unsigned long *msr_bitmap;

        if (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
                else
                        msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
        else
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode;
                else
                        msr_bitmap = vmx_msr_bitmap_legacy;

        vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
}

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 22, 2013, 3:55 p.m. UTC | #13
On Tue, Jan 22, 2013 at 12:21:47PM +0000, Zhang, Yang Z wrote:
> >> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> >> +{
> >> +	unsigned long *msr_bitmap;
> >> +
> >> +	if (apic_x2apic_mode(vcpu->arch.apic))
> > 
> > vcpu->arch.apic can be NULL.
> Actually, call apic_x2apic_mode to check whether use x2apic msr bitmap is wrong.
> VCPU uses x2apic but it may not set virtual x2apic mode bit due to TPR shadow not enabled or irqchip not in kernel. Check the virtual x2apic mode bit in vmcs directly should be a better choice. How about the follow code:
> 
If TPR shadow it not enabled vmx_msr_bitmap_.*x2apic bitmap will have x2apic MSRs intercepted.

> static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> {
>         unsigned long *msr_bitmap;
> 
>         if (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)
>                 if (is_long_mode(vcpu))
>                         msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
>                 else
>                         msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
>         else
>                 if (is_long_mode(vcpu))
>                         msr_bitmap = vmx_msr_bitmap_longmode;
>                 else
>                         msr_bitmap = vmx_msr_bitmap_legacy;
> 
>         vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
> }
> 
> Best regards,
> Yang
> 

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Jan. 22, 2013, 11:13 p.m. UTC | #14
On Tue, Jan 22, 2013 at 05:55:53PM +0200, Gleb Natapov wrote:
> On Tue, Jan 22, 2013 at 12:21:47PM +0000, Zhang, Yang Z wrote:
> > >> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> > >> +{
> > >> +	unsigned long *msr_bitmap;
> > >> +
> > >> +	if (apic_x2apic_mode(vcpu->arch.apic))
> > > 
> > > vcpu->arch.apic can be NULL.
> > Actually, call apic_x2apic_mode to check whether use x2apic msr bitmap is wrong.
> > VCPU uses x2apic but it may not set virtual x2apic mode bit due to TPR shadow not enabled or irqchip not in kernel. Check the virtual x2apic mode bit in vmcs directly should be a better choice. How about the follow code:
> > 
> If TPR shadow it not enabled vmx_msr_bitmap_.*x2apic bitmap will have x2apic MSRs intercepted.

And what is the problem? APIC register virt depends on TPR shadow.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 23, 2013, 12:35 a.m. UTC | #15
Marcelo Tosatti wrote on 2013-01-23:
> On Tue, Jan 22, 2013 at 05:55:53PM +0200, Gleb Natapov wrote:
>> On Tue, Jan 22, 2013 at 12:21:47PM +0000, Zhang, Yang Z wrote:
>>>>> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	unsigned long *msr_bitmap;
>>>>> +
>>>>> +	if (apic_x2apic_mode(vcpu->arch.apic))
>>>> 
>>>> vcpu->arch.apic can be NULL.
>>> Actually, call apic_x2apic_mode to check whether use x2apic msr bitmap
>>> is wrong. VCPU uses x2apic but it may not set virtual x2apic mode bit
>>> due to TPR
> shadow not enabled or irqchip not in kernel. Check the virtual x2apic mode bit in
> vmcs directly should be a better choice. How about the follow code:
>>> 
>> If TPR shadow it not enabled vmx_msr_bitmap_.*x2apic bitmap will have
> x2apic MSRs intercepted.
Right. So check virtual x2apic mode bit also covers the TPR shadow check. Or else, we need two check: one for apic mode and one for TPR shadow.

> And what is the problem? APIC register virt depends on TPR shadow.
No problem. The new implementation is more reasonable and needn't to do additional check of TPR shadow, and will not touch apic_base.



Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 23, 2013, 10:29 a.m. UTC | #16
On Tue, Jan 22, 2013 at 09:13:06PM -0200, Marcelo Tosatti wrote:
> On Tue, Jan 22, 2013 at 05:55:53PM +0200, Gleb Natapov wrote:
> > On Tue, Jan 22, 2013 at 12:21:47PM +0000, Zhang, Yang Z wrote:
> > > >> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> > > >> +{
> > > >> +	unsigned long *msr_bitmap;
> > > >> +
> > > >> +	if (apic_x2apic_mode(vcpu->arch.apic))
> > > > 
> > > > vcpu->arch.apic can be NULL.
> > > Actually, call apic_x2apic_mode to check whether use x2apic msr bitmap is wrong.
> > > VCPU uses x2apic but it may not set virtual x2apic mode bit due to TPR shadow not enabled or irqchip not in kernel. Check the virtual x2apic mode bit in vmcs directly should be a better choice. How about the follow code:
> > > 
> > If TPR shadow it not enabled vmx_msr_bitmap_.*x2apic bitmap will have x2apic MSRs intercepted.
> 
> And what is the problem? APIC register virt depends on TPR shadow.
No problem. I am saying that it is safe to set exit bitmap to
vmx_msr_bitmap_.*x2apic in this case, so no need to check vmcs,
apic_x2apic_mode() is sufficient.
--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c431b33..35aa8e6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -697,6 +697,7 @@  struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 44c3f7e..0a54df0 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -139,6 +139,7 @@ 
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_RDTSCP			0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0664c13..f39aee3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -140,11 +140,6 @@  static inline int apic_enabled(struct kvm_lapic *apic)
 	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
 	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
 
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
-	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
 static inline int kvm_apic_id(struct kvm_lapic *apic)
 {
 	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
@@ -1323,12 +1318,17 @@  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	if (!kvm_vcpu_is_bsp(apic->vcpu))
 		value &= ~MSR_IA32_APICBASE_BSP;
 
-	vcpu->arch.apic_base = value;
-	if (apic_x2apic_mode(apic)) {
-		u32 id = kvm_apic_id(apic);
-		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
-		kvm_apic_set_ldr(apic, ldr);
+	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
+		if (value & X2APIC_ENABLE) {
+			u32 id = kvm_apic_id(apic);
+			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+			kvm_apic_set_ldr(apic, ldr);
+			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
+		} else
+			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
 	}
+
+	vcpu->arch.apic_base = value;
 	apic->base_address = apic->vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9a8ee22..22a5397 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -126,4 +126,9 @@  static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 	return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
 }
 
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd..38407e9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,11 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
+static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+	return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4295,7 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0403634..526955d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -643,6 +643,8 @@  static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic;
 
 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -767,6 +769,12 @@  static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }
 
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
 static inline bool cpu_has_vmx_apic_register_virt(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1830,6 +1838,24 @@  static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 	vmx->guest_msrs[from] = tmp;
 }
 
+static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	unsigned long *msr_bitmap;
+
+	if (apic_x2apic_mode(vcpu->arch.apic))
+		if (is_long_mode(vcpu))
+			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+		else
+			msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+	else
+		if (is_long_mode(vcpu))
+			msr_bitmap = vmx_msr_bitmap_longmode;
+		else
+			msr_bitmap = vmx_msr_bitmap_legacy;
+
+	vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+}
+
 /*
  * Set up the vmcs to automatically save and restore system
  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@@ -1838,7 +1864,6 @@  static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
 	int save_nmsrs, index;
-	unsigned long *msr_bitmap;
 
 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
@@ -1870,14 +1895,8 @@  static void setup_msrs(struct vcpu_vmx *vmx)
 
 	vmx->save_nmsrs = save_nmsrs;
 
-	if (cpu_has_vmx_msr_bitmap()) {
-		if (is_long_mode(&vmx->vcpu))
-			msr_bitmap = vmx_msr_bitmap_longmode;
-		else
-			msr_bitmap = vmx_msr_bitmap_legacy;
-
-		vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-	}
+	if (cpu_has_vmx_msr_bitmap())
+		vmx_set_msr_bitmap(&vmx->vcpu);
 }
 
 /*
@@ -2543,6 +2562,7 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
 		min2 = 0;
 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 			SECONDARY_EXEC_WBINVD_EXITING |
 			SECONDARY_EXEC_ENABLE_VPID |
 			SECONDARY_EXEC_ENABLE_EPT |
@@ -3724,7 +3744,10 @@  static void free_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+#define MSR_TYPE_R	1
+#define MSR_TYPE_W	2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
 {
 	int f = sizeof(unsigned long);
 
@@ -3737,20 +3760,99 @@  static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 	 */
 	if (msr <= 0x1fff) {
-		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
-		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__clear_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__clear_bit(msr, msr_bitmap + 0x800 / f);
+
 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 		msr &= 0x1fff;
-		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
-		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__clear_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+	}
+}
+
+static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__set_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__set_bit(msr, msr_bitmap + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__set_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__set_bit(msr, msr_bitmap + 0xc00 / f);
+
 	}
 }
 
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
 	if (!longmode_only)
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_intercept_for_msr_read_x2apic(u32 msr, bool set)
+{
+	if (set) {
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_R);
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_R);
+	} else {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_R);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_R);
+	}
+}
+
+static void vmx_intercept_for_msr_write_x2apic(u32 msr, bool set)
+{
+	if (set) {
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_W);
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_W);
+	} else {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_W);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_W);
+	}
 }
 
 /*
@@ -3848,6 +3950,7 @@  static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 	if (!enable_apicv_reg)
 		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 	return exec_control;
 }
 
@@ -6103,6 +6206,53 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
+static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+	u32 exec_control, sec_exec_control;
+	int msr;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* There is not point to enable virtualize x2apic without enable
+	 * apicv*/
+	if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
+		return;
+
+	if (set) {
+		exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+		/* virtualize x2apic mode relies on tpr shadow */
+		if (!(exec_control & CPU_BASED_TPR_SHADOW))
+			return;
+	}
+
+	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+	if (set) {
+		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+	} else {
+		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+			sec_exec_control |=
+					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	}
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+	if (set) {
+		for (msr = 0x800; msr <= 0x8ff; msr++)
+			vmx_intercept_for_msr_read_x2apic(msr, false);
+
+		/* According SDM, in x2apic mode, the whole id reg is used.
+		 * But in KVM, it only use the highest eight bits. Need to
+		 * intercept it */
+		vmx_intercept_for_msr_read_x2apic(0x802, true);
+		/* TMCCT */
+		vmx_intercept_for_msr_read_x2apic(0x839, true);
+		/* TPR */
+		vmx_intercept_for_msr_write_x2apic(0x808, false);
+	}
+	vmx_set_msr_bitmap(vcpu);
+}
+
 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -7366,6 +7516,7 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
@@ -7419,11 +7570,19 @@  static int __init vmx_init(void)
 	if (!vmx_msr_bitmap_legacy)
 		goto out1;
 
+	vmx_msr_bitmap_legacy_x2apic =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_legacy_x2apic)
+		goto out2;
 
 	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap_longmode)
-		goto out2;
+		goto out3;
 
+	vmx_msr_bitmap_longmode_x2apic =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_longmode_x2apic)
+		goto out4;
 
 	/*
 	 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7456,6 +7615,11 @@  static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
+	memcpy(vmx_msr_bitmap_legacy_x2apic,
+			vmx_msr_bitmap_legacy, PAGE_SIZE);
+	memcpy(vmx_msr_bitmap_longmode_x2apic,
+			vmx_msr_bitmap_longmode, PAGE_SIZE);
+
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(0ull,
 			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
@@ -7468,8 +7632,10 @@  static int __init vmx_init(void)
 
 	return 0;
 
-out3:
+out4:
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
 out2:
 	free_page((unsigned long)vmx_msr_bitmap_legacy);
 out1:
@@ -7481,6 +7647,8 @@  out:
 
 static void __exit vmx_exit(void)
 {
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 	free_page((unsigned long)vmx_msr_bitmap_legacy);
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
 	free_page((unsigned long)vmx_io_bitmap_b);