[v10,2/3] x86, apicv: add virtual x2apic support

Message ID 1358147615-24216-3-git-send-email-yang.z.zhang@intel.com (mailing list archive)
State New, archived

Commit Message

Zhang, Yang Z Jan. 14, 2013, 7:13 a.m. UTC
From: Yang Zhang <yang.z.zhang@Intel.com>

Basically, to benefit from apicv, we need to enable virtualized x2apic mode.
Currently, we only enable it when the guest is actually using x2apic.

Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest enables x2apic:
    0x800 - 0x8ff: no read intercept for apicv register virtualization,
                   except APIC ID and TMCCT.
    APIC ID and TMCCT: need software's assistance to get the right value.

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/include/asm/vmx.h      |    1 +
 arch/x86/kvm/lapic.c            |   15 +++-
 arch/x86/kvm/svm.c              |    6 ++
 arch/x86/kvm/vmx.c              |  162 +++++++++++++++++++++++++++++++++++++--
 5 files changed, 173 insertions(+), 12 deletions(-)
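
For context, each MSR in the 0x800 - 0x8ff range maps to the xAPIC register
at offset (msr - 0x800) << 4 in the APIC page, which is why clearing read
intercepts over the whole range covers the register file. A small
illustrative sketch (the helper below is ours, not part of the patch) of
why APIC ID and TMCCT land at MSRs 0x802 and 0x839:

/* Illustrative sketch only: xAPIC register offset -> x2apic MSR index. */
#define X2APIC_MSR_BASE	0x800

static unsigned int x2apic_msr(unsigned int xapic_reg_offset)
{
	return X2APIC_MSR_BASE + (xapic_reg_offset >> 4);
}

/*
 * x2apic_msr(0x20)  == 0x802  (APIC ID register)
 * x2apic_msr(0x390) == 0x839  (TMCCT, current timer count)
 *
 * Reads of these two must still cause an exit: the virtualized values
 * (the full 32-bit ID, a live timer count) cannot be served straight
 * from the virtual-APIC page, which is why the patch re-enables the
 * read intercept for 0x802 and 0x839.
 */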

Comments

Gleb Natapov Jan. 14, 2013, 10:48 a.m. UTC | #1
On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
> From: Yang Zhang <yang.z.zhang@Intel.com>
> 
> Basically, to benefit from apicv, we need to enable virtualized x2apic mode.
> Currently, we only enable it when the guest is actually using x2apic.
> 
> Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest
> enables x2apic:
>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>                    except APIC ID and TMCCT.
>     APIC ID and TMCCT: need software's assistance to get the right value.
> 
Actually, since the msr bitmap is shared between all vcpus, this will break
guests that do not enable x2apic.
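
For reference, the bitmaps the patch edits via vmx_intercept_for_msr_read/write
are file-scope pages in vmx.c, roughly sketched below (one page per mode,
shared host-wide):

/* Sketch: these are module-global pages in vmx.c, shared by every
 * vcpu of every VM on the host.  Clearing the x2apic read intercepts
 * in them when one guest enables x2apic therefore also drops those
 * intercepts for guests that never enabled it.
 */
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;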

> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/include/asm/vmx.h      |    1 +
>  arch/x86/kvm/lapic.c            |   15 +++-
>  arch/x86/kvm/svm.c              |    6 ++
>  arch/x86/kvm/vmx.c              |  162 +++++++++++++++++++++++++++++++++++++--
>  5 files changed, 173 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..35aa8e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -697,6 +697,7 @@ struct kvm_x86_ops {
>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> +	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>  	int (*get_tdp_level)(void);
>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 44c3f7e..0a54df0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -139,6 +139,7 @@
>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
>  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
>  #define SECONDARY_EXEC_RDTSCP			0x00000008
> +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
>  #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
>  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 0664c13..2ef5e2b 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1323,12 +1323,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>  	if (!kvm_vcpu_is_bsp(apic->vcpu))
>  		value &= ~MSR_IA32_APICBASE_BSP;
>  
> -	vcpu->arch.apic_base = value;
> -	if (apic_x2apic_mode(apic)) {
> -		u32 id = kvm_apic_id(apic);
> -		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> -		kvm_apic_set_ldr(apic, ldr);
> +	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
> +		if (value & X2APIC_ENABLE) {
> +			u32 id = kvm_apic_id(apic);
> +			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> +			kvm_apic_set_ldr(apic, ldr);
> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> +		} else
> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
>  	}
> +
> +	vcpu->arch.apic_base = value;
>  	apic->base_address = apic->vcpu->arch.apic_base &
>  			     MSR_IA32_APICBASE_BASE;
>  
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d29d3cd..38407e9 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
> +{
> +	return;
> +}
> +
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
>  
>  	.set_tss_addr = svm_set_tss_addr,
>  	.get_tdp_level = get_npt_level,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 0403634..847022e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -767,6 +767,12 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>  }
>  
> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
> +{
> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +}
> +
>  static inline bool cpu_has_vmx_apic_register_virt(void)
>  {
>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
> @@ -2543,6 +2549,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
>  		min2 = 0;
>  		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>  			SECONDARY_EXEC_WBINVD_EXITING |
>  			SECONDARY_EXEC_ENABLE_VPID |
>  			SECONDARY_EXEC_ENABLE_EPT |
> @@ -3724,7 +3731,45 @@ static void free_vpid(struct vcpu_vmx *vmx)
>  	spin_unlock(&vmx_vpid_lock);
>  }
>  
> -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> +#define MSR_TYPE_R	1
> +#define MSR_TYPE_W	2
> +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
> +						u32 msr, int type)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (!cpu_has_vmx_msr_bitmap())
> +		return;
> +
> +	/*
> +	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
> +	 * have the write-low and read-high bitmap offsets the wrong way round.
> +	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
> +	 */
> +	if (msr <= 0x1fff) {
> +		if (type & MSR_TYPE_R)
> +			/* read-low */
> +			__clear_bit(msr, msr_bitmap + 0x000 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-low */
> +			__clear_bit(msr, msr_bitmap + 0x800 / f);
> +
> +	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> +		msr &= 0x1fff;
> +		if (type & MSR_TYPE_R)
> +			/* read-high */
> +			__clear_bit(msr, msr_bitmap + 0x400 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-high */
> +			__clear_bit(msr, msr_bitmap + 0xc00 / f);
> +
> +	}
> +}
> +
> +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
> +						u32 msr, int type)
>  {
>  	int f = sizeof(unsigned long);
>  
> @@ -3737,20 +3782,75 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>  	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
>  	 */
>  	if (msr <= 0x1fff) {
> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
> +		if (type & MSR_TYPE_R)
> +			/* read-low */
> +			__set_bit(msr, msr_bitmap + 0x000 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-low */
> +			__set_bit(msr, msr_bitmap + 0x800 / f);
> +
>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>  		msr &= 0x1fff;
> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
> +		if (type & MSR_TYPE_R)
> +			/* read-high */
> +			__set_bit(msr, msr_bitmap + 0x400 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-high */
> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
> +
>  	}
>  }
>  
> +
>  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
>  {
>  	if (!longmode_only)
> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> +}
> +
> +static void vmx_intercept_for_msr_read(u32 msr, bool longmode_only,
> +					bool set)
> +{
> +	if (!longmode_only) {
> +		if (set)
> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_R);
> +		else
> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_R);
> +
> +	}
> +	if (set)
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_R);
> +	else
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_R);
> +}
> +
> +static void vmx_intercept_for_msr_write(u32 msr, bool longmode_only,
> +					bool set)
> +{
> +	if (!longmode_only) {
> +		if (set)
> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_W);
> +		else
> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_W);
> +
> +	}
> +	if (set)
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_W);
> +	else
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_W);
>  }
>  
>  /*
> @@ -3848,6 +3948,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>  	if (!enable_apicv_reg)
>  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>  	return exec_control;
>  }
>  
> @@ -6103,6 +6204,52 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  	vmcs_write32(TPR_THRESHOLD, irr);
>  }
>  
> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
> +{
> +	u32 exec_control, sec_exec_control;
> +	int msr;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	/* There is no point in enabling virtualized x2apic mode without
> +	 * apicv */
> +	if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
> +		return;
> +
> +	if (set) {
> +		exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +		/* virtualized x2apic mode relies on the TPR shadow */
> +		if (!(exec_control & CPU_BASED_TPR_SHADOW))
> +			return;
> +	}
> +
> +	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +
> +	if (set) {
> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +	} else {
> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
> +			sec_exec_control |=
> +					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +	}
> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
> +
> +	for (msr = 0x800; msr <= 0x8ff; msr++)
> +		vmx_intercept_for_msr_read(msr, false, !set);
> +
> +	if (set) {
> +		/* According to the SDM, in x2apic mode the whole id reg is
> +		 * used. But KVM only uses the highest eight bits, so we need
> +		 * to intercept reads of it. */
> +		vmx_intercept_for_msr_read(0x802, false, true);
> +		/* TMCCT */
> +		vmx_intercept_for_msr_read(0x839, false, true);
> +	}
> +	/* TPR */
> +	vmx_intercept_for_msr_write(0x808, false, !set);
> +}
> +
>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
>  {
>  	u32 exit_intr_info;
> @@ -7366,6 +7513,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
>  
>  	.set_tss_addr = vmx_set_tss_addr,
>  	.get_tdp_level = get_ept_level,
> -- 
> 1.7.1

--
			Gleb.
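
As an aside on the lapic.c hunk above: the LDR derivation follows the
x2apic logical-ID layout (cluster in bits 31:16, a one-hot bit in bits
15:0). A standalone worked example using the same formula:

/* Same formula as in the kvm_lapic_set_base() hunk above. */
static unsigned int x2apic_ldr(unsigned int id)
{
	return ((id >> 4) << 16) | (1u << (id & 0xf));
}

/*
 * id = 0x21: cluster = 0x21 >> 4 = 2, bit = 0x21 & 0xf = 1,
 * so ldr = (2 << 16) | (1 << 1) = 0x00020002.
 */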
Zhang, Yang Z Jan. 14, 2013, 11:01 a.m. UTC | #2
Gleb Natapov wrote on 2013-01-14:
> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
>> From: Yang Zhang <yang.z.zhang@Intel.com>
>> 
>> Basically, to benefit from apicv, we need to enable virtualized x2apic mode.
>> Currently, we only enable it when the guest is actually using x2apic.
>> 
>> Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest
>> enables x2apic:
>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>                    except APIC ID and TMCCT.
>>     APIC ID and TMCCT: need software's assistance to get the right value.
> Actually, since the msr bitmap is shared between all vcpus, this will break
> guests that do not enable x2apic.
I don't think this case will exist. It would break a real OS too.
 
>> [...]


Best regards,
Yang

Gleb Natapov Jan. 14, 2013, 11:05 a.m. UTC | #3
On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-14:
> > On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
> >> From: Yang Zhang <yang.z.zhang@Intel.com>
> >> 
> >> Basically, to benefit from apicv, we need to enable virtualized x2apic mode.
> >> Currently, we only enable it when the guest is actually using x2apic.
> >> 
> >> Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest
> >> enables x2apic:
> >>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
> >>                    except APIC ID and TMCCT.
> >>     APIC ID and TMCCT: need software's assistance to get the right value.
> > Actually, since the msr bitmap is shared between all vcpus, this will break
> > guests that do not enable x2apic.
> I don't think this case will exist. It would break a real OS too.
>  
Which case? One VM uses x2apic while another one does not? The bitmap is
shared between all vcpus of all VMs.

> >> [...]

--
			Gleb.
Zhang, Yang Z Jan. 14, 2013, 11:10 a.m. UTC | #4
Gleb Natapov wrote on 2013-01-14:
> On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-14:
>>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>> 
>>>> Basically, to benefit from apicv, we need to enable virtualized x2apic mode.
>>>> Currently, we only enable it when the guest is actually using x2apic.
>>>> 
>>>> Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest
>>>> enables x2apic:
>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>>>                    except APIC ID and TMCCT.
>>>>     APIC ID and TMCCT: need software's assistance to get the right value.
>>> Actually, since the msr bitmap is shared between all vcpus, this will break
>>> guests that do not enable x2apic.
>> I don't think this case will exist. It would break a real OS too.
>> 
> Which case? One VM uses x2apic while another one does not? The bitmap is
> shared between all vcpus of all VMs.
Sorry. I misread your comments.

Yes, it is really a problem. Maybe we need to use a per-VM msr bitmap instead of the global bitmap.
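
A rough sketch of that direction follows; the per-VM container and helper
names here are hypothetical, just to make the idea concrete:

/* Hypothetical sketch: per-VM msr bitmaps, so that one guest's x2apic
 * transition cannot leak intercept changes into other guests.
 */
struct vmx_kvm_state {			/* hypothetical per-VM container */
	unsigned long *msr_bitmap_legacy;
	unsigned long *msr_bitmap_longmode;
};

static int vmx_alloc_msr_bitmaps(struct vmx_kvm_state *kv)
{
	kv->msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
	kv->msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
	if (!kv->msr_bitmap_legacy || !kv->msr_bitmap_longmode) {
		free_page((unsigned long)kv->msr_bitmap_legacy);
		free_page((unsigned long)kv->msr_bitmap_longmode);
		return -ENOMEM;
	}
	/* Start from the host-wide defaults; diverge only when this VM
	 * flips x2apic on or off. */
	memcpy(kv->msr_bitmap_legacy, vmx_msr_bitmap_legacy, PAGE_SIZE);
	memcpy(kv->msr_bitmap_longmode, vmx_msr_bitmap_longmode, PAGE_SIZE);
	return 0;
}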
 
>>>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>> ---
>>>>  arch/x86/include/asm/kvm_host.h |    1 + arch/x86/include/asm/vmx.h
>>>>    |    1 + arch/x86/kvm/lapic.c            |   15 +++-
>>>>  arch/x86/kvm/svm.c              |    6 ++ arch/x86/kvm/vmx.c
>>>>     |  162 +++++++++++++++++++++++++++++++++++++-- 5 files
> changed, 173
>>>>  insertions(+), 12 deletions(-)
>>>> diff --git a/arch/x86/include/asm/kvm_host.h
>>>> b/arch/x86/include/asm/kvm_host.h index c431b33..35aa8e6 100644 ---
>>>> a/arch/x86/include/asm/kvm_host.h +++
>>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,7 @@ struct
>>>> kvm_x86_ops {
>>>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 	void
>>>>  (*enable_irq_window)(struct kvm_vcpu *vcpu); 	void
>>>>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>>>>  +	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
>>>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 	int
>>>>  (*get_tdp_level)(void); 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu,
>>>>  gfn_t gfn, bool is_mmio);
>>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>>> index 44c3f7e..0a54df0 100644
>>>> --- a/arch/x86/include/asm/vmx.h
>>>> +++ b/arch/x86/include/asm/vmx.h
>>>> @@ -139,6 +139,7 @@
>>>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
>>>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
>>>>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
>>>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
>>>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
>>>>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
>>>>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>>> index 0664c13..2ef5e2b 100644
>>>> --- a/arch/x86/kvm/lapic.c
>>>> +++ b/arch/x86/kvm/lapic.c
>>>> @@ -1323,12 +1323,17 @@ void kvm_lapic_set_base(struct kvm_vcpu
> *vcpu,
>>> u64 value)
>>>>  	if (!kvm_vcpu_is_bsp(apic->vcpu))
>>>>  		value &= ~MSR_IA32_APICBASE_BSP;
>>>> -	vcpu->arch.apic_base = value;
>>>> -	if (apic_x2apic_mode(apic)) {
>>>> -		u32 id = kvm_apic_id(apic);
>>>> -		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>> -		kvm_apic_set_ldr(apic, ldr);
>>>> +	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
>>>> +		if (value & X2APIC_ENABLE) {
>>>> +			u32 id = kvm_apic_id(apic);
>>>> +			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>> +			kvm_apic_set_ldr(apic, ldr);
>>>> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
>>>> +		} else
>>>> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
>>>>  	}
>>>> +
>>>> +	vcpu->arch.apic_base = value;
>>>>  	apic->base_address = apic->vcpu->arch.apic_base &
>>>>  			     MSR_IA32_APICBASE_BASE;
>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>> index d29d3cd..38407e9 100644
>>>> --- a/arch/x86/kvm/svm.c
>>>> +++ b/arch/x86/kvm/svm.c
>>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
> kvm_vcpu
>>> *vcpu, int tpr, int irr)
>>>>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>>>>  }
>>>> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool
>>>> set) +{ +	return; +} +
>>>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
>>>>  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
>>>>  svm_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>>>  update_cr8_intercept,
>>>> +	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
>>>> 
>>>>  	.set_tss_addr = svm_set_tss_addr,
>>>>  	.get_tdp_level = get_npt_level,
>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>> index 0403634..847022e 100644
>>>> --- a/arch/x86/kvm/vmx.c
>>>> +++ b/arch/x86/kvm/vmx.c
>>>> @@ -767,6 +767,12 @@ static inline bool
>>> cpu_has_vmx_virtualize_apic_accesses(void)
>>>>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>  }
>>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
>>>> +{
>>>> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>> +}
>>>> +
>>>>  static inline bool cpu_has_vmx_apic_register_virt(void)
>>>>  {
>>>>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>> @@ -2543,6 +2549,7 @@ static __init int setup_vmcs_config(struct
>>> vmcs_config *vmcs_conf)
>>>>  	if (_cpu_based_exec_control &
>>>>  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 		min2 = 0; 		opt2 =
>>>>  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>>>>  +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>>>>  			SECONDARY_EXEC_WBINVD_EXITING | 	SECONDARY_EXEC_ENABLE_VPID |
>>>>  			SECONDARY_EXEC_ENABLE_EPT | @@ -3724,7 +3731,45 @@ static void
>>>>  free_vpid(struct vcpu_vmx *vmx) 	spin_unlock(&vmx_vpid_lock); }
>>>> -static void __vmx_disable_intercept_for_msr(unsigned long
>>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R	1 +#define MSR_TYPE_W	2
>>>> +static void __vmx_disable_intercept_for_msr(unsigned long
>>>> *msr_bitmap, + 					u32 msr, int type) +{ +	int f = sizeof(unsigned
>>>> long); + +	if (!cpu_has_vmx_msr_bitmap()) +		return; + +	/* +	 * See
>>>> Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals +	 *
>>>> have the write-low and read-high bitmap offsets the wrong way round.
>>>> +	 * We can control MSRs 0x00000000-0x00001fff and
>>>> 0xc0000000-0xc0001fff. +	 */ +	if (msr <= 0x1fff) { +		if (type &
>>>> MSR_TYPE_R) +			/* read-low */ +			__clear_bit(msr, msr_bitmap +
>>>> 0x000 / f); + +		if (type & MSR_TYPE_W) +			/* write-low */ +
>>>> 	__clear_bit(msr, msr_bitmap + 0x800 / f); + +	} else if ((msr >=
>>>> 0xc0000000) && (msr <= 0xc0001fff)) { +		msr &= 0x1fff; +		if (type &
>>>> MSR_TYPE_R) + 	/* read-high */ +			__clear_bit(msr, msr_bitmap +
>>>> 0x400 / f); + +		if (type & MSR_TYPE_W) +			/* write-high */ +
>>>> 	__clear_bit(msr, msr_bitmap + 0xc00 / f); + +	} +} + +static void
>>>> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + 					u32
>>>> msr, int type)
>>>>  {
>>>>  	int f = sizeof(unsigned long);
>>>> @@ -3737,20 +3782,75 @@ static void
>>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>>>>  	 * We can control MSRs 0x00000000-0x00001fff and
>>>>  0xc0000000-0xc0001fff. 	 */ 	if (msr <= 0x1fff) {
>>>> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
>>>> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
>>>> +		if (type & MSR_TYPE_R)
>>>> +			/* read-low */
>>>> +			__set_bit(msr, msr_bitmap + 0x000 / f);
>>>> +
>>>> +		if (type & MSR_TYPE_W)
>>>> +			/* write-low */
>>>> +			__set_bit(msr, msr_bitmap + 0x800 / f);
>>>> +
>>>>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>>>>  		msr &= 0x1fff;
>>>> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
>>>> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
>>>> +		if (type & MSR_TYPE_R)
>>>> +			/* read-high */
>>>> +			__set_bit(msr, msr_bitmap + 0x400 / f);
>>>> +
>>>> +		if (type & MSR_TYPE_W)
>>>> +			/* write-high */
>>>> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
>>>> +
>>>>  	} } + static void vmx_disable_intercept_for_msr(u32 msr, bool
>>>>  longmode_only) { 	if (!longmode_only)
>>>> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
>>>> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +						msr,
>>>> MSR_TYPE_R | MSR_TYPE_W);
>>>> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>> +						msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void
>>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, +					bool
>>>> set) +{ +	if (!longmode_only) { +		if (set) +
>>>> 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_R); +		else +
>>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_R); + +	} +	if (set)
>>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_R); +	else
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_R); +} + +static void vmx_intercept_for_msr_write(u32 msr,
>>>> bool longmode_only, +					bool set) +{ +	if (!longmode_only) { +		if
>>>> (set) + 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
>>>> +					msr, MSR_TYPE_W); +		else +
>>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_W); + +	} +	if (set)
>>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_W); +	else
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_W);
>>>>  }
>>>>  
>>>>  /*
>>>> @@ -3848,6 +3948,7 @@ static u32 vmx_secondary_exec_control(struct
>>> vcpu_vmx *vmx)
>>>>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 	if
>>>>  (!enable_apicv_reg) 		exec_control &=
>>>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +	exec_control &=
>>>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 	return
> exec_control; }
>>>> @@ -6103,6 +6204,52 @@ static void update_cr8_intercept(struct
> kvm_vcpu
>>> *vcpu, int tpr, int irr)
>>>>  	vmcs_write32(TPR_THRESHOLD, irr);
>>>>  }
>>>> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool
>>>> set) +{ +	u32 exec_control, sec_exec_control; +	int msr; +	struct
>>>> vcpu_vmx *vmx = to_vmx(vcpu); + +	/* There is not point to enable
>>>> virtualize x2apic without enable +	 * apicv*/ +	if
>>>> (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) +
>>>> 	return; + +	if (set) { +		exec_control =
>>>> vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +		/* virtualize x2apic mode
>>>> relies on tpr shadow */ +		if (!(exec_control &
>>>> CPU_BASED_TPR_SHADOW)) +			return; +	} + +	sec_exec_control =
>>>> vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + +	if (set) {
>>>> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>> +		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +	}
>>>> else { +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>> +		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) +
>>>> 	sec_exec_control |= +					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>> +	} +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); +
>>>> +	for (msr = 0x800; msr <= 0x8ff; msr++) +
>>>> 	vmx_intercept_for_msr_read(msr, false, !set); + +	if (set) { +		/*
>>>> According SDM, in x2apic mode, the whole id reg is used. +		 * But in
>>>> KVM, it only use the highest eight bits. Need to +		 * intercept it*/
>>>> + 	vmx_intercept_for_msr_read(0x802, false, true); +		/* TMCCT */ +
>>>> 	vmx_intercept_for_msr_read(0x839, false, true); +	} +	/* TPR */
>>>> +	vmx_intercept_for_msr_write(0x808, false, !set); +} +
>>>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { 	u32
>>>>  exit_intr_info; @@ -7366,6 +7513,7 @@ static struct kvm_x86_ops
>>>>  vmx_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>>>  update_cr8_intercept,
>>>> +	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
>>>> 
>>>>  	.set_tss_addr = vmx_set_tss_addr,
>>>>  	.get_tdp_level = get_ept_level,
>>>> --
>>>> 1.7.1
>>> 
>>> --
>>> 			Gleb.
>> 
>> 
>> Best regards,
>> Yang
> 
> --
> 			Gleb.


Best regards,
Yang


Gleb Natapov Jan. 14, 2013, 11:16 a.m. UTC | #5
On Mon, Jan 14, 2013 at 11:10:26AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-14:
> > On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-01-14:
> >>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
> >>>> From: Yang Zhang <yang.z.zhang@Intel.com>
> >>>> 
> >>>> Basically, to benefit from apicv, we need to enable virtualized x2apic mode.
> >>>> Currently, we only enable it when the guest is actually using x2apic.
> >>>> 
> >>>> Also, clear the MSR bitmap for the corresponding x2apic MSRs when the guest
> >>>> enables x2apic:
> >>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
> >>>>                    except APIC ID and TMCCT.
> >>>>     APIC ID and TMCCT: need software's assistance to get the right value.
> >>> Actually, since the msr bitmap is shared between all vcpus, this will break
> >>> guests that do not enable x2apic.
> >> I don't think this case will exist. It would break a real OS too.
> >> 
> > Which case? One VM uses x2apic while another one does not? The bitmap is
> > shared between all vcpus of all VMs.
> Sorry. I misread your comments.
> 
I miswrote it. Forgot to include "of all VMs" there.

> Yes, it is really a problem. Maybe we need to use a per-VM msr bitmap instead of the global bitmap.
Are you sure CPUs cannot be in different modes during boot and SMP
initialization? Where does the spec say that?

>  
> >>>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> >>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >>>> ---
> >>>>  arch/x86/include/asm/kvm_host.h |    1 + arch/x86/include/asm/vmx.h
> >>>>    |    1 + arch/x86/kvm/lapic.c            |   15 +++-
> >>>>  arch/x86/kvm/svm.c              |    6 ++ arch/x86/kvm/vmx.c
> >>>>     |  162 +++++++++++++++++++++++++++++++++++++-- 5 files
> > changed, 173
> >>>>  insertions(+), 12 deletions(-)
> >>>> diff --git a/arch/x86/include/asm/kvm_host.h
> >>>> b/arch/x86/include/asm/kvm_host.h index c431b33..35aa8e6 100644 ---
> >>>> a/arch/x86/include/asm/kvm_host.h +++
> >>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,7 @@ struct
> >>>> kvm_x86_ops {
> >>>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 	void
> >>>>  (*enable_irq_window)(struct kvm_vcpu *vcpu); 	void
> >>>>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> >>>>  +	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
> >>>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 	int
> >>>>  (*get_tdp_level)(void); 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu,
> >>>>  gfn_t gfn, bool is_mmio);
> >>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> >>>> index 44c3f7e..0a54df0 100644
> >>>> --- a/arch/x86/include/asm/vmx.h
> >>>> +++ b/arch/x86/include/asm/vmx.h
> >>>> @@ -139,6 +139,7 @@
> >>>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
> >>>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
> >>>>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
> >>>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
> >>>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
> >>>>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
> >>>>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> >>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >>>> index 0664c13..2ef5e2b 100644
> >>>> --- a/arch/x86/kvm/lapic.c
> >>>> +++ b/arch/x86/kvm/lapic.c
> >>>> @@ -1323,12 +1323,17 @@ void kvm_lapic_set_base(struct kvm_vcpu
> > *vcpu,
> >>> u64 value)
> >>>>  	if (!kvm_vcpu_is_bsp(apic->vcpu))
> >>>>  		value &= ~MSR_IA32_APICBASE_BSP;
> >>>> -	vcpu->arch.apic_base = value;
> >>>> -	if (apic_x2apic_mode(apic)) {
> >>>> -		u32 id = kvm_apic_id(apic);
> >>>> -		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> >>>> -		kvm_apic_set_ldr(apic, ldr);
> >>>> +	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
> >>>> +		if (value & X2APIC_ENABLE) {
> >>>> +			u32 id = kvm_apic_id(apic);
> >>>> +			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> >>>> +			kvm_apic_set_ldr(apic, ldr);
> >>>> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> >>>> +		} else
> >>>> +			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> >>>>  	}
> >>>> +
> >>>> +	vcpu->arch.apic_base = value;
> >>>>  	apic->base_address = apic->vcpu->arch.apic_base &
> >>>>  			     MSR_IA32_APICBASE_BASE;
> >>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> >>>> index d29d3cd..38407e9 100644
> >>>> --- a/arch/x86/kvm/svm.c
> >>>> +++ b/arch/x86/kvm/svm.c
> >>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
> > kvm_vcpu
> >>> *vcpu, int tpr, int irr)
> >>>>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
> >>>>  }
> >>>> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool
> >>>> set) +{ +	return; +} +
> >>>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
> >>>>  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
> >>>>  svm_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
> >>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
> >>>>  update_cr8_intercept,
> >>>> +	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
> >>>> 
> >>>>  	.set_tss_addr = svm_set_tss_addr,
> >>>>  	.get_tdp_level = get_npt_level,
> >>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >>>> index 0403634..847022e 100644
> >>>> --- a/arch/x86/kvm/vmx.c
> >>>> +++ b/arch/x86/kvm/vmx.c
> >>>> @@ -767,6 +767,12 @@ static inline bool
> >>> cpu_has_vmx_virtualize_apic_accesses(void)
> >>>>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >>>>  }
> >>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
> >>>> +{
> >>>> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> >>>> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >>>> +}
> >>>> +
> >>>>  static inline bool cpu_has_vmx_apic_register_virt(void)
> >>>>  {
> >>>>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
> >>>> @@ -2543,6 +2549,7 @@ static __init int setup_vmcs_config(struct
> >>> vmcs_config *vmcs_conf)
> >>>>  	if (_cpu_based_exec_control &
> >>>>  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 		min2 = 0; 		opt2 =
> >>>>  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> >>>>  +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
> >>>>  			SECONDARY_EXEC_WBINVD_EXITING | 	SECONDARY_EXEC_ENABLE_VPID |
> >>>>  			SECONDARY_EXEC_ENABLE_EPT | @@ -3724,7 +3731,45 @@ static void
> >>>>  free_vpid(struct vcpu_vmx *vmx) 	spin_unlock(&vmx_vpid_lock); }
> >>>> -static void __vmx_disable_intercept_for_msr(unsigned long
> >>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R	1 +#define MSR_TYPE_W	2
> >>>> +static void __vmx_disable_intercept_for_msr(unsigned long
> >>>> *msr_bitmap, + 					u32 msr, int type) +{ +	int f = sizeof(unsigned
> >>>> long); + +	if (!cpu_has_vmx_msr_bitmap()) +		return; + +	/* +	 * See
> >>>> Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals +	 *
> >>>> have the write-low and read-high bitmap offsets the wrong way round.
> >>>> +	 * We can control MSRs 0x00000000-0x00001fff and
> >>>> 0xc0000000-0xc0001fff. +	 */ +	if (msr <= 0x1fff) { +		if (type &
> >>>> MSR_TYPE_R) +			/* read-low */ +			__clear_bit(msr, msr_bitmap +
> >>>> 0x000 / f); + +		if (type & MSR_TYPE_W) +			/* write-low */ +
> >>>> 	__clear_bit(msr, msr_bitmap + 0x800 / f); + +	} else if ((msr >=
> >>>> 0xc0000000) && (msr <= 0xc0001fff)) { +		msr &= 0x1fff; +		if (type &
> >>>> MSR_TYPE_R) + 	/* read-high */ +			__clear_bit(msr, msr_bitmap +
> >>>> 0x400 / f); + +		if (type & MSR_TYPE_W) +			/* write-high */ +
> >>>> 	__clear_bit(msr, msr_bitmap + 0xc00 / f); + +	} +} + +static void
> >>>> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + 					u32
> >>>> msr, int type)
> >>>>  {
> >>>>  	int f = sizeof(unsigned long);
> >>>> @@ -3737,20 +3782,75 @@ static void
> >>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> >>>>  	 * We can control MSRs 0x00000000-0x00001fff and
> >>>>  0xc0000000-0xc0001fff. 	 */ 	if (msr <= 0x1fff) {
> >>>> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
> >>>> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
> >>>> +		if (type & MSR_TYPE_R)
> >>>> +			/* read-low */
> >>>> +			__set_bit(msr, msr_bitmap + 0x000 / f);
> >>>> +
> >>>> +		if (type & MSR_TYPE_W)
> >>>> +			/* write-low */
> >>>> +			__set_bit(msr, msr_bitmap + 0x800 / f);
> >>>> +
> >>>>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> >>>>  		msr &= 0x1fff;
> >>>> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
> >>>> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
> >>>> +		if (type & MSR_TYPE_R)
> >>>> +			/* read-high */
> >>>> +			__set_bit(msr, msr_bitmap + 0x400 / f);
> >>>> +
> >>>> +		if (type & MSR_TYPE_W)
> >>>> +			/* write-high */
> >>>> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
> >>>> +
> >>>>  	} } + static void vmx_disable_intercept_for_msr(u32 msr, bool
> >>>>  longmode_only) { 	if (!longmode_only)
> >>>> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
> >>>> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
> >>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +						msr,
> >>>> MSR_TYPE_R | MSR_TYPE_W);
> >>>> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> >>>> +						msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void
> >>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, +					bool
> >>>> set) +{ +	if (!longmode_only) { +		if (set) +
> >>>> 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
> >>>> MSR_TYPE_R); +		else +
> >>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
> >>>> MSR_TYPE_R); + +	} +	if (set)
> >>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_R); +	else
> >>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_R); +} + +static void vmx_intercept_for_msr_write(u32 msr,
> >>>> bool longmode_only, +					bool set) +{ +	if (!longmode_only) { +		if
> >>>> (set) + 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> >>>> +					msr, MSR_TYPE_W); +		else +
> >>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
> >>>> MSR_TYPE_W); + +	} +	if (set)
> >>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_W); +	else
> >>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_W);
> >>>>  }
> >>>>  
> >>>>  /*
> >>>> @@ -3848,6 +3948,7 @@ static u32 vmx_secondary_exec_control(struct
> >>> vcpu_vmx *vmx)
> >>>>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 	if
> >>>>  (!enable_apicv_reg) 		exec_control &=
> >>>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +	exec_control &=
> >>>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 	return
> > exec_control; }
> >>>> @@ -6103,6 +6204,52 @@ static void update_cr8_intercept(struct
> > kvm_vcpu
> >>> *vcpu, int tpr, int irr)
> >>>>  	vmcs_write32(TPR_THRESHOLD, irr);
> >>>>  }
> >>>> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool
> >>>> set) +{ +	u32 exec_control, sec_exec_control; +	int msr; +	struct
> >>>> vcpu_vmx *vmx = to_vmx(vcpu); + +	/* There is not point to enable
> >>>> virtualize x2apic without enable +	 * apicv*/ +	if
> >>>> (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) +
> >>>> 	return; + +	if (set) { +		exec_control =
> >>>> vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +		/* virtualize x2apic mode
> >>>> relies on tpr shadow */ +		if (!(exec_control &
> >>>> CPU_BASED_TPR_SHADOW)) +			return; +	} + +	sec_exec_control =
> >>>> vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + +	if (set) {
> >>>> +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >>>> +		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +	}
> >>>> else { +		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >>>> +		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) +
> >>>> 	sec_exec_control |= +					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >>>> +	} +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); +
> >>>> +	for (msr = 0x800; msr <= 0x8ff; msr++) +
> >>>> 	vmx_intercept_for_msr_read(msr, false, !set); + +	if (set) { +		/*
> >>>> According SDM, in x2apic mode, the whole id reg is used. +		 * But in
> >>>> KVM, it only use the highest eight bits. Need to +		 * intercept it*/
> >>>> + 	vmx_intercept_for_msr_read(0x802, false, true); +		/* TMCCT */ +
> >>>> 	vmx_intercept_for_msr_read(0x839, false, true); +	} +	/* TPR */
> >>>> +	vmx_intercept_for_msr_write(0x808, false, !set); +} +
> >>>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { 	u32
> >>>>  exit_intr_info; @@ -7366,6 +7513,7 @@ static struct kvm_x86_ops
> >>>>  vmx_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
> >>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
> >>>>  update_cr8_intercept,
> >>>> +	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
> >>>> 
> >>>>  	.set_tss_addr = vmx_set_tss_addr,
> >>>>  	.get_tdp_level = get_ept_level,
> >>>> --
> >>>> 1.7.1
> >>> 
> >>> --
> >>> 			Gleb.
> >> 
> >> 
> >> Best regards,
> >> Yang
> > 
> > --
> > 			Gleb.
> 
> 
> Best regards,
> Yang
> 

--
			Gleb.
Zhang, Yang Z Jan. 14, 2013, 11:25 a.m. UTC | #6
Gleb Natapov wrote on 2013-01-14:
> On Mon, Jan 14, 2013 at 11:10:26AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-14:
>>> On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-01-14:
>>>>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
>>>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>> 
>>>>>> basically to benefit from apicv, we need to enable virtualized
>>>>>> x2apic mode. Currently, we only enable it when guest is really
>>>>>> using x2apic.
>>>>>> 
>>>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest
>>>>>> enabled x2apic:
>>>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>>>>>                    except APIC ID and TMCCT.
>>>>>>     APIC ID and TMCCT: need software's assistance to get right value.
>>>>> Actually since msr bitmap is shared between all vcpus this will break
>>>>> guests that do not enable x2apic.
>>>> I don't think this case will exist. It would break a real OS too.
>>>> 
>>> Which case? One VM uses x2apic and another one does not? Bitmap is shared
>>> between all vcpus of all VMs.
>> Sorry. I misread your comments.
>> 
> I miswrote it. Forgot to include "of all VMs" there.
> 
>> Yes, it is really a problem. Maybe we need to use a per-VM MSR bitmap
>> instead of the global bitmap.
> Are you sure cpus cannot be in different modes during boot and smp
> initialization? Where does the spec say that?
I don't think hardware has this limitation. I mean using a per-vCPU MSR bitmap instead of the global bitmap.
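
(For illustration, a per-vCPU switch would mostly amount to rewriting the
MSR_BITMAP field of each vCPU's VMCS; vmcs_write64() and the MSR_BITMAP
field are existing VMX symbols, while the helper below is only a
hypothetical sketch.)

/* Hypothetical sketch: point the current vCPU's VMCS at its own
 * MSR bitmap page instead of a shared global one. */
static void vmx_switch_msr_bitmap(unsigned long *msr_bitmap)
{
	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
}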

>>>>>> [quoted patch and quoted signatures trimmed; the full patch appears below]


Best regards,
Yang


Gleb Natapov Jan. 14, 2013, 11:38 a.m. UTC | #7
On Mon, Jan 14, 2013 at 11:25:45AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-14:
> > On Mon, Jan 14, 2013 at 11:10:26AM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-01-14:
> >>> On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
> >>>> Gleb Natapov wrote on 2013-01-14:
> >>>>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
> >>>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
> >>>>>> 
> >>>>>> basically to benefit from apicv, we need to enable virtualized
> >>>>>> x2apic mode. Currently, we only enable it when guest is really
> >>>>>> using x2apic.
> >>>>>> 
> >>>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest
> >>>>>> enabled x2apic:
> >>>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
> >>>>>>                    except APIC ID and TMCCT.
> >>>>>>     APIC ID and TMCCT: need software's assistance to get right value.
> >>>>> Actually since msr bitmap is shared between all vcpus this will break
> >>>>> guests that do not enable x2apic.
> >>>> I don't think this case will exist. It would break a real OS too.
> >>>> 
> >>> Which case? One VM uses x2apic and another one does not? Bitmap is shared
> >>> between all vcpus of all VMs.
> >> Sorry. I misread your comments.
> >> 
> > I miswrote it. Forgot to include "of all VMs" there.
> > 
> >> Yes, it is really a problem. Maybe we need to use a per-VM MSR bitmap
> >> instead of the global bitmap.
> > Are you sure cpus cannot be in different modes during boot and smp
> > initialization? Where does the spec say that?
> I don't think hardware has this limitation. I mean using a per-vCPU MSR bitmap instead of the global bitmap.
> 
Even if HW had such a limitation we could not use a per-VM bitmap, since a
malicious guest could abuse it.

A per-vCPU MSR bitmap means that each vcpu will waste one more page of
memory. We already have different global MSR bitmaps for long mode
and legacy mode; maybe make it 4: x2apic X long X legacy.
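
(A minimal sketch of that four-bitmap idea, with purely illustrative names;
the eventual KVM code may of course differ:)

/* Sketch: four shared bitmaps, selected per vCPU by (long mode) x (x2apic). */
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;

static unsigned long *vmx_select_msr_bitmap(bool longmode, bool x2apic)
{
	if (longmode)
		return x2apic ? vmx_msr_bitmap_longmode_x2apic :
				vmx_msr_bitmap_longmode;
	return x2apic ? vmx_msr_bitmap_legacy_x2apic :
			vmx_msr_bitmap_legacy;
}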

> >>>>>> [quoted patch and quoted signatures trimmed; the full patch appears below]

--
			Gleb.
Zhang, Yang Z Jan. 15, 2013, 3:21 a.m. UTC | #8
The attached patch is modified according to your comments to use 4 MSR bitmaps. Please help to review it.

Also, are there any comments on the other parts of this series? If yes, welcome. :)
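
(Background for the MSR numbers in the patch: in x2APIC mode the xAPIC
register at MMIO offset R becomes MSR 0x800 + (R >> 4), per the SDM, which
is why 0x802 (APIC ID), 0x808 (TPR) and 0x839 (TMCCT) are the special
cases. A standalone sketch of the mapping, with a made-up helper name:)

/* Sketch: xAPIC register offset <-> x2APIC MSR index. */
static u32 x2apic_msr(u32 reg)
{
	return 0x800 + (reg >> 4);
}
/* e.g. x2apic_msr(0x20) == 0x802 (ID), x2apic_msr(0x80) == 0x808 (TPR),
 * x2apic_msr(0x390) == 0x839 (TMCCT). */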

Gleb Natapov wrote on 2013-01-14:
> On Mon, Jan 14, 2013 at 11:25:45AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-14:
>>> On Mon, Jan 14, 2013 at 11:10:26AM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-01-14:
>>>>> On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
>>>>>> Gleb Natapov wrote on 2013-01-14:
>>>>>>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
>>>>>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>>>> 
>>>>>>>> basically to benefit from apicv, we need to enable virtualized
>>>>>>>> x2apic mode. Currently, we only enable it when guest is really
>>>>>>>> using x2apic.
>>>>>>>> 
>>>>>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest
>>>>>>>> enabled x2apic:
>>>>>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>>>>>>>                    except APIC ID and TMCCT.
>>>>>>>>     APIC ID and TMCCT: need software's assistance to get right value.
>>>>>>> Actually since msr bitmap is shared between all vcpus this will break
>>>>>>> guests that do not enable x2apic.
>>>>>> I don't think this case will exist. It would break a real OS too.
>>>>>> 
>>>>> Which case? One VM uses x2apic and another one does not? Bitmap is shared
>>>>> between all vcpus of all VMs.
>>>> Sorry. I misread your comments.
>>>> 
>>> I miswrote it. Forgot to include "of all VMs" there.
>>> 
>>>> Yes, it is really a problem. Maybe we need to use a per-VM MSR bitmap
>>>> instead of the global bitmap.
>>> Are you sure cpus cannot be in different modes during boot and smp
>>> initialization? Where does the spec say that?
>> I don't think hardware has this limitation. I mean using a per-vCPU MSR
>> bitmap instead of the global bitmap.
>> 
> Even if HW had such a limitation we could not use a per-VM bitmap, since a
> malicious guest could abuse it.
> 
> A per-vCPU MSR bitmap means that each vcpu will waste one more page of
> memory. We already have different global MSR bitmaps for long mode
> and legacy mode; maybe make it 4: x2apic X long X legacy.
> >>>>>>>> [quoted patch and quoted signatures trimmed; see the full patch below]


Best regards,
Yang

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c431b33..35aa8e6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -697,6 +697,7 @@  struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 44c3f7e..0a54df0 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -139,6 +139,7 @@ 
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_RDTSCP			0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0664c13..2ef5e2b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1323,12 +1323,17 @@  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	if (!kvm_vcpu_is_bsp(apic->vcpu))
 		value &= ~MSR_IA32_APICBASE_BSP;
 
-	vcpu->arch.apic_base = value;
-	if (apic_x2apic_mode(apic)) {
-		u32 id = kvm_apic_id(apic);
-		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
-		kvm_apic_set_ldr(apic, ldr);
+	if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
+		if (value & X2APIC_ENABLE) {
+			u32 id = kvm_apic_id(apic);
+			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+			kvm_apic_set_ldr(apic, ldr);
+			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
+		} else
+			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
 	}
+
+	vcpu->arch.apic_base = value;
 	apic->base_address = apic->vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd..38407e9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,11 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
+static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+	return;	/* SVM has no virtualize-x2apic-mode control; nothing to do */
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4295,7 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0403634..847022e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -767,6 +767,12 @@  static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }
 
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
 static inline bool cpu_has_vmx_apic_register_virt(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -2543,6 +2549,7 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
 		min2 = 0;
 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 			SECONDARY_EXEC_WBINVD_EXITING |
 			SECONDARY_EXEC_ENABLE_VPID |
 			SECONDARY_EXEC_ENABLE_EPT |
@@ -3724,7 +3731,45 @@  static void free_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+#define MSR_TYPE_R	1
+#define MSR_TYPE_W	2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__clear_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__clear_bit(msr, msr_bitmap + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__clear_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+	}
+}
+
+static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
 {
 	int f = sizeof(unsigned long);
 
@@ -3737,20 +3782,75 @@  static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 	 */
 	if (msr <= 0x1fff) {
-		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
-		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__set_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__set_bit(msr, msr_bitmap + 0x800 / f);
+
 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 		msr &= 0x1fff;
-		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
-		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__set_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__set_bit(msr, msr_bitmap + 0xc00 / f);
+
 	}
 }
 
+
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
 	if (!longmode_only)
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_intercept_for_msr_read(u32 msr, bool longmode_only,
+					bool set)
+{
+	if (!longmode_only) {
+		if (set)
+			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_R);
+		else
+			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_R);
+
+	}
+	if (set)
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_R);
+	else
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_R);
+}
+
+static void vmx_intercept_for_msr_write(u32 msr, bool longmode_only,
+					bool set)
+{
+	if (!longmode_only) {
+		if (set)
+			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_W);
+		else
+			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_W);
+
+	}
+	if (set)
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_W);
+	else
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_W);
 }
 
 /*
@@ -3848,6 +3948,7 @@  static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 	if (!enable_apicv_reg)
 		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 	return exec_control;
 }
 
@@ -6103,6 +6204,52 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
+static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+	u32 exec_control, sec_exec_control;
+	int msr;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* There is no point in enabling virtualize x2apic mode without
+	 * apicv (APIC-register virtualization). */
+	if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
+		return;
+
+	if (set) {
+		exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+		/* virtualize x2apic mode relies on tpr shadow */
+		if (!(exec_control & CPU_BASED_TPR_SHADOW))
+			return;
+	}
+
+	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+	if (set) {
+		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+	} else {
+		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+		if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+			sec_exec_control |=
+					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	}
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+	for (msr = 0x800; msr <= 0x8ff; msr++)
+		vmx_intercept_for_msr_read(msr, false, !set);
+
+	if (set) {
+		/* According to the SDM, in x2apic mode the whole ID register
+		 * is used, but KVM only uses the highest eight bits, so the
+		 * read must stay intercepted. */
+		vmx_intercept_for_msr_read(0x802, false, true);
+		/* TMCCT */
+		vmx_intercept_for_msr_read(0x839, false, true);
+	}
+	/* TPR */
+	vmx_intercept_for_msr_write(0x808, false, !set);
+}
+
 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -7366,6 +7513,7 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
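
(As a reading aid for the bitmap arithmetic in __vmx_disable_intercept_for_msr()
above: a standalone sketch, not part of the patch. The 0x000/0x400/0x800/0xc00
offsets follow the SDM's MSR-bitmap layout.)

/* Sketch: which 1K region of the 4K MSR bitmap covers a given access. */
static int msr_bitmap_base(u32 msr, bool write)
{
	if (msr <= 0x1fff)
		return write ? 0x800 : 0x000;	/* write-low / read-low */
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return write ? 0xc00 : 0x400;	/* write-high / read-high */
	return -1;	/* MSR not controllable through the bitmap */
}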