diff mbox

[v9,2/3] x86, apicv: add virtual x2apic support

Message ID 1357802768-15816-3-git-send-email-yang.z.zhang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhang, Yang Z Jan. 10, 2013, 7:26 a.m. UTC
From: Yang Zhang <yang.z.zhang@Intel.com>

basically to benefit from apicv, we need to enable virtualized x2apic mode.
Currently, we only enable it when guest is really using x2apic.

Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic:
    0x800 - 0x8ff: no read intercept for apicv register virtualization,
    		   except APIC ID and TMCCT.
    APIC ID and TMCCT: need software's assistance to get right value.
    TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
---
 arch/x86/include/asm/kvm_host.h |    2 +
 arch/x86/include/asm/vmx.h      |    1 +
 arch/x86/kvm/lapic.c            |    5 +-
 arch/x86/kvm/svm.c              |    6 +
 arch/x86/kvm/vmx.c              |  194 +++++++++++++++++++++++++++++++++++++--
 5 files changed, 200 insertions(+), 8 deletions(-)

Comments

Gleb Natapov Jan. 10, 2013, 7:55 a.m. UTC | #1
On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
> From: Yang Zhang <yang.z.zhang@Intel.com>
> 
> basically to benefit from apicv, we need to enable virtualized x2apic mode.
> Currently, we only enable it when guest is really using x2apic.
> 
> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic:
>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>     		   except APIC ID and TMCCT.
>     APIC ID and TMCCT: need software's assistance to get right value.
>     TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
> 
> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    2 +
>  arch/x86/include/asm/vmx.h      |    1 +
>  arch/x86/kvm/lapic.c            |    5 +-
>  arch/x86/kvm/svm.c              |    6 +
>  arch/x86/kvm/vmx.c              |  194 +++++++++++++++++++++++++++++++++++++--
>  5 files changed, 200 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..572a562 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -697,6 +697,8 @@ struct kvm_x86_ops {
>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> +	void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
> +	void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
Make one callback with enable/disable parameter. And do not forget SVM.


>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>  	int (*get_tdp_level)(void);
>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 44c3f7e..0a54df0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -139,6 +139,7 @@
>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
>  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
>  #define SECONDARY_EXEC_RDTSCP			0x00000008
> +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
>  #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
>  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 0664c13..ec38906 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
>  		u32 id = kvm_apic_id(apic);
>  		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>  		kvm_apic_set_ldr(apic, ldr);
> -	}
> +		kvm_x86_ops->enable_virtual_x2apic_mode(vcpu);
> +	} else
> +		kvm_x86_ops->disable_virtual_x2apic_mode(vcpu);
> +
You just broke SVM.

>  	apic->base_address = apic->vcpu->arch.apic_base &
>  			     MSR_IA32_APICBASE_BASE;
>  
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d29d3cd..0b82cb1 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> +{
> +	return;
> +}
> +
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
>  
>  	.set_tss_addr = svm_set_tss_addr,
>  	.get_tdp_level = get_npt_level,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 688f43f..b203ce7 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -433,6 +433,8 @@ struct vcpu_vmx {
>  
>  	bool rdtscp_enabled;
>  
> +	bool virtual_x2apic_enabled;
> +
>  	/* Support for a guest hypervisor (nested VMX) */
>  	struct nested_vmx nested;
>  };
> @@ -767,12 +769,23 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>  }
>  
> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
> +{
> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +}
> +
>  static inline bool cpu_has_vmx_apic_register_virt(void)
>  {
>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
>  }
>  
> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
> +{
> +	return false;
> +}
> +
>  static inline bool cpu_has_vmx_flexpriority(void)
>  {
>  	return cpu_has_vmx_tpr_shadow() &&
> @@ -2544,6 +2557,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
>  		min2 = 0;
>  		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>  			SECONDARY_EXEC_WBINVD_EXITING |
>  			SECONDARY_EXEC_ENABLE_VPID |
>  			SECONDARY_EXEC_ENABLE_EPT |
> @@ -3731,7 +3745,45 @@ static void free_vpid(struct vcpu_vmx *vmx)
>  	spin_unlock(&vmx_vpid_lock);
>  }
>  
> -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> +#define MSR_TYPE_R	1
> +#define MSR_TYPE_W	2
> +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
> +						u32 msr, int type)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (!cpu_has_vmx_msr_bitmap())
> +		return;
> +
> +	/*
> +	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
> +	 * have the write-low and read-high bitmap offsets the wrong way round.
> +	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
> +	 */
> +	if (msr <= 0x1fff) {
> +		if (type & MSR_TYPE_R)
> +			/* read-low */
> +			__clear_bit(msr, msr_bitmap + 0x000 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-low */
> +			__clear_bit(msr, msr_bitmap + 0x800 / f);
> +
> +	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> +		msr &= 0x1fff;
> +		if (type & MSR_TYPE_R)
> +			/* read-high */
> +			__clear_bit(msr, msr_bitmap + 0x400 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-high */
> +			__clear_bit(msr, msr_bitmap + 0xc00 / f);
> +
> +	}
> +}
> +
> +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
> +						u32 msr, int type)
>  {
>  	int f = sizeof(unsigned long);
>  
> @@ -3744,20 +3796,75 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>  	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
>  	 */
>  	if (msr <= 0x1fff) {
> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
> +		if (type & MSR_TYPE_R)
> +			/* read-low */
> +			__set_bit(msr, msr_bitmap + 0x000 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-low */
> +			__set_bit(msr, msr_bitmap + 0x800 / f);
> +
>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>  		msr &= 0x1fff;
> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
> +		if (type & MSR_TYPE_R)
> +			/* read-high */
> +			__set_bit(msr, msr_bitmap + 0x400 / f);
> +
> +		if (type & MSR_TYPE_W)
> +			/* write-high */
> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
> +
>  	}
>  }
>  
> +
>  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
>  {
>  	if (!longmode_only)
> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> +}
> +
> +static void vmx_intercept_for_msr_read(u32 msr, bool longmode_only,
> +					bool set)
> +{
> +	if (!longmode_only) {
> +		if (set)
> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_R);
> +		else
> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_R);
> +
> +	}
> +	if (set)
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_R);
> +	else
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_R);
> +}
> +
> +static void vmx_intercept_for_msr_write(u32 msr, bool longmode_only,
> +					bool set)
> +{
> +	if (!longmode_only) {
> +		if (set)
> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_W);
> +		else
> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> +					msr, MSR_TYPE_W);
> +
> +	}
> +	if (set)
> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_W);
> +	else
> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> +				msr, MSR_TYPE_W);
>  }
>  
>  /*
> @@ -3855,6 +3962,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>  	if (!enable_apicv_reg_vid)
>  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>  	return exec_control;
>  }
>  
> @@ -6110,6 +6218,76 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  	vmcs_write32(TPR_THRESHOLD, irr);
>  }
>  
> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> +{
> +	u32 exec_control;
> +	int msr;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
> +		return;
> +
> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +	/* virtualize x2apic mode relies on tpr shadow */
> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
> +		return;
> +
> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> +	vmx->virtual_x2apic_enabled = true;
Why track it?

> +
> +	if (!cpu_has_vmx_virtual_intr_delivery())
> +		return;
> +
You need to test whether vid is enabled, not whether it can be enabled.
And you need to test it at the beginning of the function. If vid is
disabled we do not want to set SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE.
Also force disable vid if !cpu_has_vmx_virtualize_x2apic_mode().

> +	for (msr = 0x800; msr <= 0x8ff; msr++)
> +		vmx_intercept_for_msr_read(msr, false, false);
> +
> +	/* APIC ID */
> +	vmx_intercept_for_msr_read(0x802, false, true);
> +	/* TMCCT */
> +	vmx_intercept_for_msr_read(0x839, false, true);
> +	/* TPR */
> +	vmx_intercept_for_msr_write(0x808, false, false);
> +	/* EOI */
> +	vmx_intercept_for_msr_write(0x80b, false, false);
> +	/* SELF-IPI */
> +	vmx_intercept_for_msr_write(0x83f, false, false);
> +
> +}
> +
> +static void vmx_disable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> +{
> +	u32 second_exec_control;
> +	int msr;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	/* If doesn't enable virtual x2apic before, do nothing*/
> +	if (!vmx->virtual_x2apic_enabled)
> +		return;
> +
> +	second_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	/* disalbe virtual x2apic*/
> +	second_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +	second_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, second_exec_control);
> +	vmx->virtual_x2apic_enabled = false;
> +
> +	if (!cpu_has_vmx_virtual_intr_delivery())
> +		return;
> +
> +	for (msr = 0x800; msr <= 0x8ff; msr++)
> +		vmx_intercept_for_msr_read(msr, false, true);
> +
> +	/* TPR */
> +	vmx_intercept_for_msr_write(0x808, false, true);
> +	/* EOI */
> +	vmx_intercept_for_msr_write(0x80b, false, true);
> +	/* SELF-IPI */
> +	vmx_intercept_for_msr_write(0x83f, false, true);
> +}
> +
>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
>  {
>  	u32 exit_intr_info;
> @@ -7373,6 +7551,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.enable_virtual_x2apic_mode = vmx_enable_virtual_x2apic_mode,
> +	.disable_virtual_x2apic_mode = vmx_disable_virtual_x2apic_mode,
>  
>  	.set_tss_addr = vmx_set_tss_addr,
>  	.get_tdp_level = get_ept_level,
> -- 
> 1.7.1

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 10, 2013, 8:25 a.m. UTC | #2
On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> +{
> +	u32 exec_control;
> +	int msr;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
> +		return;
> +
> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +	/* virtualize x2apic mode relies on tpr shadow */
> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
> +		return;
> +
> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> +	vmx->virtual_x2apic_enabled = true;
> +
> +	if (!cpu_has_vmx_virtual_intr_delivery())
> +		return;
> +
> +	for (msr = 0x800; msr <= 0x8ff; msr++)
> +		vmx_intercept_for_msr_read(msr, false, false);
> +
> +	/* APIC ID */
> +	vmx_intercept_for_msr_read(0x802, false, true);
Why are you enabling apic id read intercept?

> +	/* TMCCT */
> +	vmx_intercept_for_msr_read(0x839, false, true);
> +	/* TPR */
> +	vmx_intercept_for_msr_write(0x808, false, false);
> +	/* EOI */
> +	vmx_intercept_for_msr_write(0x80b, false, false);
> +	/* SELF-IPI */
> +	vmx_intercept_for_msr_write(0x83f, false, false);
> +
> +}
> +

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 10, 2013, 8:31 a.m. UTC | #3
Gleb Natapov wrote on 2013-01-10:
> On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
>> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>> +{
>> +	u32 exec_control;
>> +	int msr;
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
>> +		return;
>> +
>> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>> +	/* virtualize x2apic mode relies on tpr shadow */
>> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
>> +		return;
>> +
>> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
>> +	vmx->virtual_x2apic_enabled = true;
>> +
>> +	if (!cpu_has_vmx_virtual_intr_delivery())
>> +		return;
>> +
>> +	for (msr = 0x800; msr <= 0x8ff; msr++)
>> +		vmx_intercept_for_msr_read(msr, false, false);
>> +
>> +	/* APIC ID */
>> +	vmx_intercept_for_msr_read(0x802, false, true);
> Why are you enabling apic id read intercept?
Current code to read apic id in x2apic mode has some hacks:

if (apic_x2apic_mode(apic))
       val = kvm_apic_id(apic);
else
       val = kvm_apic_id(apic) << 24;

static inline int kvm_apic_id(struct kvm_lapic *apic)
{
        return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}

According SPEC, in x2apic mode, the whole id reg is used, but in KVM, it only use the highest eight bits.

>> +	/* TMCCT */
>> +	vmx_intercept_for_msr_read(0x839, false, true);
>> +	/* TPR */
>> +	vmx_intercept_for_msr_write(0x808, false, false);
>> +	/* EOI */
>> +	vmx_intercept_for_msr_write(0x80b, false, false);
>> +	/* SELF-IPI */
>> +	vmx_intercept_for_msr_write(0x83f, false, false);
>> +
>> +}
>> +
> 
> --
> 			Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 10, 2013, 8:32 a.m. UTC | #4
Gleb Natapov wrote on 2013-01-10:
> On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
>> From: Yang Zhang <yang.z.zhang@Intel.com>
>> 
>> basically to benefit from apicv, we need to enable virtualized x2apic mode.
>> Currently, we only enable it when guest is really using x2apic.
>> 
>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
> x2apic:
>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>     		   except APIC ID and TMCCT.
>>     APIC ID and TMCCT: need software's assistance to get right value.
>>     TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    2 + arch/x86/include/asm/vmx.h   
>>    |    1 + arch/x86/kvm/lapic.c            |    5 +-
>>  arch/x86/kvm/svm.c              |    6 + arch/x86/kvm/vmx.c           
>>    |  194 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 200
>>  insertions(+), 8 deletions(-)
>> diff --git a/arch/x86/include/asm/kvm_host.h
>> b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
>> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -697,6 +697,8 @@ struct kvm_x86_ops {
>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>> +	void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
>> +	void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
> Make one callback with enable/disable parameter. And do not forget SVM.
> 
>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>  	int (*get_tdp_level)(void);
>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 44c3f7e..0a54df0 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -139,6 +139,7 @@
>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
>>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
>>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
>>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 0664c13..ec38906 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu,
> u64 value)
>>  		u32 id = kvm_apic_id(apic);
>>  		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>  		kvm_apic_set_ldr(apic, ldr);
>> -	}
>> +		kvm_x86_ops->enable_virtual_x2apic_mode(vcpu);
>> +	} else
>> +		kvm_x86_ops->disable_virtual_x2apic_mode(vcpu);
>> +
> You just broke SVM.
>>  	apic->base_address = apic->vcpu->arch.apic_base &
>>  			     MSR_IA32_APICBASE_BASE;
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index d29d3cd..0b82cb1 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu
> *vcpu, int tpr, int irr)
>>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>>  }
>> +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>> +{
>> +	return;
>> +}
>> +
>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
>>  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
>>  svm_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>  update_cr8_intercept,
>> +	.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
>> 
>>  	.set_tss_addr = svm_set_tss_addr,
>>  	.get_tdp_level = get_npt_level,
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 688f43f..b203ce7 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -433,6 +433,8 @@ struct vcpu_vmx {
>> 
>>  	bool rdtscp_enabled;
>> +	bool virtual_x2apic_enabled;
>> +
>>  	/* Support for a guest hypervisor (nested VMX) */
>>  	struct nested_vmx nested;
>>  };
>> @@ -767,12 +769,23 @@ static inline bool
> cpu_has_vmx_virtualize_apic_accesses(void)
>>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>  }
>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
>> +{
>> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
>> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>> +}
>> +
>>  static inline bool cpu_has_vmx_apic_register_virt(void)
>>  {
>>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
>>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
>>  }
>> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
>> +{
>> +	return false;
>> +}
>> +
>>  static inline bool cpu_has_vmx_flexpriority(void)
>>  {
>>  	return cpu_has_vmx_tpr_shadow() &&
>> @@ -2544,6 +2557,7 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>  	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
>>  { 		min2 = 0; 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>>  +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>>  			SECONDARY_EXEC_WBINVD_EXITING | 			SECONDARY_EXEC_ENABLE_VPID |
>>  			SECONDARY_EXEC_ENABLE_EPT | @@ -3731,7 +3745,45 @@ static void
>>  free_vpid(struct vcpu_vmx *vmx) 	spin_unlock(&vmx_vpid_lock); }
>> -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
>> u32 msr) +#define MSR_TYPE_R	1 +#define MSR_TYPE_W	2 +static void
>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +						u32
>> msr, int type) +{ +	int f = sizeof(unsigned long); + +	if
>> (!cpu_has_vmx_msr_bitmap()) +		return; + +	/* +	 * See Intel PRM Vol.
>> 3, 20.6.9 (MSR-Bitmap Address). Early manuals +	 * have the write-low
>> and read-high bitmap offsets the wrong way round. +	 * We can control
>> MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. +	 */ +	if (msr
>> <= 0x1fff) { +		if (type & MSR_TYPE_R) +			/* read-low */
>> +			__clear_bit(msr, msr_bitmap + 0x000 / f); + +		if (type &
>> MSR_TYPE_W) +			/* write-low */ +			__clear_bit(msr, msr_bitmap + 0x800
>> / f); + +	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>> +		msr &= 0x1fff; +		if (type & MSR_TYPE_R) +			/* read-high */
>> +			__clear_bit(msr, msr_bitmap + 0x400 / f); + +		if (type &
>> MSR_TYPE_W) +			/* write-high */ +			__clear_bit(msr, msr_bitmap +
>> 0xc00 / f); + +	} +} + +static void
>> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +						u32
>> msr, int type)
>>  {
>>  	int f = sizeof(unsigned long);
>> @@ -3744,20 +3796,75 @@ static void
> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>>  	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
>>  	 */
>>  	if (msr <= 0x1fff) {
>> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
>> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
>> +		if (type & MSR_TYPE_R)
>> +			/* read-low */
>> +			__set_bit(msr, msr_bitmap + 0x000 / f);
>> +
>> +		if (type & MSR_TYPE_W)
>> +			/* write-low */
>> +			__set_bit(msr, msr_bitmap + 0x800 / f);
>> +
>>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>>  		msr &= 0x1fff;
>> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
>> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
>> +		if (type & MSR_TYPE_R)
>> +			/* read-high */
>> +			__set_bit(msr, msr_bitmap + 0x400 / f);
>> +
>> +		if (type & MSR_TYPE_W)
>> +			/* write-high */
>> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
>> +
>>  	}
>>  }
>> +
>>  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
>>  {
>>  	if (!longmode_only)
>> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
>> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
>> +						msr, MSR_TYPE_R | MSR_TYPE_W);
>> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>> +						msr, MSR_TYPE_R | MSR_TYPE_W);
>> +}
>> +
>> +static void vmx_intercept_for_msr_read(u32 msr, bool longmode_only,
>> +					bool set)
>> +{
>> +	if (!longmode_only) {
>> +		if (set)
>> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
>> +					msr, MSR_TYPE_R);
>> +		else
>> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
>> +					msr, MSR_TYPE_R);
>> +
>> +	}
>> +	if (set)
>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
>> +				msr, MSR_TYPE_R);
>> +	else
>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>> +				msr, MSR_TYPE_R);
>> +}
>> +
>> +static void vmx_intercept_for_msr_write(u32 msr, bool longmode_only,
>> +					bool set)
>> +{
>> +	if (!longmode_only) {
>> +		if (set)
>> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
>> +					msr, MSR_TYPE_W);
>> +		else
>> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
>> +					msr, MSR_TYPE_W);
>> +
>> +	}
>> +	if (set)
>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
>> +				msr, MSR_TYPE_W);
>> +	else
>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>> +				msr, MSR_TYPE_W);
>>  }
>>  
>>  /*
>> @@ -3855,6 +3962,7 @@ static u32 vmx_secondary_exec_control(struct
> vcpu_vmx *vmx)
>>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 	if
>>  (!enable_apicv_reg_vid) 		exec_control &=
>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +	exec_control &=
>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 	return exec_control; }
>> @@ -6110,6 +6218,76 @@ static void update_cr8_intercept(struct kvm_vcpu
> *vcpu, int tpr, int irr)
>>  	vmcs_write32(TPR_THRESHOLD, irr);
>>  }
>> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>> +{
>> +	u32 exec_control;
>> +	int msr;
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
>> +		return;
>> +
>> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>> +	/* virtualize x2apic mode relies on tpr shadow */
>> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
>> +		return;
>> +
>> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
>> +	vmx->virtual_x2apic_enabled = true;
> Why track it?
With this flag, we don't need to read vmcs to check whether we enabled virtua x2apic before.

>> +
>> +	if (!cpu_has_vmx_virtual_intr_delivery())
>> +		return;
>> +
> You need to test whether vid is enabled, not whether it can be enabled.
> And you need to test it at the beginning of the function. If vid is
> disabled we do not want to set SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE.
> Also force disable vid if !cpu_has_vmx_virtualize_x2apic_mode().
vid is not depend on virtualize x2apic.
The right logic should be:
When register virtualization enabled, read all apic msr(except apic id reg and tmcct which have the hook) not cause vmexit and only write TPR not cause vmexit.
When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
It's better to put the patch of envirtualize x2apic mode as first patch.

>> +	for (msr = 0x800; msr <= 0x8ff; msr++)
>> +		vmx_intercept_for_msr_read(msr, false, false);
>> +
>> +	/* APIC ID */
>> +	vmx_intercept_for_msr_read(0x802, false, true);
>> +	/* TMCCT */
>> +	vmx_intercept_for_msr_read(0x839, false, true);
>> +	/* TPR */
>> +	vmx_intercept_for_msr_write(0x808, false, false);
>> +	/* EOI */
>> +	vmx_intercept_for_msr_write(0x80b, false, false);
>> +	/* SELF-IPI */
>> +	vmx_intercept_for_msr_write(0x83f, false, false);
>> +
>> +}
>> +
>> +static void vmx_disable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>> +{
>> +	u32 second_exec_control;
>> +	int msr;
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	/* If doesn't enable virtual x2apic before, do nothing*/
>> +	if (!vmx->virtual_x2apic_enabled)
>> +		return;
>> +
>> +	second_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>> +	/* disalbe virtual x2apic*/
>> +	second_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>> +	second_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, second_exec_control);
>> +	vmx->virtual_x2apic_enabled = false;
>> +
>> +	if (!cpu_has_vmx_virtual_intr_delivery())
>> +		return;
>> +
>> +	for (msr = 0x800; msr <= 0x8ff; msr++)
>> +		vmx_intercept_for_msr_read(msr, false, true);
>> +
>> +	/* TPR */
>> +	vmx_intercept_for_msr_write(0x808, false, true);
>> +	/* EOI */
>> +	vmx_intercept_for_msr_write(0x80b, false, true);
>> +	/* SELF-IPI */
>> +	vmx_intercept_for_msr_write(0x83f, false, true);
>> +}
>> +
>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { 	u32
>>  exit_intr_info; @@ -7373,6 +7551,8 @@ static struct kvm_x86_ops
>>  vmx_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>  update_cr8_intercept,
>> +	.enable_virtual_x2apic_mode = vmx_enable_virtual_x2apic_mode,
>> +	.disable_virtual_x2apic_mode = vmx_disable_virtual_x2apic_mode,
>> 
>>  	.set_tss_addr = vmx_set_tss_addr,
>>  	.get_tdp_level = get_ept_level,
>> --
>> 1.7.1
> 
> --
> 			Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 10, 2013, 8:52 a.m. UTC | #5
On Thu, Jan 10, 2013 at 08:32:06AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-10:
> > On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
> >> From: Yang Zhang <yang.z.zhang@Intel.com>
> >> 
> >> basically to benefit from apicv, we need to enable virtualized x2apic mode.
> >> Currently, we only enable it when guest is really using x2apic.
> >> 
> >> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
> > x2apic:
> >>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
> >>     		   except APIC ID and TMCCT.
> >>     APIC ID and TMCCT: need software's assistance to get right value.
> >>     TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
> >> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> >> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >> ---
> >>  arch/x86/include/asm/kvm_host.h |    2 + arch/x86/include/asm/vmx.h   
> >>    |    1 + arch/x86/kvm/lapic.c            |    5 +-
> >>  arch/x86/kvm/svm.c              |    6 + arch/x86/kvm/vmx.c           
> >>    |  194 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 200
> >>  insertions(+), 8 deletions(-)
> >> diff --git a/arch/x86/include/asm/kvm_host.h
> >> b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
> >> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -697,6 +697,8 @@ struct kvm_x86_ops {
> >>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
> >>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
> >>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> >> +	void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
> >> +	void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
> > Make one callback with enable/disable parameter. And do not forget SVM.
> > 
> >>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> >>  	int (*get_tdp_level)(void);
> >>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> >> index 44c3f7e..0a54df0 100644
> >> --- a/arch/x86/include/asm/vmx.h
> >> +++ b/arch/x86/include/asm/vmx.h
> >> @@ -139,6 +139,7 @@
> >>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
> >>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
> >>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
> >>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
> >>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
> >>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
> >>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >> index 0664c13..ec38906 100644
> >> --- a/arch/x86/kvm/lapic.c
> >> +++ b/arch/x86/kvm/lapic.c
> >> @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu,
> > u64 value)
> >>  		u32 id = kvm_apic_id(apic);
> >>  		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> >>  		kvm_apic_set_ldr(apic, ldr);
> >> -	}
> >> +		kvm_x86_ops->enable_virtual_x2apic_mode(vcpu);
> >> +	} else
> >> +		kvm_x86_ops->disable_virtual_x2apic_mode(vcpu);
> >> +
> > You just broke SVM.
> >>  	apic->base_address = apic->vcpu->arch.apic_base &
> >>  			     MSR_IA32_APICBASE_BASE;
> >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> >> index d29d3cd..0b82cb1 100644
> >> --- a/arch/x86/kvm/svm.c
> >> +++ b/arch/x86/kvm/svm.c
> >> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu
> > *vcpu, int tpr, int irr)
> >>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
> >>  }
> >> +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> >> +{
> >> +	return;
> >> +}
> >> +
> >>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
> >>  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
> >>  svm_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
> >>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
> >>  update_cr8_intercept,
> >> +	.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
> >> 
> >>  	.set_tss_addr = svm_set_tss_addr,
> >>  	.get_tdp_level = get_npt_level,
> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> index 688f43f..b203ce7 100644
> >> --- a/arch/x86/kvm/vmx.c
> >> +++ b/arch/x86/kvm/vmx.c
> >> @@ -433,6 +433,8 @@ struct vcpu_vmx {
> >> 
> >>  	bool rdtscp_enabled;
> >> +	bool virtual_x2apic_enabled;
> >> +
> >>  	/* Support for a guest hypervisor (nested VMX) */
> >>  	struct nested_vmx nested;
> >>  };
> >> @@ -767,12 +769,23 @@ static inline bool
> > cpu_has_vmx_virtualize_apic_accesses(void)
> >>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >>  }
> >> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
> >> +{
> >> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> >> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >> +}
> >> +
> >>  static inline bool cpu_has_vmx_apic_register_virt(void)
> >>  {
> >>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
> >>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
> >>  }
> >> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
> >> +{
> >> +	return false;
> >> +}
> >> +
> >>  static inline bool cpu_has_vmx_flexpriority(void)
> >>  {
> >>  	return cpu_has_vmx_tpr_shadow() &&
> >> @@ -2544,6 +2557,7 @@ static __init int setup_vmcs_config(struct
> > vmcs_config *vmcs_conf)
> >>  	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
> >>  { 		min2 = 0; 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> >>  +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
> >>  			SECONDARY_EXEC_WBINVD_EXITING | 			SECONDARY_EXEC_ENABLE_VPID |
> >>  			SECONDARY_EXEC_ENABLE_EPT | @@ -3731,7 +3745,45 @@ static void
> >>  free_vpid(struct vcpu_vmx *vmx) 	spin_unlock(&vmx_vpid_lock); }
> >> -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
> >> u32 msr) +#define MSR_TYPE_R	1 +#define MSR_TYPE_W	2 +static void
> >> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +						u32
> >> msr, int type) +{ +	int f = sizeof(unsigned long); + +	if
> >> (!cpu_has_vmx_msr_bitmap()) +		return; + +	/* +	 * See Intel PRM Vol.
> >> 3, 20.6.9 (MSR-Bitmap Address). Early manuals +	 * have the write-low
> >> and read-high bitmap offsets the wrong way round. +	 * We can control
> >> MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. +	 */ +	if (msr
> >> <= 0x1fff) { +		if (type & MSR_TYPE_R) +			/* read-low */
> >> +			__clear_bit(msr, msr_bitmap + 0x000 / f); + +		if (type &
> >> MSR_TYPE_W) +			/* write-low */ +			__clear_bit(msr, msr_bitmap + 0x800
> >> / f); + +	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> >> +		msr &= 0x1fff; +		if (type & MSR_TYPE_R) +			/* read-high */
> >> +			__clear_bit(msr, msr_bitmap + 0x400 / f); + +		if (type &
> >> MSR_TYPE_W) +			/* write-high */ +			__clear_bit(msr, msr_bitmap +
> >> 0xc00 / f); + +	} +} + +static void
> >> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +						u32
> >> msr, int type)
> >>  {
> >>  	int f = sizeof(unsigned long);
> >> @@ -3744,20 +3796,75 @@ static void
> > __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> >>  	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
> >>  	 */
> >>  	if (msr <= 0x1fff) {
> >> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
> >> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
> >> +		if (type & MSR_TYPE_R)
> >> +			/* read-low */
> >> +			__set_bit(msr, msr_bitmap + 0x000 / f);
> >> +
> >> +		if (type & MSR_TYPE_W)
> >> +			/* write-low */
> >> +			__set_bit(msr, msr_bitmap + 0x800 / f);
> >> +
> >>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> >>  		msr &= 0x1fff;
> >> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
> >> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
> >> +		if (type & MSR_TYPE_R)
> >> +			/* read-high */
> >> +			__set_bit(msr, msr_bitmap + 0x400 / f);
> >> +
> >> +		if (type & MSR_TYPE_W)
> >> +			/* write-high */
> >> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
> >> +
> >>  	}
> >>  }
> >> +
> >>  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
> >>  {
> >>  	if (!longmode_only)
> >> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
> >> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
> >> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> >> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> >> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> >> +						msr, MSR_TYPE_R | MSR_TYPE_W);
> >> +}
> >> +
> >> +static void vmx_intercept_for_msr_read(u32 msr, bool longmode_only,
> >> +					bool set)
> >> +{
> >> +	if (!longmode_only) {
> >> +		if (set)
> >> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> >> +					msr, MSR_TYPE_R);
> >> +		else
> >> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> >> +					msr, MSR_TYPE_R);
> >> +
> >> +	}
> >> +	if (set)
> >> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
> >> +				msr, MSR_TYPE_R);
> >> +	else
> >> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> >> +				msr, MSR_TYPE_R);
> >> +}
> >> +
> >> +static void vmx_intercept_for_msr_write(u32 msr, bool longmode_only,
> >> +					bool set)
> >> +{
> >> +	if (!longmode_only) {
> >> +		if (set)
> >> +			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> >> +					msr, MSR_TYPE_W);
> >> +		else
> >> +			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> >> +					msr, MSR_TYPE_W);
> >> +
> >> +	}
> >> +	if (set)
> >> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
> >> +				msr, MSR_TYPE_W);
> >> +	else
> >> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> >> +				msr, MSR_TYPE_W);
> >>  }
> >>  
> >>  /*
> >> @@ -3855,6 +3962,7 @@ static u32 vmx_secondary_exec_control(struct
> > vcpu_vmx *vmx)
> >>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 	if
> >>  (!enable_apicv_reg_vid) 		exec_control &=
> >>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +	exec_control &=
> >>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 	return exec_control; }
> >> @@ -6110,6 +6218,76 @@ static void update_cr8_intercept(struct kvm_vcpu
> > *vcpu, int tpr, int irr)
> >>  	vmcs_write32(TPR_THRESHOLD, irr);
> >>  }
> >> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> >> +{
> >> +	u32 exec_control;
> >> +	int msr;
> >> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +
> >> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
> >> +		return;
> >> +
> >> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> >> +	/* virtualize x2apic mode relies on tpr shadow */
> >> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
> >> +		return;
> >> +
> >> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> >> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> >> +	vmx->virtual_x2apic_enabled = true;
> > Why track it?
> With this flag, we don't need to read vmcs to check whether we enabled virtua x2apic before.
> 
Why do you care? Just disabled it regardless.

> >> +
> >> +	if (!cpu_has_vmx_virtual_intr_delivery())
> >> +		return;
> >> +
> > You need to test whether vid is enabled, not whether it can be enabled.
> > And you need to test it at the beginning of the function. If vid is
> > disabled we do not want to set SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE.
> > Also force disable vid if !cpu_has_vmx_virtualize_x2apic_mode().
> vid is not depend on virtualize x2apic.
It is in our implementation. If vid is enabled and virtualize x2apic is
not our code will not work if guest uses x2apic.

> The right logic should be:
> When register virtualization enabled, read all apic msr(except apic id reg and tmcct which have the hook) not cause vmexit and only write TPR not cause vmexit.
> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
> It's better to put the patch of envirtualize x2apic mode as first patch.
> 
There is no point whatsoever to enable virtualize x2apic without
allowing at least one non intercepted access to x2apic MSR range and
this is what your patch is doing when run on cpu without vid capability.

> >> +	for (msr = 0x800; msr <= 0x8ff; msr++)
> >> +		vmx_intercept_for_msr_read(msr, false, false);
> >> +
> >> +	/* APIC ID */
> >> +	vmx_intercept_for_msr_read(0x802, false, true);
> >> +	/* TMCCT */
> >> +	vmx_intercept_for_msr_read(0x839, false, true);
> >> +	/* TPR */
> >> +	vmx_intercept_for_msr_write(0x808, false, false);
> >> +	/* EOI */
> >> +	vmx_intercept_for_msr_write(0x80b, false, false);
> >> +	/* SELF-IPI */
> >> +	vmx_intercept_for_msr_write(0x83f, false, false);
> >> +
> >> +}
> >> +
> >> +static void vmx_disable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> >> +{
> >> +	u32 second_exec_control;
> >> +	int msr;
> >> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +
> >> +	/* If doesn't enable virtual x2apic before, do nothing*/
> >> +	if (!vmx->virtual_x2apic_enabled)
> >> +		return;
> >> +
> >> +	second_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> >> +	/* disalbe virtual x2apic*/
> >> +	second_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >> +	second_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, second_exec_control);
> >> +	vmx->virtual_x2apic_enabled = false;
> >> +
> >> +	if (!cpu_has_vmx_virtual_intr_delivery())
> >> +		return;
> >> +
> >> +	for (msr = 0x800; msr <= 0x8ff; msr++)
> >> +		vmx_intercept_for_msr_read(msr, false, true);
> >> +
> >> +	/* TPR */
> >> +	vmx_intercept_for_msr_write(0x808, false, true);
> >> +	/* EOI */
> >> +	vmx_intercept_for_msr_write(0x80b, false, true);
> >> +	/* SELF-IPI */
> >> +	vmx_intercept_for_msr_write(0x83f, false, true);
> >> +}
> >> +
> >>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { 	u32
> >>  exit_intr_info; @@ -7373,6 +7551,8 @@ static struct kvm_x86_ops
> >>  vmx_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
> >>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
> >>  update_cr8_intercept,
> >> +	.enable_virtual_x2apic_mode = vmx_enable_virtual_x2apic_mode,
> >> +	.disable_virtual_x2apic_mode = vmx_disable_virtual_x2apic_mode,
> >> 
> >>  	.set_tss_addr = vmx_set_tss_addr,
> >>  	.get_tdp_level = get_ept_level,
> >> --
> >> 1.7.1
> > 
> > --
> > 			Gleb.
> 
> 
> Best regards,
> Yang
> 

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 10, 2013, 8:53 a.m. UTC | #6
On Thu, Jan 10, 2013 at 08:31:51AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-10:
> > On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
> >> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> >> +{
> >> +	u32 exec_control;
> >> +	int msr;
> >> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +
> >> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
> >> +		return;
> >> +
> >> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> >> +	/* virtualize x2apic mode relies on tpr shadow */
> >> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
> >> +		return;
> >> +
> >> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> >> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> >> +	vmx->virtual_x2apic_enabled = true;
> >> +
> >> +	if (!cpu_has_vmx_virtual_intr_delivery())
> >> +		return;
> >> +
> >> +	for (msr = 0x800; msr <= 0x8ff; msr++)
> >> +		vmx_intercept_for_msr_read(msr, false, false);
> >> +
> >> +	/* APIC ID */
> >> +	vmx_intercept_for_msr_read(0x802, false, true);
> > Why are you enabling apic id read intercept?
> Current code to read apic id in x2apic mode has some hacks:
> 
> if (apic_x2apic_mode(apic))
>        val = kvm_apic_id(apic);
> else
>        val = kvm_apic_id(apic) << 24;
> 
> static inline int kvm_apic_id(struct kvm_lapic *apic)
> {
>         return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> }
> 
> According SPEC, in x2apic mode, the whole id reg is used, but in KVM, it only use the highest eight bits.
> 
Correct. Put the comment above apic id intercept that we do that since
in x2apic mode apic id is not correctly reflected in apic page.

> >> +	/* TMCCT */
> >> +	vmx_intercept_for_msr_read(0x839, false, true);
> >> +	/* TPR */
> >> +	vmx_intercept_for_msr_write(0x808, false, false);
> >> +	/* EOI */
> >> +	vmx_intercept_for_msr_write(0x80b, false, false);
> >> +	/* SELF-IPI */
> >> +	vmx_intercept_for_msr_write(0x83f, false, false);
> >> +
> >> +}
> >> +
> > 
> > --
> > 			Gleb.
> 
> 
> Best regards,
> Yang
> 

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 10, 2013, 11:54 a.m. UTC | #7
Gleb Natapov wrote on 2013-01-10:
> On Thu, Jan 10, 2013 at 08:32:06AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-10:
>>> On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>> 
>>>> basically to benefit from apicv, we need to enable virtualized x2apic mode.
>>>> Currently, we only enable it when guest is really using x2apic.
>>>> 
>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
>>> x2apic:
>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>>>     		   except APIC ID and TMCCT.
>>>>     APIC ID and TMCCT: need software's assistance to get right value.
>>>>     TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
>>>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>> ---
>>>>  arch/x86/include/asm/kvm_host.h |    2 + arch/x86/include/asm/vmx.h
>>>>    |    1 + arch/x86/kvm/lapic.c            |    5 +-
>>>>    arch/x86/kvm/svm.c              |    6 + arch/x86/kvm/vmx.c |  194
>>>>    +++++++++++++++++++++++++++++++++++++-- 5 files
> changed, 200
>>>>  insertions(+), 8 deletions(-)
>>>> diff --git a/arch/x86/include/asm/kvm_host.h
>>>> b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
>>>> a/arch/x86/include/asm/kvm_host.h +++
>>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct
>>>> kvm_x86_ops {
>>>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>>>>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>>>>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>>>> +	void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
>>>> +	void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
>>> Make one callback with enable/disable parameter. And do not forget SVM.
>>> 
>>>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>>>  	int (*get_tdp_level)(void);
>>>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool
> is_mmio);
>>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>>> index 44c3f7e..0a54df0 100644
>>>> --- a/arch/x86/include/asm/vmx.h
>>>> +++ b/arch/x86/include/asm/vmx.h
>>>> @@ -139,6 +139,7 @@
>>>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
>>>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
>>>>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
>>>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
>>>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
>>>>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
>>>>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>>> index 0664c13..ec38906 100644
>>>> --- a/arch/x86/kvm/lapic.c
>>>> +++ b/arch/x86/kvm/lapic.c
>>>> @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu
> *vcpu,
>>> u64 value)
>>>>  		u32 id = kvm_apic_id(apic);
>>>>  		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>>  		kvm_apic_set_ldr(apic, ldr);
>>>> -	}
>>>> +		kvm_x86_ops->enable_virtual_x2apic_mode(vcpu);
>>>> +	} else
>>>> +		kvm_x86_ops->disable_virtual_x2apic_mode(vcpu);
>>>> +
>>> You just broke SVM.
>>>>  	apic->base_address = apic->vcpu->arch.apic_base &
>>>>  			     MSR_IA32_APICBASE_BASE;
>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>> index d29d3cd..0b82cb1 100644
>>>> --- a/arch/x86/kvm/svm.c
>>>> +++ b/arch/x86/kvm/svm.c
>>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
> kvm_vcpu
>>> *vcpu, int tpr, int irr)
>>>>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>>>>  }
>>>> +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	return;
>>>> +}
>>>> +
>>>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
>>>>  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
>>>>  svm_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>>>  update_cr8_intercept,
>>>> +	.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
>>>> 
>>>>  	.set_tss_addr = svm_set_tss_addr,
>>>>  	.get_tdp_level = get_npt_level,
>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>> index 688f43f..b203ce7 100644
>>>> --- a/arch/x86/kvm/vmx.c
>>>> +++ b/arch/x86/kvm/vmx.c
>>>> @@ -433,6 +433,8 @@ struct vcpu_vmx {
>>>> 
>>>>  	bool rdtscp_enabled;
>>>> +	bool virtual_x2apic_enabled;
>>>> +
>>>>  	/* Support for a guest hypervisor (nested VMX) */
>>>>  	struct nested_vmx nested;
>>>>  };
>>>> @@ -767,12 +769,23 @@ static inline bool
>>> cpu_has_vmx_virtualize_apic_accesses(void)
>>>>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>  }
>>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
>>>> +{
>>>> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>> +}
>>>> +
>>>>  static inline bool cpu_has_vmx_apic_register_virt(void)
>>>>  {
>>>>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
>>>>  }
>>>> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
>>>> +{
>>>> +	return false;
>>>> +}
>>>> +
>>>>  static inline bool cpu_has_vmx_flexpriority(void)
>>>>  {
>>>>  	return cpu_has_vmx_tpr_shadow() &&
>>>> @@ -2544,6 +2557,7 @@ static __init int setup_vmcs_config(struct
>>> vmcs_config *vmcs_conf)
>>>>  	if (_cpu_based_exec_control &
>>>>  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 		min2 = 0; 		opt2 =
>>>>  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>>>>  +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>>>>  			SECONDARY_EXEC_WBINVD_EXITING | 	SECONDARY_EXEC_ENABLE_VPID |
>>>>  			SECONDARY_EXEC_ENABLE_EPT | @@ -3731,7 +3745,45 @@ static void
>>>>  free_vpid(struct vcpu_vmx *vmx) 	spin_unlock(&vmx_vpid_lock); }
>>>> -static void __vmx_disable_intercept_for_msr(unsigned long
>>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R	1 +#define MSR_TYPE_W	2
>>>> +static void __vmx_disable_intercept_for_msr(unsigned long
>>>> *msr_bitmap, + 					u32 msr, int type) +{ +	int f = sizeof(unsigned
>>>> long); + +	if (!cpu_has_vmx_msr_bitmap()) +		return; + +	/* +	 * See
>>>> Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals +	 *
>>>> have the write-low and read-high bitmap offsets the wrong way round.
>>>> +	 * We can control MSRs 0x00000000-0x00001fff and
>>>> 0xc0000000-0xc0001fff. +	 */ +	if (msr <= 0x1fff) { +		if (type &
>>>> MSR_TYPE_R) +			/* read-low */ +			__clear_bit(msr, msr_bitmap +
>>>> 0x000 / f); + +		if (type & MSR_TYPE_W) +			/* write-low */ +
>>>> 	__clear_bit(msr, msr_bitmap + 0x800 / f); + +	} else if ((msr >=
>>>> 0xc0000000) && (msr <= 0xc0001fff)) { +		msr &= 0x1fff; +		if (type &
>>>> MSR_TYPE_R) + 	/* read-high */ +			__clear_bit(msr, msr_bitmap +
>>>> 0x400 / f); + +		if (type & MSR_TYPE_W) +			/* write-high */ +
>>>> 	__clear_bit(msr, msr_bitmap + 0xc00 / f); + +	} +} + +static void
>>>> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + 					u32
>>>> msr, int type)
>>>>  {
>>>>  	int f = sizeof(unsigned long);
>>>> @@ -3744,20 +3796,75 @@ static void
>>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>>>>  	 * We can control MSRs 0x00000000-0x00001fff and
>>>>  0xc0000000-0xc0001fff. 	 */ 	if (msr <= 0x1fff) {
>>>> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
>>>> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
>>>> +		if (type & MSR_TYPE_R)
>>>> +			/* read-low */
>>>> +			__set_bit(msr, msr_bitmap + 0x000 / f);
>>>> +
>>>> +		if (type & MSR_TYPE_W)
>>>> +			/* write-low */
>>>> +			__set_bit(msr, msr_bitmap + 0x800 / f);
>>>> +
>>>>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>>>>  		msr &= 0x1fff;
>>>> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
>>>> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
>>>> +		if (type & MSR_TYPE_R)
>>>> +			/* read-high */
>>>> +			__set_bit(msr, msr_bitmap + 0x400 / f);
>>>> +
>>>> +		if (type & MSR_TYPE_W)
>>>> +			/* write-high */
>>>> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
>>>> +
>>>>  	} } + static void vmx_disable_intercept_for_msr(u32 msr, bool
>>>>  longmode_only) { 	if (!longmode_only)
>>>> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
>>>> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +						msr,
>>>> MSR_TYPE_R | MSR_TYPE_W);
>>>> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>> +						msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void
>>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, +					bool
>>>> set) +{ +	if (!longmode_only) { +		if (set) +
>>>> 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_R); +		else +
>>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_R); + +	} +	if (set)
>>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_R); +	else
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_R); +} + +static void vmx_intercept_for_msr_write(u32 msr,
>>>> bool longmode_only, +					bool set) +{ +	if (!longmode_only) { +		if
>>>> (set) + 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
>>>> +					msr, MSR_TYPE_W); +		else +
>>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_W); + +	} +	if (set)
>>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_W); +	else
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_W);
>>>>  }
>>>>  
>>>>  /*
>>>> @@ -3855,6 +3962,7 @@ static u32 vmx_secondary_exec_control(struct
>>> vcpu_vmx *vmx)
>>>>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 	if
>>>>  (!enable_apicv_reg_vid) 		exec_control &=
>>>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +	exec_control &=
>>>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 	return
> exec_control; }
>>>> @@ -6110,6 +6218,76 @@ static void update_cr8_intercept(struct
> kvm_vcpu
>>> *vcpu, int tpr, int irr)
>>>>  	vmcs_write32(TPR_THRESHOLD, irr);
>>>>  }
>>>> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	u32 exec_control;
>>>> +	int msr;
>>>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>> +
>>>> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
>>>> +		return;
>>>> +
>>>> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>>>> +	/* virtualize x2apic mode relies on tpr shadow */
>>>> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
>>>> +		return;
>>>> +
>>>> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>>>> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
>>>> +	vmx->virtual_x2apic_enabled = true;
>>> Why track it?
>> With this flag, we don't need to read vmcs to check whether we enabled
>> virtua x2apic before.
>> 
> Why do you care? Just disabled it regardless.
>>>> +
>>>> +	if (!cpu_has_vmx_virtual_intr_delivery())
>>>> +		return;
>>>> +
>>> You need to test whether vid is enabled, not whether it can be
>>> enabled. And you need to test it at the beginning of the function. If
>>> vid is disabled we do not want to set
>>> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE. Also force disable vid if
>>> !cpu_has_vmx_virtualize_x2apic_mode().
>> vid is not depend on virtualize x2apic.
> It is in our implementation. If vid is enabled and virtualize x2apic is
> not our code will not work if guest uses x2apic.
> 
>> The right logic should be:
>> When register virtualization enabled, read all apic msr(except apic id reg and
> tmcct which have the hook) not cause vmexit and only write TPR not cause
> vmexit.
>> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
>> It's better to put the patch of envirtualize x2apic mode as first patch.
>> 
> There is no point whatsoever to enable virtualize x2apic without
> allowing at least one non intercepted access to x2apic MSR range and
> this is what your patch is doing when run on cpu without vid capability.
No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is enabled.
I am not sure whether I understand your comments right in previous discussion, here is my thinking:
1. enable virtualize x2apic mode if guest is really using x2apic and clear TPR in msr read  and write bitmap. This will benefit reading TPR.
2. If APIC registers virtualization is enabled, clear all bit in rang 0x800-0x8ff(except apic id reg and tmcct).
3. If vid is enabled, clear EOI and SELF IPI in msr write map.

One concern you mentioned is that vid enabled and virtualize x2apic is disabled but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic. This means the vEOI update will never happen. But we still can benefit from interrupt window.

>>>> +	for (msr = 0x800; msr <= 0x8ff; msr++)
>>>> +		vmx_intercept_for_msr_read(msr, false, false); + +	/* APIC ID */
>>>> +	vmx_intercept_for_msr_read(0x802, false, true); +	/* TMCCT */
>>>> +	vmx_intercept_for_msr_read(0x839, false, true); +	/* TPR */
>>>> +	vmx_intercept_for_msr_write(0x808, false, false); +	/* EOI */
>>>> +	vmx_intercept_for_msr_write(0x80b, false, false); +	/* SELF-IPI */
>>>> +	vmx_intercept_for_msr_write(0x83f, false, false); + +} + +static
>>>> void vmx_disable_virtual_x2apic_mode(struct kvm_vcpu *vcpu) +{ +	u32
>>>> second_exec_control; +	int msr; +	struct vcpu_vmx *vmx =
>>>> to_vmx(vcpu); + +	/* If doesn't enable virtual x2apic before, do
>>>> nothing*/ +	if (!vmx->virtual_x2apic_enabled) +		return; +
>>>> +	second_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); +	/*
>>>> disalbe virtual x2apic*/ +	second_exec_control &=
>>>> ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +	second_exec_control |=
>>>> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, second_exec_control);
>>>> +	vmx->virtual_x2apic_enabled = false; + +	if
>>>> (!cpu_has_vmx_virtual_intr_delivery()) +		return; + +	for (msr =
>>>> 0x800; msr <= 0x8ff; msr++) +		vmx_intercept_for_msr_read(msr, false,
>>>> true); + +	/* TPR */ +	vmx_intercept_for_msr_write(0x808, false,
>>>> true); +	/* EOI */ +	vmx_intercept_for_msr_write(0x80b, false, true);
>>>> +	/* SELF-IPI */ +	vmx_intercept_for_msr_write(0x83f, false, true);
>>>> +} +
>>>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { 	u32
>>>>  exit_intr_info; @@ -7373,6 +7551,8 @@ static struct kvm_x86_ops
>>>>  vmx_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>>>  update_cr8_intercept,
>>>> +	.enable_virtual_x2apic_mode = vmx_enable_virtual_x2apic_mode,
>>>> +	.disable_virtual_x2apic_mode = vmx_disable_virtual_x2apic_mode,
>>>> 
>>>>  	.set_tss_addr = vmx_set_tss_addr,
>>>>  	.get_tdp_level = get_ept_level,
>>>> --
>>>> 1.7.1
>>> 
>>> --
>>> 			Gleb.
>> 
>> 
>> Best regards,
>> Yang
>> 
> 
> --
> 			Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Best regards,
Yang

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 10, 2013, 12:16 p.m. UTC | #8
On Thu, Jan 10, 2013 at 11:54:21AM +0000, Zhang, Yang Z wrote:
> >> The right logic should be:
> >> When register virtualization enabled, read all apic msr(except apic id reg and
> > tmcct which have the hook) not cause vmexit and only write TPR not cause
> > vmexit.
> >> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
> >> It's better to put the patch of envirtualize x2apic mode as first patch.
> >> 
> > There is no point whatsoever to enable virtualize x2apic without
> > allowing at least one non intercepted access to x2apic MSR range and
> > this is what your patch is doing when run on cpu without vid capability.
> No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is enabled.
For that you need to disable 808H MSR intercept, which your patch does not do.

> I am not sure whether I understand your comments right in previous discussion, here is my thinking:
> 1. enable virtualize x2apic mode if guest is really using x2apic and clear TPR in msr read  and write bitmap. This will benefit reading TPR.
> 2. If APIC registers virtualization is enabled, clear all bit in rang 0x800-0x8ff(except apic id reg and tmcct).
Clear all read bits in the range.

> 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
> 
Looks OK.

> One concern you mentioned is that vid enabled and virtualize x2apic is disabled but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic. This means the vEOI update will never happen. But we still can benefit from interrupt window.
> 
What interrupt windows? Without virtualized x2apic TPR/EOI
virtualization will not happen and we rely on that in the code.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 10, 2013, 12:22 p.m. UTC | #9
Gleb Natapov wrote on 2013-01-10:
> On Thu, Jan 10, 2013 at 11:54:21AM +0000, Zhang, Yang Z wrote:
>>>> The right logic should be:
>>>> When register virtualization enabled, read all apic msr(except apic id reg and
>>> tmcct which have the hook) not cause vmexit and only write TPR not cause
>>> vmexit.
>>>> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
>>>> It's better to put the patch of envirtualize x2apic mode as first patch.
>>>> 
>>> There is no point whatsoever to enable virtualize x2apic without
>>> allowing at least one non intercepted access to x2apic MSR range and
>>> this is what your patch is doing when run on cpu without vid capability.
>> No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is
> enabled.
> For that you need to disable 808H MSR intercept, which your patch does not do.
I want to do this in next patch.
 
>> I am not sure whether I understand your comments right in previous
>> discussion, here is my thinking: 1. enable virtualize x2apic mode if
>> guest is really using x2apic and clear TPR in msr read  and write
>> bitmap. This will benefit reading TPR. 2. If APIC registers
>> virtualization is enabled, clear all bit in rang
> 0x800-0x8ff(except apic id reg and tmcct).
> Clear all read bits in the range.
> 
>> 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
>> 
> Looks OK.
> 
>> One concern you mentioned is that vid enabled and virtualize x2apic is disabled
> but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic.
> This means the vEOI update will never happen. But we still can benefit from
> interrupt window.
>> 
> What interrupt windows? Without virtualized x2apic TPR/EOI
> virtualization will not happen and we rely on that in the code.
Yes, but we can eliminate vmexit of interrupt window. Interrupt still can delivery to guest without vmexit when guest enables interrupt if vid is enabled. See SDM vol 3, 29.2.2.

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 10, 2013, 12:34 p.m. UTC | #10
On Thu, Jan 10, 2013 at 12:22:51PM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-10:
> > On Thu, Jan 10, 2013 at 11:54:21AM +0000, Zhang, Yang Z wrote:
> >>>> The right logic should be:
> >>>> When register virtualization enabled, read all apic msr(except apic id reg and
> >>> tmcct which have the hook) not cause vmexit and only write TPR not cause
> >>> vmexit.
> >>>> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
> >>>> It's better to put the patch of envirtualize x2apic mode as first patch.
> >>>> 
> >>> There is no point whatsoever to enable virtualize x2apic without
> >>> allowing at least one non intercepted access to x2apic MSR range and
> >>> this is what your patch is doing when run on cpu without vid capability.
> >> No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is
> > enabled.
> > For that you need to disable 808H MSR intercept, which your patch does not do.
> I want to do this in next patch.
>  
Then don't. There is no point in the patch that enables virtualize
x2apic and does not allow at least one non-intercepted MSR access.

> >> I am not sure whether I understand your comments right in previous
> >> discussion, here is my thinking: 1. enable virtualize x2apic mode if
> >> guest is really using x2apic and clear TPR in msr read  and write
> >> bitmap. This will benefit reading TPR. 2. If APIC registers
> >> virtualization is enabled, clear all bit in rang
> > 0x800-0x8ff(except apic id reg and tmcct).
> > Clear all read bits in the range.
> > 
> >> 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
> >> 
> > Looks OK.
> > 
> >> One concern you mentioned is that vid enabled and virtualize x2apic is disabled
> > but guest still use x2apic. In this case, we still use MSR bitmap to intercept x2apic.
> > This means the vEOI update will never happen. But we still can benefit from
> > interrupt window.
> >> 
> > What interrupt windows? Without virtualized x2apic TPR/EOI
> > virtualization will not happen and we rely on that in the code.
> Yes, but we can eliminate vmexit of interrupt window. Interrupt still can delivery to guest without vmexit when guest enables interrupt if vid is enabled. See SDM vol 3, 29.2.2.
> 
What does it matter that you eliminated vmexit of interrupt window if
you broke everything else? The vid patch assumes that apic emulation
either entirely happens in a software when vid is disabled or in a CPU
if vid is enabled. Mixed mode will not work (apic->isr_count = 1 trick
will break things for instance). And it is not worth it to complicate
the code to make it work.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 11, 2013, 2:37 a.m. UTC | #11
Gleb Natapov wrote on 2013-01-10:
> On Thu, Jan 10, 2013 at 08:32:06AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-10:
>>> On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>> 
>>>> basically to benefit from apicv, we need to enable virtualized x2apic mode.
>>>> Currently, we only enable it when guest is really using x2apic.
>>>> 
>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
>>> x2apic:
>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>>>     		   except APIC ID and TMCCT.
>>>>     APIC ID and TMCCT: need software's assistance to get right value.
>>>>     TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
>>>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>> ---
>>>>  arch/x86/include/asm/kvm_host.h |    2 + arch/x86/include/asm/vmx.h
>>>>    |    1 + arch/x86/kvm/lapic.c            |    5 +-
>>>>    arch/x86/kvm/svm.c              |    6 + arch/x86/kvm/vmx.c |  194
>>>>    +++++++++++++++++++++++++++++++++++++-- 5 files
> changed, 200
>>>>  insertions(+), 8 deletions(-)
>>>> diff --git a/arch/x86/include/asm/kvm_host.h
>>>> b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
>>>> a/arch/x86/include/asm/kvm_host.h +++
>>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct
>>>> kvm_x86_ops {
>>>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>>>>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>>>>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>>>> +	void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
>>>> +	void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
>>> Make one callback with enable/disable parameter. And do not forget SVM.
>>> 
>>>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>>>  	int (*get_tdp_level)(void);
>>>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool
> is_mmio);
>>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>>> index 44c3f7e..0a54df0 100644
>>>> --- a/arch/x86/include/asm/vmx.h
>>>> +++ b/arch/x86/include/asm/vmx.h
>>>> @@ -139,6 +139,7 @@
>>>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
>>>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
>>>>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
>>>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
>>>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
>>>>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
>>>>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>>> index 0664c13..ec38906 100644
>>>> --- a/arch/x86/kvm/lapic.c
>>>> +++ b/arch/x86/kvm/lapic.c
>>>> @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu
> *vcpu,
>>> u64 value)
>>>>  		u32 id = kvm_apic_id(apic);
>>>>  		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>>  		kvm_apic_set_ldr(apic, ldr);
>>>> -	}
>>>> +		kvm_x86_ops->enable_virtual_x2apic_mode(vcpu);
>>>> +	} else
>>>> +		kvm_x86_ops->disable_virtual_x2apic_mode(vcpu);
>>>> +
>>> You just broke SVM.
>>>>  	apic->base_address = apic->vcpu->arch.apic_base &
>>>>  			     MSR_IA32_APICBASE_BASE;
>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>> index d29d3cd..0b82cb1 100644
>>>> --- a/arch/x86/kvm/svm.c
>>>> +++ b/arch/x86/kvm/svm.c
>>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
> kvm_vcpu
>>> *vcpu, int tpr, int irr)
>>>>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>>>>  }
>>>> +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	return;
>>>> +}
>>>> +
>>>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
>>>>  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
>>>>  svm_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
>>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
>>>>  update_cr8_intercept,
>>>> +	.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
>>>> 
>>>>  	.set_tss_addr = svm_set_tss_addr,
>>>>  	.get_tdp_level = get_npt_level,
>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>> index 688f43f..b203ce7 100644
>>>> --- a/arch/x86/kvm/vmx.c
>>>> +++ b/arch/x86/kvm/vmx.c
>>>> @@ -433,6 +433,8 @@ struct vcpu_vmx {
>>>> 
>>>>  	bool rdtscp_enabled;
>>>> +	bool virtual_x2apic_enabled;
>>>> +
>>>>  	/* Support for a guest hypervisor (nested VMX) */
>>>>  	struct nested_vmx nested;
>>>>  };
>>>> @@ -767,12 +769,23 @@ static inline bool
>>> cpu_has_vmx_virtualize_apic_accesses(void)
>>>>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>  }
>>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
>>>> +{
>>>> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>> +}
>>>> +
>>>>  static inline bool cpu_has_vmx_apic_register_virt(void)
>>>>  {
>>>>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
>>>>  }
>>>> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
>>>> +{
>>>> +	return false;
>>>> +}
>>>> +
>>>>  static inline bool cpu_has_vmx_flexpriority(void)
>>>>  {
>>>>  	return cpu_has_vmx_tpr_shadow() &&
>>>> @@ -2544,6 +2557,7 @@ static __init int setup_vmcs_config(struct
>>> vmcs_config *vmcs_conf)
>>>>  	if (_cpu_based_exec_control &
>>>>  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 		min2 = 0; 		opt2 =
>>>>  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>>>>  +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>>>>  			SECONDARY_EXEC_WBINVD_EXITING | 	SECONDARY_EXEC_ENABLE_VPID |
>>>>  			SECONDARY_EXEC_ENABLE_EPT | @@ -3731,7 +3745,45 @@ static void
>>>>  free_vpid(struct vcpu_vmx *vmx) 	spin_unlock(&vmx_vpid_lock); }
>>>> -static void __vmx_disable_intercept_for_msr(unsigned long
>>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R	1 +#define MSR_TYPE_W	2
>>>> +static void __vmx_disable_intercept_for_msr(unsigned long
>>>> *msr_bitmap, + 					u32 msr, int type) +{ +	int f = sizeof(unsigned
>>>> long); + +	if (!cpu_has_vmx_msr_bitmap()) +		return; + +	/* +	 * See
>>>> Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals +	 *
>>>> have the write-low and read-high bitmap offsets the wrong way round.
>>>> +	 * We can control MSRs 0x00000000-0x00001fff and
>>>> 0xc0000000-0xc0001fff. +	 */ +	if (msr <= 0x1fff) { +		if (type &
>>>> MSR_TYPE_R) +			/* read-low */ +			__clear_bit(msr, msr_bitmap +
>>>> 0x000 / f); + +		if (type & MSR_TYPE_W) +			/* write-low */ +
>>>> 	__clear_bit(msr, msr_bitmap + 0x800 / f); + +	} else if ((msr >=
>>>> 0xc0000000) && (msr <= 0xc0001fff)) { +		msr &= 0x1fff; +		if (type &
>>>> MSR_TYPE_R) + 	/* read-high */ +			__clear_bit(msr, msr_bitmap +
>>>> 0x400 / f); + +		if (type & MSR_TYPE_W) +			/* write-high */ +
>>>> 	__clear_bit(msr, msr_bitmap + 0xc00 / f); + +	} +} + +static void
>>>> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + 					u32
>>>> msr, int type)
>>>>  {
>>>>  	int f = sizeof(unsigned long);
>>>> @@ -3744,20 +3796,75 @@ static void
>>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>>>>  	 * We can control MSRs 0x00000000-0x00001fff and
>>>>  0xc0000000-0xc0001fff. 	 */ 	if (msr <= 0x1fff) {
>>>> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
>>>> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
>>>> +		if (type & MSR_TYPE_R)
>>>> +			/* read-low */
>>>> +			__set_bit(msr, msr_bitmap + 0x000 / f);
>>>> +
>>>> +		if (type & MSR_TYPE_W)
>>>> +			/* write-low */
>>>> +			__set_bit(msr, msr_bitmap + 0x800 / f);
>>>> +
>>>>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>>>>  		msr &= 0x1fff;
>>>> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
>>>> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
>>>> +		if (type & MSR_TYPE_R)
>>>> +			/* read-high */
>>>> +			__set_bit(msr, msr_bitmap + 0x400 / f);
>>>> +
>>>> +		if (type & MSR_TYPE_W)
>>>> +			/* write-high */
>>>> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
>>>> +
>>>>  	} } + static void vmx_disable_intercept_for_msr(u32 msr, bool
>>>>  longmode_only) { 	if (!longmode_only)
>>>> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
>>>> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +						msr,
>>>> MSR_TYPE_R | MSR_TYPE_W);
>>>> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>> +						msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void
>>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, +					bool
>>>> set) +{ +	if (!longmode_only) { +		if (set) +
>>>> 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_R); +		else +
>>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_R); + +	} +	if (set)
>>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_R); +	else
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_R); +} + +static void vmx_intercept_for_msr_write(u32 msr,
>>>> bool longmode_only, +					bool set) +{ +	if (!longmode_only) { +		if
>>>> (set) + 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
>>>> +					msr, MSR_TYPE_W); +		else +
>>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
>>>> MSR_TYPE_W); + +	} +	if (set)
>>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_W); +	else
>>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
>>>> MSR_TYPE_W);
>>>>  }
>>>>  
>>>>  /*
>>>> @@ -3855,6 +3962,7 @@ static u32 vmx_secondary_exec_control(struct
>>> vcpu_vmx *vmx)
>>>>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 	if
>>>>  (!enable_apicv_reg_vid) 		exec_control &=
>>>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +	exec_control &=
>>>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 	return
> exec_control; }
>>>> @@ -6110,6 +6218,76 @@ static void update_cr8_intercept(struct
> kvm_vcpu
>>> *vcpu, int tpr, int irr)
>>>>  	vmcs_write32(TPR_THRESHOLD, irr);
>>>>  }
>>>> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	u32 exec_control;
>>>> +	int msr;
>>>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>> +
>>>> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
>>>> +		return;
>>>> +
>>>> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>>>> +	/* virtualize x2apic mode relies on tpr shadow */
>>>> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
>>>> +		return;
>>>> +
>>>> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>>>> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
>>>> +	vmx->virtual_x2apic_enabled = true;
>>> Why track it?
>> With this flag, we don't need to read vmcs to check whether we enabled
>> virtua x2apic before.
>> 
> Why do you care? Just disabled it regardless.
kvm_lapic_set_base will be called when creating lapic. At that time, vcpu didn't initialized. Then read/write vmcs in vmx_disable_virtual_x2apic_mode will cause error.
With this flag, we only disable the virtual x2apic mode if it is enabled before. 

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 11, 2013, 7:36 a.m. UTC | #12
Gleb Natapov wrote on 2013-01-10:
> On Thu, Jan 10, 2013 at 12:22:51PM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-10:
>>> On Thu, Jan 10, 2013 at 11:54:21AM +0000, Zhang, Yang Z wrote:
>>>>>> The right logic should be:
>>>>>> When register virtualization enabled, read all apic msr(except apic id reg
> and
>>>>> tmcct which have the hook) not cause vmexit and only write TPR not cause
>>>>> vmexit.
>>>>>> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
>>>>>> It's better to put the patch of envirtualize x2apic mode as first patch.
>>>>>> 
>>>>> There is no point whatsoever to enable virtualize x2apic without
>>>>> allowing at least one non intercepted access to x2apic MSR range and
>>>>> this is what your patch is doing when run on cpu without vid capability.
>>>> No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is
>>> enabled.
>>> For that you need to disable 808H MSR intercept, which your patch does not
> do.
>> I want to do this in next patch.
>> 
> Then don't. There is no point in the patch that enables virtualize
> x2apic and does not allow at least one non-intercepted MSR access.
As I said, read/write TPR will not cause vmexit if virtual x2apic is set.

>>>> I am not sure whether I understand your comments right in previous
>>>> discussion, here is my thinking: 1. enable virtualize x2apic mode if
>>>> guest is really using x2apic and clear TPR in msr read  and write
>>>> bitmap. This will benefit reading TPR. 2. If APIC registers
>>>> virtualization is enabled, clear all bit in rang
>>> 0x800-0x8ff(except apic id reg and tmcct).
>>> Clear all read bits in the range.
>>> 
>>>> 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
>>>> 
>>> Looks OK.
>>> 
>>>> One concern you mentioned is that vid enabled and virtualize x2apic is
> disabled
>>> but guest still use x2apic. In this case, we still use MSR bitmap to
>>> intercept x2apic. This means the vEOI update will never happen. But we
>>> still can benefit from interrupt window.
>>>> 
>>> What interrupt windows? Without virtualized x2apic TPR/EOI
>>> virtualization will not happen and we rely on that in the code.
>> Yes, but we can eliminate vmexit of interrupt window. Interrupt still can
> delivery to guest without vmexit when guest enables interrupt if vid is enabled.
> See SDM vol 3, 29.2.2.
>> 
> What does it matter that you eliminated vmexit of interrupt window if
> you broke everything else? The vid patch assumes that apic emulation
> either entirely happens in a software when vid is disabled or in a CPU
> if vid is enabled. Mixed mode will not work (apic->isr_count = 1 trick
> will break things for instance). And it is not worth it to complicate
> the code to make it work.
Yes, you are right. It too complicated.
Another question? Why not to hide x2apic capability to guest when vid is supported and virtual x2apic mode is not supported? It should be more reasonable than disable vid when virtual x2apic mode is unavailable.

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 11, 2013, 1:51 p.m. UTC | #13
On Fri, Jan 11, 2013 at 02:37:15AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-10:
> > On Thu, Jan 10, 2013 at 08:32:06AM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-01-10:
> >>> On Thu, Jan 10, 2013 at 03:26:07PM +0800, Yang Zhang wrote:
> >>>> From: Yang Zhang <yang.z.zhang@Intel.com>
> >>>> 
> >>>> basically to benefit from apicv, we need to enable virtualized x2apic mode.
> >>>> Currently, we only enable it when guest is really using x2apic.
> >>>> 
> >>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled
> >>> x2apic:
> >>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
> >>>>     		   except APIC ID and TMCCT.
> >>>>     APIC ID and TMCCT: need software's assistance to get right value.
> >>>>     TPR,EOI,SELF-IPI: no write intercept for virtual interrupt delivery.
> >>>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> >>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >>>> ---
> >>>>  arch/x86/include/asm/kvm_host.h |    2 + arch/x86/include/asm/vmx.h
> >>>>    |    1 + arch/x86/kvm/lapic.c            |    5 +-
> >>>>    arch/x86/kvm/svm.c              |    6 + arch/x86/kvm/vmx.c |  194
> >>>>    +++++++++++++++++++++++++++++++++++++-- 5 files
> > changed, 200
> >>>>  insertions(+), 8 deletions(-)
> >>>> diff --git a/arch/x86/include/asm/kvm_host.h
> >>>> b/arch/x86/include/asm/kvm_host.h index c431b33..572a562 100644 ---
> >>>> a/arch/x86/include/asm/kvm_host.h +++
> >>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,8 @@ struct
> >>>> kvm_x86_ops {
> >>>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
> >>>>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
> >>>>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> >>>> +	void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
> >>>> +	void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
> >>> Make one callback with enable/disable parameter. And do not forget SVM.
> >>> 
> >>>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> >>>>  	int (*get_tdp_level)(void);
> >>>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool
> > is_mmio);
> >>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> >>>> index 44c3f7e..0a54df0 100644
> >>>> --- a/arch/x86/include/asm/vmx.h
> >>>> +++ b/arch/x86/include/asm/vmx.h
> >>>> @@ -139,6 +139,7 @@
> >>>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
> >>>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
> >>>>  SECONDARY_EXEC_RDTSCP			0x00000008 +#define
> >>>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
> >>>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
> >>>>  SECONDARY_EXEC_WBINVD_EXITING		0x00000040 #define
> >>>>  SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> >>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >>>> index 0664c13..ec38906 100644
> >>>> --- a/arch/x86/kvm/lapic.c
> >>>> +++ b/arch/x86/kvm/lapic.c
> >>>> @@ -1328,7 +1328,10 @@ void kvm_lapic_set_base(struct kvm_vcpu
> > *vcpu,
> >>> u64 value)
> >>>>  		u32 id = kvm_apic_id(apic);
> >>>>  		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> >>>>  		kvm_apic_set_ldr(apic, ldr);
> >>>> -	}
> >>>> +		kvm_x86_ops->enable_virtual_x2apic_mode(vcpu);
> >>>> +	} else
> >>>> +		kvm_x86_ops->disable_virtual_x2apic_mode(vcpu);
> >>>> +
> >>> You just broke SVM.
> >>>>  	apic->base_address = apic->vcpu->arch.apic_base &
> >>>>  			     MSR_IA32_APICBASE_BASE;
> >>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> >>>> index d29d3cd..0b82cb1 100644
> >>>> --- a/arch/x86/kvm/svm.c
> >>>> +++ b/arch/x86/kvm/svm.c
> >>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
> > kvm_vcpu
> >>> *vcpu, int tpr, int irr)
> >>>>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
> >>>>  }
> >>>> +static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> >>>> +{
> >>>> +	return;
> >>>> +}
> >>>> +
> >>>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
> >>>>  *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops
> >>>>  svm_x86_ops = { 	.enable_nmi_window = enable_nmi_window,
> >>>>  	.enable_irq_window = enable_irq_window, 	.update_cr8_intercept =
> >>>>  update_cr8_intercept,
> >>>> +	.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
> >>>> 
> >>>>  	.set_tss_addr = svm_set_tss_addr,
> >>>>  	.get_tdp_level = get_npt_level,
> >>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >>>> index 688f43f..b203ce7 100644
> >>>> --- a/arch/x86/kvm/vmx.c
> >>>> +++ b/arch/x86/kvm/vmx.c
> >>>> @@ -433,6 +433,8 @@ struct vcpu_vmx {
> >>>> 
> >>>>  	bool rdtscp_enabled;
> >>>> +	bool virtual_x2apic_enabled;
> >>>> +
> >>>>  	/* Support for a guest hypervisor (nested VMX) */
> >>>>  	struct nested_vmx nested;
> >>>>  };
> >>>> @@ -767,12 +769,23 @@ static inline bool
> >>> cpu_has_vmx_virtualize_apic_accesses(void)
> >>>>  		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >>>>  }
> >>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
> >>>> +{
> >>>> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> >>>> +		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >>>> +}
> >>>> +
> >>>>  static inline bool cpu_has_vmx_apic_register_virt(void)
> >>>>  {
> >>>>  	return vmcs_config.cpu_based_2nd_exec_ctrl &
> >>>>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
> >>>>  }
> >>>> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
> >>>> +{
> >>>> +	return false;
> >>>> +}
> >>>> +
> >>>>  static inline bool cpu_has_vmx_flexpriority(void)
> >>>>  {
> >>>>  	return cpu_has_vmx_tpr_shadow() &&
> >>>> @@ -2544,6 +2557,7 @@ static __init int setup_vmcs_config(struct
> >>> vmcs_config *vmcs_conf)
> >>>>  	if (_cpu_based_exec_control &
> >>>>  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 		min2 = 0; 		opt2 =
> >>>>  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> >>>>  +			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
> >>>>  			SECONDARY_EXEC_WBINVD_EXITING | 	SECONDARY_EXEC_ENABLE_VPID |
> >>>>  			SECONDARY_EXEC_ENABLE_EPT | @@ -3731,7 +3745,45 @@ static void
> >>>>  free_vpid(struct vcpu_vmx *vmx) 	spin_unlock(&vmx_vpid_lock); }
> >>>> -static void __vmx_disable_intercept_for_msr(unsigned long
> >>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R	1 +#define MSR_TYPE_W	2
> >>>> +static void __vmx_disable_intercept_for_msr(unsigned long
> >>>> *msr_bitmap, + 					u32 msr, int type) +{ +	int f = sizeof(unsigned
> >>>> long); + +	if (!cpu_has_vmx_msr_bitmap()) +		return; + +	/* +	 * See
> >>>> Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals +	 *
> >>>> have the write-low and read-high bitmap offsets the wrong way round.
> >>>> +	 * We can control MSRs 0x00000000-0x00001fff and
> >>>> 0xc0000000-0xc0001fff. +	 */ +	if (msr <= 0x1fff) { +		if (type &
> >>>> MSR_TYPE_R) +			/* read-low */ +			__clear_bit(msr, msr_bitmap +
> >>>> 0x000 / f); + +		if (type & MSR_TYPE_W) +			/* write-low */ +
> >>>> 	__clear_bit(msr, msr_bitmap + 0x800 / f); + +	} else if ((msr >=
> >>>> 0xc0000000) && (msr <= 0xc0001fff)) { +		msr &= 0x1fff; +		if (type &
> >>>> MSR_TYPE_R) + 	/* read-high */ +			__clear_bit(msr, msr_bitmap +
> >>>> 0x400 / f); + +		if (type & MSR_TYPE_W) +			/* write-high */ +
> >>>> 	__clear_bit(msr, msr_bitmap + 0xc00 / f); + +	} +} + +static void
> >>>> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + 					u32
> >>>> msr, int type)
> >>>>  {
> >>>>  	int f = sizeof(unsigned long);
> >>>> @@ -3744,20 +3796,75 @@ static void
> >>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> >>>>  	 * We can control MSRs 0x00000000-0x00001fff and
> >>>>  0xc0000000-0xc0001fff. 	 */ 	if (msr <= 0x1fff) {
> >>>> -		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
> >>>> -		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
> >>>> +		if (type & MSR_TYPE_R)
> >>>> +			/* read-low */
> >>>> +			__set_bit(msr, msr_bitmap + 0x000 / f);
> >>>> +
> >>>> +		if (type & MSR_TYPE_W)
> >>>> +			/* write-low */
> >>>> +			__set_bit(msr, msr_bitmap + 0x800 / f);
> >>>> +
> >>>>  	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> >>>>  		msr &= 0x1fff;
> >>>> -		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
> >>>> -		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
> >>>> +		if (type & MSR_TYPE_R)
> >>>> +			/* read-high */
> >>>> +			__set_bit(msr, msr_bitmap + 0x400 / f);
> >>>> +
> >>>> +		if (type & MSR_TYPE_W)
> >>>> +			/* write-high */
> >>>> +			__set_bit(msr, msr_bitmap + 0xc00 / f);
> >>>> +
> >>>>  	} } + static void vmx_disable_intercept_for_msr(u32 msr, bool
> >>>>  longmode_only) { 	if (!longmode_only)
> >>>> -		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
> >>>> -	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
> >>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +						msr,
> >>>> MSR_TYPE_R | MSR_TYPE_W);
> >>>> +	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> >>>> +						msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void
> >>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, +					bool
> >>>> set) +{ +	if (!longmode_only) { +		if (set) +
> >>>> 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
> >>>> MSR_TYPE_R); +		else +
> >>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
> >>>> MSR_TYPE_R); + +	} +	if (set)
> >>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_R); +	else
> >>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_R); +} + +static void vmx_intercept_for_msr_write(u32 msr,
> >>>> bool longmode_only, +					bool set) +{ +	if (!longmode_only) { +		if
> >>>> (set) + 	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
> >>>> +					msr, MSR_TYPE_W); +		else +
> >>>> 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +					msr,
> >>>> MSR_TYPE_W); + +	} +	if (set)
> >>>> +		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_W); +	else
> >>>> +		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, +				msr,
> >>>> MSR_TYPE_W);
> >>>>  }
> >>>>  
> >>>>  /*
> >>>> @@ -3855,6 +3962,7 @@ static u32 vmx_secondary_exec_control(struct
> >>> vcpu_vmx *vmx)
> >>>>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 	if
> >>>>  (!enable_apicv_reg_vid) 		exec_control &=
> >>>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +	exec_control &=
> >>>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 	return
> > exec_control; }
> >>>> @@ -6110,6 +6218,76 @@ static void update_cr8_intercept(struct
> > kvm_vcpu
> >>> *vcpu, int tpr, int irr)
> >>>>  	vmcs_write32(TPR_THRESHOLD, irr);
> >>>>  }
> >>>> +static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
> >>>> +{
> >>>> +	u32 exec_control;
> >>>> +	int msr;
> >>>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >>>> +
> >>>> +	if (!cpu_has_vmx_virtualize_x2apic_mode())
> >>>> +		return;
> >>>> +
> >>>> +	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> >>>> +	/* virtualize x2apic mode relies on tpr shadow */
> >>>> +	if (!(exec_control & CPU_BASED_TPR_SHADOW))
> >>>> +		return;
> >>>> +
> >>>> +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> >>>> +	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> >>>> +	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> >>>> +	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> >>>> +	vmx->virtual_x2apic_enabled = true;
> >>> Why track it?
> >> With this flag, we don't need to read vmcs to check whether we enabled
> >> virtua x2apic before.
> >> 
> > Why do you care? Just disabled it regardless.
> kvm_lapic_set_base will be called when creating lapic. At that time, vcpu didn't initialized. Then read/write vmcs in vmx_disable_virtual_x2apic_mode will cause error.
> With this flag, we only disable the virtual x2apic mode if it is enabled before. 
> 
Then call vmx_enable_virtual_x2apic_mode() only when mode actually changes.
kvm_lapic_set_base() can track it like it does to MSR_IA32_APICBASE_ENABLE.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Jan. 11, 2013, 4:54 p.m. UTC | #14
On Fri, Jan 11, 2013 at 07:36:44AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-01-10:
> > On Thu, Jan 10, 2013 at 12:22:51PM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-01-10:
> >>> On Thu, Jan 10, 2013 at 11:54:21AM +0000, Zhang, Yang Z wrote:
> >>>>>> The right logic should be:
> >>>>>> When register virtualization enabled, read all apic msr(except apic id reg
> > and
> >>>>> tmcct which have the hook) not cause vmexit and only write TPR not cause
> >>>>> vmexit.
> >>>>>> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
> >>>>>> It's better to put the patch of envirtualize x2apic mode as first patch.
> >>>>>> 
> >>>>> There is no point whatsoever to enable virtualize x2apic without
> >>>>> allowing at least one non intercepted access to x2apic MSR range and
> >>>>> this is what your patch is doing when run on cpu without vid capability.
> >>>> No, at least read/write TPR will not cause vmexit if virtualize x2apic mode is
> >>> enabled.
> >>> For that you need to disable 808H MSR intercept, which your patch does not
> > do.
> >> I want to do this in next patch.
> >> 
> > Then don't. There is no point in the patch that enables virtualize
> > x2apic and does not allow at least one non-intercepted MSR access.
> As I said, read/write TPR will not cause vmexit if virtual x2apic is set.
> 
From my reading of the spec you need to disable 808H intercept for that.
Is this not the case?

> >>>> I am not sure whether I understand your comments right in previous
> >>>> discussion, here is my thinking: 1. enable virtualize x2apic mode if
> >>>> guest is really using x2apic and clear TPR in msr read  and write
> >>>> bitmap. This will benefit reading TPR. 2. If APIC registers
> >>>> virtualization is enabled, clear all bit in rang
> >>> 0x800-0x8ff(except apic id reg and tmcct).
> >>> Clear all read bits in the range.
> >>> 
> >>>> 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
> >>>> 
> >>> Looks OK.
> >>> 
> >>>> One concern you mentioned is that vid enabled and virtualize x2apic is
> > disabled
> >>> but guest still use x2apic. In this case, we still use MSR bitmap to
> >>> intercept x2apic. This means the vEOI update will never happen. But we
> >>> still can benefit from interrupt window.
> >>>> 
> >>> What interrupt windows? Without virtualized x2apic TPR/EOI
> >>> virtualization will not happen and we rely on that in the code.
> >> Yes, but we can eliminate vmexit of interrupt window. Interrupt still can
> > delivery to guest without vmexit when guest enables interrupt if vid is enabled.
> > See SDM vol 3, 29.2.2.
> >> 
> > What does it matter that you eliminated vmexit of interrupt window if
> > you broke everything else? The vid patch assumes that apic emulation
> > either entirely happens in a software when vid is disabled or in a CPU
> > if vid is enabled. Mixed mode will not work (apic->isr_count = 1 trick
> > will break things for instance). And it is not worth it to complicate
> > the code to make it work.
> Yes, you are right. It too complicated.
> Another question? Why not to hide x2apic capability to guest when vid is supported and virtual x2apic mode is not supported? It should be more reasonable than disable vid when virtual x2apic mode is unavailable.
> 
Because I think there will be no such HW. If such HW expected to be
common then we can go this route.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 14, 2013, 1:03 a.m. UTC | #15
Gleb Natapov wrote on 2013-01-12:
> On Fri, Jan 11, 2013 at 07:36:44AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-10:
>>> On Thu, Jan 10, 2013 at 12:22:51PM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-01-10:
>>>>> On Thu, Jan 10, 2013 at 11:54:21AM +0000, Zhang, Yang Z wrote:
>>>>>>>> The right logic should be:
>>>>>>>> When register virtualization enabled, read all apic msr(except apic id
> reg
>>> and
>>>>>>> tmcct which have the hook) not cause vmexit and only write TPR not
>>>>>>> cause vmexit.
>>>>>>>> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
>>>>>>>> It's better to put the patch of envirtualize x2apic mode as first patch.
>>>>>>>> 
>>>>>>> There is no point whatsoever to enable virtualize x2apic without
>>>>>>> allowing at least one non intercepted access to x2apic MSR range and
>>>>>>> this is what your patch is doing when run on cpu without vid capability.
>>>>>> No, at least read/write TPR will not cause vmexit if virtualize x2apic mode
> is
>>>>> enabled.
>>>>> For that you need to disable 808H MSR intercept, which your patch does
> not
>>> do.
>>>> I want to do this in next patch.
>>>> 
>>> Then don't. There is no point in the patch that enables virtualize
>>> x2apic and does not allow at least one non-intercepted MSR access.
>> As I said, read/write TPR will not cause vmexit if virtual x2apic is set.
>> 
> From my reading of the spec you need to disable 808H intercept for that.
> Is this not the case?
> 
>>>>>> I am not sure whether I understand your comments right in previous
>>>>>> discussion, here is my thinking: 1. enable virtualize x2apic mode if
>>>>>> guest is really using x2apic and clear TPR in msr read  and write
>>>>>> bitmap. This will benefit reading TPR. 2. If APIC registers
>>>>>> virtualization is enabled, clear all bit in rang
>>>>> 0x800-0x8ff(except apic id reg and tmcct).
>>>>> Clear all read bits in the range.
>>>>> 
>>>>>> 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
>>>>>> 
>>>>> Looks OK.
>>>>> 
>>>>>> One concern you mentioned is that vid enabled and virtualize x2apic is
>>> disabled
>>>>> but guest still use x2apic. In this case, we still use MSR bitmap to
>>>>> intercept x2apic. This means the vEOI update will never happen. But we
>>>>> still can benefit from interrupt window.
>>>>>> 
>>>>> What interrupt windows? Without virtualized x2apic TPR/EOI
>>>>> virtualization will not happen and we rely on that in the code.
>>>> Yes, but we can eliminate vmexit of interrupt window. Interrupt still can
>>> delivery to guest without vmexit when guest enables interrupt if vid
>>> is enabled. See SDM vol 3, 29.2.2.
>>>> 
>>> What does it matter that you eliminated vmexit of interrupt window if
>>> you broke everything else? The vid patch assumes that apic emulation
>>> either entirely happens in a software when vid is disabled or in a CPU
>>> if vid is enabled. Mixed mode will not work (apic->isr_count = 1 trick
>>> will break things for instance). And it is not worth it to complicate
>>> the code to make it work.
>> Yes, you are right. It too complicated.
>> Another question? Why not to hide x2apic capability to guest when vid is
> supported and virtual x2apic mode is not supported? It should be more
> reasonable than disable vid when virtual x2apic mode is unavailable.
>> 
> Because I think there will be no such HW. If such HW expected to be
> common then we can go this route.
Yes. No HW will support vid without supporting virtual x2apic mode. So we just ignore this case.

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Jan. 14, 2013, 1:14 a.m. UTC | #16
Gleb Natapov wrote on 2013-01-12:
> On Fri, Jan 11, 2013 at 07:36:44AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-10:
>>> On Thu, Jan 10, 2013 at 12:22:51PM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-01-10:
>>>>> On Thu, Jan 10, 2013 at 11:54:21AM +0000, Zhang, Yang Z wrote:
>>>>>>>> The right logic should be:
>>>>>>>> When register virtualization enabled, read all apic msr(except apic id
> reg
>>> and
>>>>>>> tmcct which have the hook) not cause vmexit and only write TPR not
>>>>>>> cause vmexit.
>>>>>>>> When vid enabled, write to TPR, EOI and SELF IPI not cause vmexit.
>>>>>>>> It's better to put the patch of envirtualize x2apic mode as first patch.
>>>>>>>> 
>>>>>>> There is no point whatsoever to enable virtualize x2apic without
>>>>>>> allowing at least one non intercepted access to x2apic MSR range and
>>>>>>> this is what your patch is doing when run on cpu without vid capability.
>>>>>> No, at least read/write TPR will not cause vmexit if virtualize x2apic mode
> is
>>>>> enabled.
>>>>> For that you need to disable 808H MSR intercept, which your patch does
> not
>>> do.
>>>> I want to do this in next patch.
>>>> 
>>> Then don't. There is no point in the patch that enables virtualize
>>> x2apic and does not allow at least one non-intercepted MSR access.
>> As I said, read/write TPR will not cause vmexit if virtual x2apic is set.
>> 
> From my reading of the spec you need to disable 808H intercept for that.
> Is this not the case?
After thinking, since Linux doesn't use TPR, there is no point to do this. No real guest will benefit from this.

>>>>>> I am not sure whether I understand your comments right in previous
>>>>>> discussion, here is my thinking: 1. enable virtualize x2apic mode if
>>>>>> guest is really using x2apic and clear TPR in msr read  and write
>>>>>> bitmap. This will benefit reading TPR. 2. If APIC registers
>>>>>> virtualization is enabled, clear all bit in rang
>>>>> 0x800-0x8ff(except apic id reg and tmcct).
>>>>> Clear all read bits in the range.
>>>>> 
>>>>>> 3. If vid is enabled, clear EOI and SELF IPI in msr write map.
>>>>>> 
>>>>> Looks OK.
>>>>> 
>>>>>> One concern you mentioned is that vid enabled and virtualize x2apic is
>>> disabled
>>>>> but guest still use x2apic. In this case, we still use MSR bitmap to
>>>>> intercept x2apic. This means the vEOI update will never happen. But we
>>>>> still can benefit from interrupt window.
>>>>>> 
>>>>> What interrupt windows? Without virtualized x2apic TPR/EOI
>>>>> virtualization will not happen and we rely on that in the code.
>>>> Yes, but we can eliminate vmexit of interrupt window. Interrupt still can
>>> delivery to guest without vmexit when guest enables interrupt if vid
>>> is enabled. See SDM vol 3, 29.2.2.
>>>> 
>>> What does it matter that you eliminated vmexit of interrupt window if
>>> you broke everything else? The vid patch assumes that apic emulation
>>> either entirely happens in a software when vid is disabled or in a CPU
>>> if vid is enabled. Mixed mode will not work (apic->isr_count = 1 trick
>>> will break things for instance). And it is not worth it to complicate
>>> the code to make it work.
>> Yes, you are right. It too complicated.
>> Another question? Why not to hide x2apic capability to guest when vid is
> supported and virtual x2apic mode is not supported? It should be more
> reasonable than disable vid when virtual x2apic mode is unavailable.
>> 
> Because I think there will be no such HW. If such HW expected to be
> common then we can go this route.
> 
> --
> 			Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c431b33..572a562 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -697,6 +697,8 @@  struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	void (*enable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
+	void (*disable_virtual_x2apic_mode)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 44c3f7e..0a54df0 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -139,6 +139,7 @@ 
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_RDTSCP			0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0664c13..ec38906 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1328,7 +1328,10 @@  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 		u32 id = kvm_apic_id(apic);
 		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
 		kvm_apic_set_ldr(apic, ldr);
-	}
+		kvm_x86_ops->enable_virtual_x2apic_mode(vcpu);
+	} else
+		kvm_x86_ops->disable_virtual_x2apic_mode(vcpu);
+
 	apic->base_address = apic->vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd..0b82cb1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,11 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
+static void svm_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
+{
+	return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4295,7 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.enable_virtual_x2apic_mode = svm_enable_virtual_x2apic_mode,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 688f43f..b203ce7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -433,6 +433,8 @@  struct vcpu_vmx {
 
 	bool rdtscp_enabled;
 
+	bool virtual_x2apic_enabled;
+
 	/* Support for a guest hypervisor (nested VMX) */
 	struct nested_vmx nested;
 };
@@ -767,12 +769,23 @@  static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }
 
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
 static inline bool cpu_has_vmx_apic_register_virt(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_APIC_REGISTER_VIRT;
 }
 
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+	return false;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -2544,6 +2557,7 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
 		min2 = 0;
 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 			SECONDARY_EXEC_WBINVD_EXITING |
 			SECONDARY_EXEC_ENABLE_VPID |
 			SECONDARY_EXEC_ENABLE_EPT |
@@ -3731,7 +3745,45 @@  static void free_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+#define MSR_TYPE_R	1
+#define MSR_TYPE_W	2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__clear_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__clear_bit(msr, msr_bitmap + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__clear_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+	}
+}
+
+static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
 {
 	int f = sizeof(unsigned long);
 
@@ -3744,20 +3796,75 @@  static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 	 */
 	if (msr <= 0x1fff) {
-		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
-		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__set_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__set_bit(msr, msr_bitmap + 0x800 / f);
+
 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 		msr &= 0x1fff;
-		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
-		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__set_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__set_bit(msr, msr_bitmap + 0xc00 / f);
+
 	}
 }
 
+
 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
 	if (!longmode_only)
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_intercept_for_msr_read(u32 msr, bool longmode_only,
+					bool set)
+{
+	if (!longmode_only) {
+		if (set)
+			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_R);
+		else
+			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_R);
+
+	}
+	if (set)
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_R);
+	else
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_R);
+}
+
+static void vmx_intercept_for_msr_write(u32 msr, bool longmode_only,
+					bool set)
+{
+	if (!longmode_only) {
+		if (set)
+			__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_W);
+		else
+			__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+					msr, MSR_TYPE_W);
+
+	}
+	if (set)
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_W);
+	else
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+				msr, MSR_TYPE_W);
 }
 
 /*
@@ -3855,6 +3962,7 @@  static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 	if (!enable_apicv_reg_vid)
 		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 	return exec_control;
 }
 
@@ -6110,6 +6218,76 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
+static void vmx_enable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
+{
+	u32 exec_control;
+	int msr;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!cpu_has_vmx_virtualize_x2apic_mode())
+		return;
+
+	exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	/* virtualize x2apic mode relies on tpr shadow */
+	if (!(exec_control & CPU_BASED_TPR_SHADOW))
+		return;
+
+	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	vmx->virtual_x2apic_enabled = true;
+
+	if (!cpu_has_vmx_virtual_intr_delivery())
+		return;
+
+	for (msr = 0x800; msr <= 0x8ff; msr++)
+		vmx_intercept_for_msr_read(msr, false, false);
+
+	/* APIC ID */
+	vmx_intercept_for_msr_read(0x802, false, true);
+	/* TMCCT */
+	vmx_intercept_for_msr_read(0x839, false, true);
+	/* TPR */
+	vmx_intercept_for_msr_write(0x808, false, false);
+	/* EOI */
+	vmx_intercept_for_msr_write(0x80b, false, false);
+	/* SELF-IPI */
+	vmx_intercept_for_msr_write(0x83f, false, false);
+
+}
+
+static void vmx_disable_virtual_x2apic_mode(struct kvm_vcpu *vcpu)
+{
+	u32 second_exec_control;
+	int msr;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* If doesn't enable virtual x2apic before, do nothing*/
+	if (!vmx->virtual_x2apic_enabled)
+		return;
+
+	second_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	/* disalbe virtual x2apic*/
+	second_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+	second_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, second_exec_control);
+	vmx->virtual_x2apic_enabled = false;
+
+	if (!cpu_has_vmx_virtual_intr_delivery())
+		return;
+
+	for (msr = 0x800; msr <= 0x8ff; msr++)
+		vmx_intercept_for_msr_read(msr, false, true);
+
+	/* TPR */
+	vmx_intercept_for_msr_write(0x808, false, true);
+	/* EOI */
+	vmx_intercept_for_msr_write(0x80b, false, true);
+	/* SELF-IPI */
+	vmx_intercept_for_msr_write(0x83f, false, true);
+}
+
 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -7373,6 +7551,8 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.enable_virtual_x2apic_mode = vmx_enable_virtual_x2apic_mode,
+	.disable_virtual_x2apic_mode = vmx_disable_virtual_x2apic_mode,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,