[v2,3/6] x86, apicv: add virtual interrupt delivery support

Message ID 1353485379-6823-4-git-send-email-yang.z.zhang@intel.com (mailing list archive)
State New, archived

Commit Message

Zhang, Yang Z Nov. 21, 2012, 8:09 a.m. UTC
Virtual interrupt delivery avoids the need for KVM to inject vAPIC
interrupts manually; that is fully taken care of by the hardware. This
needs some special awareness in the existing interrupt injection path:

- for a pending interrupt, instead of direct injection, we may need to
  update architecture-specific indicators before resuming to the guest.

- A pending interrupt that is masked by the ISR should also be
  considered in the above update action, since the hardware will decide
  when to inject it at the right time. The current has_interrupt and
  get_interrupt only return a valid vector from the injection p.o.v.

Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
---
 arch/x86/include/asm/kvm_host.h |    4 +
 arch/x86/include/asm/vmx.h      |   11 ++++
 arch/x86/kvm/irq.c              |   44 ++++++++++++++
 arch/x86/kvm/lapic.c            |   44 +++++++++++++-
 arch/x86/kvm/lapic.h            |   13 ++++
 arch/x86/kvm/svm.c              |    6 ++
 arch/x86/kvm/vmx.c              |  125 ++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              |   16 +++++-
 virt/kvm/ioapic.c               |    1 +
 9 files changed, 260 insertions(+), 4 deletions(-)

Comments

Gleb Natapov Nov. 22, 2012, 1:57 p.m. UTC | #1
On Wed, Nov 21, 2012 at 04:09:36PM +0800, Yang Zhang wrote:
> Virtual interrupt delivery avoids the need for KVM to inject vAPIC
> interrupts manually; that is fully taken care of by the hardware. This
> needs some special awareness in the existing interrupt injection path:
> 
> - for a pending interrupt, instead of direct injection, we may need to
>   update architecture-specific indicators before resuming to the guest.
> 
> - A pending interrupt that is masked by the ISR should also be
>   considered in the above update action, since the hardware will decide
>   when to inject it at the right time. The current has_interrupt and
>   get_interrupt only return a valid vector from the injection p.o.v.
> 
> Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    4 +
>  arch/x86/include/asm/vmx.h      |   11 ++++
>  arch/x86/kvm/irq.c              |   44 ++++++++++++++
>  arch/x86/kvm/lapic.c            |   44 +++++++++++++-
>  arch/x86/kvm/lapic.h            |   13 ++++
>  arch/x86/kvm/svm.c              |    6 ++
>  arch/x86/kvm/vmx.c              |  125 ++++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              |   16 +++++-
>  virt/kvm/ioapic.c               |    1 +
>  9 files changed, 260 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b2e11f4..8e07a86 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -682,6 +682,10 @@ struct kvm_x86_ops {
>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> +	int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
> +	void (*update_irq)(struct kvm_vcpu *vcpu);
> +	void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
> +			int need_eoi, int global);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>  	int (*get_tdp_level)(void);
>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 21101b6..1003341 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -62,6 +62,7 @@
>  #define EXIT_REASON_MCE_DURING_VMENTRY  41
>  #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
>  #define EXIT_REASON_APIC_ACCESS         44
> +#define EXIT_REASON_EOI_INDUCED         45
>  #define EXIT_REASON_EPT_VIOLATION       48
>  #define EXIT_REASON_EPT_MISCONFIG       49
>  #define EXIT_REASON_WBINVD              54
> @@ -143,6 +144,7 @@
>  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
>  #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
> +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
>  #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
>  
> @@ -180,6 +182,7 @@ enum vmcs_field {
>  	GUEST_GS_SELECTOR               = 0x0000080a,
>  	GUEST_LDTR_SELECTOR             = 0x0000080c,
>  	GUEST_TR_SELECTOR               = 0x0000080e,
> +	GUEST_INTR_STATUS               = 0x00000810,
>  	HOST_ES_SELECTOR                = 0x00000c00,
>  	HOST_CS_SELECTOR                = 0x00000c02,
>  	HOST_SS_SELECTOR                = 0x00000c04,
> @@ -207,6 +210,14 @@ enum vmcs_field {
>  	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
>  	EPT_POINTER                     = 0x0000201a,
>  	EPT_POINTER_HIGH                = 0x0000201b,
> +	EOI_EXIT_BITMAP0                = 0x0000201c,
> +	EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
> +	EOI_EXIT_BITMAP1                = 0x0000201e,
> +	EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
> +	EOI_EXIT_BITMAP2                = 0x00002020,
> +	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
> +	EOI_EXIT_BITMAP3                = 0x00002022,
> +	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
>  	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
>  	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
>  	VMCS_LINK_POINTER               = 0x00002800,
> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> index 7e06ba1..c7356a3 100644
> --- a/arch/x86/kvm/irq.c
> +++ b/arch/x86/kvm/irq.c
> @@ -60,6 +60,29 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
>  EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
>  
>  /*
> + * check if there is pending interrupt without
> + * intack. This _apicv version is used when hardware
> + * supports APIC virtualization with virtual interrupt
> + * delivery support. In such case, KVM is not required
> + * to poll pending APIC interrupt, and thus this
> + * interface is used to poll pending interrupts from
> + * non-APIC source.
> + */
> +int kvm_cpu_has_extint(struct kvm_vcpu *v)
> +{
> +	struct kvm_pic *s;
> +
> +	if (!irqchip_in_kernel(v->kvm))
> +		return v->arch.interrupt.pending;
> +
This does not belong here. If !irqchip_in_kernel() the function will not
be called. Hmm, actually with !irqchip_in_kernel() the kernel will oops in
kvm_apic_vid_enabled(), since it dereferences vcpu->arch.apic without
checking whether it is NULL.


> +	if (kvm_apic_accept_pic_intr(v)) {
> +		s = pic_irqchip(v->kvm);	/* PIC */
> +		return s->output;
> +	} else
> +		return 0;
This is code duplication from kvm_cpu_has_interrupt(). Write a common
function and call it from kvm_cpu_has_interrupt(), but even that is
not needed, see below.
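
For illustration only, the common helper could look roughly like this
(untested, name made up):

/* poll the PIC for a pending interrupt; shared with kvm_cpu_has_interrupt() */
static int kvm_cpu_has_pic_intr(struct kvm_vcpu *v)
{
	if (!kvm_apic_accept_pic_intr(v))
		return 0;

	return pic_irqchip(v->kvm)->output;	/* PIC */
}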

> +}
> +
> +/*
>   * Read pending interrupt vector and intack.
>   */
>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
> @@ -82,6 +105,27 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
>  }
>  EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
>  
> +/*
> + * Read pending interrupt vector and intack.
> + * Similar to kvm_cpu_has_extint, to get
> + * interrupts from non-APIC sources.
> + */
> +int kvm_cpu_get_extint(struct kvm_vcpu *v)
> +{
> +	struct kvm_pic *s;
> +	int vector = -1;
> +
> +	if (!irqchip_in_kernel(v->kvm))
> +		return v->arch.interrupt.nr;
Same as above.

> +
> +	if (kvm_apic_accept_pic_intr(v)) {
> +		s = pic_irqchip(v->kvm);
> +		s->output = 0;		/* PIC */
> +		vector = kvm_pic_read_irq(v->kvm);
Ditto about code duplication.
 
> +	}
> +	return vector;
> +}
> +
>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
>  {
>  	kvm_inject_apic_timer_irqs(vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index a63ffdc..af48361 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -643,6 +643,12 @@ out:
>  	return ret;
>  }
>  
> +void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
> +		int need_eoi, int global)
> +{
> +	kvm_x86_ops->set_eoi_exitmap(vcpu, vector, need_eoi, global);
> +}
> +
>  /*
>   * Add a pending IRQ into lapic.
>   * Return 1 if successfully added and 0 if discarded.
> @@ -664,8 +670,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  		if (trig_mode) {
>  			apic_debug("level trig mode for vector %d", vector);
>  			apic_set_vector(vector, apic->regs + APIC_TMR);
> -		} else
> +			kvm_set_eoi_exitmap(vcpu, vector, 1, 0);
> +		} else {
>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
> +			kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
Why not use APIC_TMR directly instead of kvm_set_eoi_exitmap() logic?

> +		}
>  
>  		result = !apic_test_and_set_irr(vector, apic);
>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
> @@ -769,6 +778,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
>  	return vector;
>  }
>  
> +/*
> + * this interface assumes a trap-like exit, which has already finished
> + * desired side effect including vISR and vPPR update.
> + */
> +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	int trigger_mode;
> +
> +	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
> +		trigger_mode = IOAPIC_LEVEL_TRIG;
> +	else
> +		trigger_mode = IOAPIC_EDGE_TRIG;
> +
> +	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
> +		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
> +	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
More code duplication. Why not call apic_set_eoi() and skip the isr/ppr
logic there if vid is enabled, or put the logic in a common function and
call it from both places.
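
E.g., move the shared tail into something like this (untested sketch,
name made up) and call it from both apic_set_eoi() and the accelerated
path:

static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
	int trigger_mode;

	/* the TMR bit tells us how the interrupt was delivered */
	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
		trigger_mode = IOAPIC_LEVEL_TRIG;
	else
		trigger_mode = IOAPIC_EDGE_TRIG;

	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}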

> +}
> +EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
> +
>  static void apic_send_ipi(struct kvm_lapic *apic)
>  {
>  	u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
> @@ -1510,6 +1539,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
>  	kvm_lapic_reset(vcpu);
>  	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
>  
> +	if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
> +		apic->vid_enabled = true;
What do you have vid_enabled for? This is global, not per-apic, state.

>  	return 0;
>  nomem_free_apic:
>  	kfree(apic);
> @@ -1533,6 +1564,17 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
>  	return highest_irr;
>  }
>  
> +int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	if (!apic || !apic_enabled(apic))
> +		return -1;
> +
> +	return apic_find_highest_irr(apic);
> +}
> +EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
> +
>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
>  {
>  	u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index c42f111..2503a64 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -20,6 +20,7 @@ struct kvm_lapic {
>  	u32 divide_count;
>  	struct kvm_vcpu *vcpu;
>  	bool irr_pending;
> +	bool vid_enabled;
>  	/* Number of bits set in ISR. */
>  	s16 isr_count;
>  	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
> @@ -39,6 +40,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
>  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
>  int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
> +int kvm_cpu_has_extint(struct kvm_vcpu *v);
> +int kvm_cpu_get_extint(struct kvm_vcpu *v);
> +int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
>  void kvm_lapic_reset(struct kvm_vcpu *vcpu);
>  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
>  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
> @@ -50,6 +54,8 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
>  int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
>  int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
>  int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
> +void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
> +		int need_eoi, int global);
>  int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
>  
>  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> @@ -65,6 +71,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
>  void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
>  
>  int kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
> +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
>  
>  void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
>  void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
> @@ -81,6 +88,12 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
>  	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
>  }
>  
> +static inline bool kvm_apic_vid_enabled(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	return apic->vid_enabled;
> +}
vcpu->arch.apic can be NULL from where this is called.

> +
>  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
>  void kvm_lapic_init(void);
>  
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d017df3..b290aba 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3564,6 +3564,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static int svm_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
> +{
> +	return 0;
> +}
> +
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -4283,6 +4288,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.has_virtual_interrupt_delivery = svm_has_virtual_interrupt_delivery,
>  
>  	.set_tss_addr = svm_set_tss_addr,
>  	.get_tdp_level = get_npt_level,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index e9287aa..c0d74ce 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -86,6 +86,9 @@ module_param(fasteoi, bool, S_IRUGO);
>  static bool __read_mostly enable_apicv_reg = 0;
>  module_param(enable_apicv_reg, bool, S_IRUGO);
>  
> +static bool __read_mostly enable_apicv_vid = 0;
> +module_param(enable_apicv_vid, bool, S_IRUGO);
> +
>  /*
>   * If nested=1, nested virtualization is supported, i.e., guests may use
>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
> @@ -432,6 +435,10 @@ struct vcpu_vmx {
>  
>  	bool rdtscp_enabled;
>  
> +	u8 eoi_exitmap_changed;
> +	u64 eoi_exit_bitmap[4];
> +	u64 eoi_exit_bitmap_global[4];
> +
>  	/* Support for a guest hypervisor (nested VMX) */
>  	struct nested_vmx nested;
>  };
> @@ -770,6 +777,12 @@ static inline bool cpu_has_vmx_apic_register_virt(void)
>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
>  }
>  
> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
> +{
> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> +		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
> +}
> +
>  static inline bool cpu_has_vmx_flexpriority(void)
>  {
>  	return cpu_has_vmx_tpr_shadow() &&
> @@ -2480,7 +2493,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
>  			SECONDARY_EXEC_RDTSCP |
>  			SECONDARY_EXEC_ENABLE_INVPCID |
> -			SECONDARY_EXEC_APIC_REGISTER_VIRT;
> +			SECONDARY_EXEC_APIC_REGISTER_VIRT |
> +			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>  		if (adjust_vmx_controls(min2, opt2,
>  					MSR_IA32_VMX_PROCBASED_CTLS2,
>  					&_cpu_based_2nd_exec_control) < 0)
> @@ -2494,7 +2508,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  
>  	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
>  		_cpu_based_2nd_exec_control &= ~(
> -				SECONDARY_EXEC_APIC_REGISTER_VIRT);
> +				SECONDARY_EXEC_APIC_REGISTER_VIRT |
> +				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
>  
>  	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
>  		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
> @@ -2696,6 +2711,9 @@ static __init int hardware_setup(void)
>  	if (!cpu_has_vmx_apic_register_virt())
>  		enable_apicv_reg = 0;
>  
> +	if (!cpu_has_vmx_virtual_intr_delivery())
> +		enable_apicv_vid = 0;
> +
>  	if (nested)
>  		nested_vmx_setup_ctls_msrs();
>  
> @@ -3811,6 +3829,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>  	if (!enable_apicv_reg)
>  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> +	if (!enable_apicv_vid)
> +		exec_control &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>  	return exec_control;
>  }
>  
> @@ -3855,6 +3875,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>  				vmx_secondary_exec_control(vmx));
>  	}
>  
> +	if (enable_apicv_vid) {
> +		vmcs_write64(EOI_EXIT_BITMAP0, 0);
> +		vmcs_write64(EOI_EXIT_BITMAP1, 0);
> +		vmcs_write64(EOI_EXIT_BITMAP2, 0);
> +		vmcs_write64(EOI_EXIT_BITMAP3, 0);
> +
> +		vmcs_write16(GUEST_INTR_STATUS, 0);
> +	}
> +
>  	if (ple_gap) {
>  		vmcs_write32(PLE_GAP, ple_gap);
>  		vmcs_write32(PLE_WINDOW, ple_window);
> @@ -4770,6 +4799,16 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
>  	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
>  }
>  
> +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	int vector = exit_qualification & 0xff;
> +
> +	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
> +	kvm_apic_set_eoi_accelerated(vcpu, vector);
> +	return 1;
> +}
> +
>  static int handle_apic_write(struct kvm_vcpu *vcpu)
>  {
>  	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> @@ -5719,6 +5758,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
>  	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
>  	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
> +	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
>  	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
>  	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
>  	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
> @@ -6049,6 +6089,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  
>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  {
> +    /* no need for tpr_threshold update if APIC virtual
> +     * interrupt delivery is enabled */
> +	if (!enable_apicv_vid)
> +		return ;

Just set kvm_x86_ops->update_cr8_intercept to NULL if !enable_apicv_vid
and the function will not be called.
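
I.e., in hardware_setup(), roughly (untested; following the intent of
the comment above that the update is unneeded when vid is enabled):

	if (enable_apicv_vid)
		kvm_x86_ops->update_cr8_intercept = NULL;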

> +
>  	if (irr == -1 || tpr < irr) {
>  		vmcs_write32(TPR_THRESHOLD, 0);
>  		return;
> @@ -6057,6 +6102,79 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  	vmcs_write32(TPR_THRESHOLD, irr);
>  }
>  
> +static int vmx_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
> +{
> +	return irqchip_in_kernel(vcpu->kvm) && enable_apicv_vid;
> +}
> +
> +static void vmx_update_irq(struct kvm_vcpu *vcpu)
> +{
> +	u16 status;
> +	u8 old;
> +	int vector;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!enable_apicv_vid)
> +		return ;
Ditto. Set kvm_x86_ops->update_irq to a function that does nothing if
!enable_apicv_vid. BTW you do not set this callback in the SVM code but
call it unconditionally.

> +
> +	vector = kvm_apic_get_highest_irr(vcpu);
> +	if (vector == -1)
> +		return;
> +
> +	status = vmcs_read16(GUEST_INTR_STATUS);
> +	old = (u8)status & 0xff;
> +	if ((u8)vector != old) {
> +		status &= ~0xff;
> +		status |= (u8)vector;
> +		vmcs_write16(GUEST_INTR_STATUS, status);
> +	}
Please write RVI accessor functions.
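
E.g., roughly (untested sketch, name made up):

/* update RVI, the low byte of GUEST_INTR_STATUS */
static void vmx_set_rvi(int vector)
{
	u16 status = vmcs_read16(GUEST_INTR_STATUS);
	u8 old = (u8)status & 0xff;

	if ((u8)vector != old) {
		status &= ~0xff;
		status |= (u8)vector;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}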

> +
> +	if (vmx->eoi_exitmap_changed) {
> +#define UPDATE_EOI_EXITMAP(v, e) {				\
> +	if ((v)->eoi_exitmap_changed & (1 << (e)))	\
> +		vmcs_write64(EOI_EXIT_BITMAP##e,		\
> +		(v)->eoi_exit_bitmap[e] | (v)->eoi_exit_bitmap_global[e]); }
An inline function would do. But why recalculate this on each entry? We
want EOI exits only for level-triggered IOAPIC interrupts and for
edge-triggered IOAPIC interrupts with registered notifiers. This
configuration rarely changes.
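
The inline version could look roughly like this (untested; it relies on
the EOI_EXIT_BITMAPn vmcs_field encodings being two apart):

static void vmx_load_eoi_exitmap(struct vcpu_vmx *vmx, int index)
{
	if (!(vmx->eoi_exitmap_changed & (1 << index)))
		return;

	vmcs_write64(EOI_EXIT_BITMAP0 + 2 * index,
		     vmx->eoi_exit_bitmap[index] |
		     vmx->eoi_exit_bitmap_global[index]);
}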


> +
> +		UPDATE_EOI_EXITMAP(vmx, 0);
> +		UPDATE_EOI_EXITMAP(vmx, 1);
> +		UPDATE_EOI_EXITMAP(vmx, 2);
> +		UPDATE_EOI_EXITMAP(vmx, 3);
> +		vmx->eoi_exitmap_changed = 0;
> +	}
> +}
> +
> +static void vmx_set_eoi_exitmap(struct kvm_vcpu *vcpu,
> +				int vector,
> +				int need_eoi, int global)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int index, offset, changed;
> +	unsigned long *eoi_exitmap;
> +
> +	if (!enable_apicv_vid)
> +		return ;
> +
> +	if (WARN_ONCE((vector < 0) || (vector > 255),
> +		"KVM VMX: vector (%d) out of range\n", vector))
> +		return;
> +
> +	index = vector >> 6;
> +	offset = vector & 63;
> +	if (global)
> +		eoi_exitmap =
> +		    (unsigned long *)&vmx->eoi_exit_bitmap_global[index];
> +	else
> +		eoi_exitmap = (unsigned long *)&vmx->eoi_exit_bitmap[index];
> +
> +	if (need_eoi)
> +		changed = !test_and_set_bit(offset, eoi_exitmap);
> +	else
> +		changed = test_and_clear_bit(offset, eoi_exitmap);
> +
> +	if (changed)
> +		vmx->eoi_exitmap_changed |= 1 << index;
> +}
> +
>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
>  {
>  	u32 exit_intr_info;
> @@ -7320,6 +7438,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
>  	.enable_nmi_window = enable_nmi_window,
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
> +	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
> +	.update_irq = vmx_update_irq,
You need to initialize this one in svm.c too.

> +	.set_eoi_exitmap = vmx_set_eoi_exitmap,
>  
>  	.set_tss_addr = vmx_set_tss_addr,
>  	.get_tdp_level = get_ept_level,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 4f76417..8b8de3b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5190,6 +5190,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
>  			vcpu->arch.nmi_injected = true;
>  			kvm_x86_ops->set_nmi(vcpu);
>  		}
> +	} else if (kvm_apic_vid_enabled(vcpu)) {
> +		if (kvm_cpu_has_extint(vcpu) &&
> +		    kvm_x86_ops->interrupt_allowed(vcpu)) {
> +			kvm_queue_interrupt(vcpu,
> +				kvm_cpu_get_extint(vcpu), false);
> +			kvm_x86_ops->set_irq(vcpu);
> +		}
Drop all this and modify kvm_cpu_has_interrupt()/kvm_cpu_get_interrupt()
to skip apic interrupts when vid is enabled; then the if below will just
work.
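
I.e., make kvm_cpu_has_interrupt() look roughly like this (untested
sketch):

int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
	if (kvm_cpu_has_extint(v))
		return 1;

	/* with vid enabled the hardware delivers APIC interrupts itself */
	if (kvm_apic_vid_enabled(v))
		return 0;

	return kvm_apic_has_interrupt(v) != -1;
}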

>  	} else if (kvm_cpu_has_interrupt(vcpu)) {
>  		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
>  			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
> @@ -5289,12 +5296,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	}
>  
>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> +		/* update architecture specific hints for APIC
> +		 * virtual interrupt delivery */
> +		kvm_x86_ops->update_irq(vcpu);
> +
>  		inject_pending_event(vcpu);
>  
>  		/* enable NMI/IRQ window open exits if needed */
>  		if (vcpu->arch.nmi_pending)
>  			kvm_x86_ops->enable_nmi_window(vcpu);
> -		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
> +		else if (kvm_apic_vid_enabled(vcpu)) {
> +			if (kvm_cpu_has_extint(vcpu))
> +				kvm_x86_ops->enable_irq_window(vcpu);
Same as above. With a proper kvm_cpu_has_interrupt() implementation this
is not needed.

> +		} else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
>  			kvm_x86_ops->enable_irq_window(vcpu);
>  
>  		if (kvm_lapic_enabled(vcpu)) {
> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> index 166c450..898aa62 100644
> --- a/virt/kvm/ioapic.c
> +++ b/virt/kvm/ioapic.c
> @@ -186,6 +186,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
> 		/* need to read apic_id from apic register since
>  		 * it can be rewritten */
>  		irqe.dest_id = ioapic->kvm->bsp_vcpu_id;
> +		kvm_set_eoi_exitmap(ioapic->kvm->vcpus[0], irqe.vector, 1, 1);
>  	}
>  #endif
>  	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
> -- 
> 1.7.1

--
			Gleb.
Zhang, Yang Z Nov. 23, 2012, 11:46 a.m. UTC | #2
Gleb Natapov wrote on 2012-11-22:
> On Wed, Nov 21, 2012 at 04:09:36PM +0800, Yang Zhang wrote:
>> Virtual interrupt delivery avoids the need for KVM to inject vAPIC
>> interrupts manually; that is fully taken care of by the hardware. This
>> needs some special awareness in the existing interrupt injection path:
>> 
>> - for a pending interrupt, instead of direct injection, we may need to
>>   update architecture-specific indicators before resuming to the guest.
>> - A pending interrupt that is masked by the ISR should also be
>>   considered in the above update action, since the hardware will decide
>>   when to inject it at the right time. The current has_interrupt and
>>   get_interrupt only return a valid vector from the injection p.o.v.
>> Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
>> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    4 +
>>  arch/x86/include/asm/vmx.h      |   11 ++++
>>  arch/x86/kvm/irq.c              |   44 ++++++++++++++
>>  arch/x86/kvm/lapic.c            |   44 +++++++++++++-
>>  arch/x86/kvm/lapic.h            |   13 ++++
>>  arch/x86/kvm/svm.c              |    6 ++
>>  arch/x86/kvm/vmx.c              |  125 ++++++++++++++++++++++++++++++++++++++-
>>  arch/x86/kvm/x86.c              |   16 +++++-
>>  virt/kvm/ioapic.c               |    1 +
>>  9 files changed, 260 insertions(+), 4 deletions(-)
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index b2e11f4..8e07a86 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -682,6 +682,10 @@ struct kvm_x86_ops {
>>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
>>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>> +	int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
>> +	void (*update_irq)(struct kvm_vcpu *vcpu);
>> +	void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
>> +			int need_eoi, int global);
>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>  	int (*get_tdp_level)(void);
>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 21101b6..1003341 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -62,6 +62,7 @@
>>  #define EXIT_REASON_MCE_DURING_VMENTRY  41
>>  #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
>>  #define EXIT_REASON_APIC_ACCESS         44
>> +#define EXIT_REASON_EOI_INDUCED         45
>>  #define EXIT_REASON_EPT_VIOLATION       48
>>  #define EXIT_REASON_EPT_MISCONFIG       49
>>  #define EXIT_REASON_WBINVD              54
>> @@ -143,6 +144,7 @@
>>  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
>>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
>>  #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
>> +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
>>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
>>  #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
>> @@ -180,6 +182,7 @@ enum vmcs_field {
>>  	GUEST_GS_SELECTOR               = 0x0000080a,
>>  	GUEST_LDTR_SELECTOR             = 0x0000080c,
>>  	GUEST_TR_SELECTOR               = 0x0000080e,
>> +	GUEST_INTR_STATUS               = 0x00000810,
>>  	HOST_ES_SELECTOR                = 0x00000c00,
>>  	HOST_CS_SELECTOR                = 0x00000c02,
>>  	HOST_SS_SELECTOR                = 0x00000c04,
>> @@ -207,6 +210,14 @@ enum vmcs_field {
>>  	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
>>  	EPT_POINTER                     = 0x0000201a,
>>  	EPT_POINTER_HIGH                = 0x0000201b,
>> +	EOI_EXIT_BITMAP0                = 0x0000201c,
>> +	EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
>> +	EOI_EXIT_BITMAP1                = 0x0000201e,
>> +	EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
>> +	EOI_EXIT_BITMAP2                = 0x00002020,
>> +	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>> +	EOI_EXIT_BITMAP3                = 0x00002022,
>> +	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
>>  	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
>>  	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
>>  	VMCS_LINK_POINTER               = 0x00002800,
>> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
>> index 7e06ba1..c7356a3 100644
>> --- a/arch/x86/kvm/irq.c
>> +++ b/arch/x86/kvm/irq.c
>> @@ -60,6 +60,29 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
>>  EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
>>  
>>  /*
>> + * check if there is pending interrupt without
>> + * intack. This _apicv version is used when hardware
>> + * supports APIC virtualization with virtual interrupt
>> + * delivery support. In such case, KVM is not required
>> + * to poll pending APIC interrupt, and thus this
>> + * interface is used to poll pending interrupts from
>> + * non-APIC source.
>> + */
>> +int kvm_cpu_has_extint(struct kvm_vcpu *v)
>> +{
>> +	struct kvm_pic *s;
>> +
>> +	if (!irqchip_in_kernel(v->kvm))
>> +		return v->arch.interrupt.pending;
>> +
> This does not belong here. If !irqchip_in_kernel() the function will not
> be called. Hmm, actually with !irqchip_in_kernel() the kernel will oops in
> kvm_apic_vid_enabled(), since it dereferences vcpu->arch.apic without
> checking whether it is NULL.

Right. Will remove it in the next version and add the check in kvm_apic_vid_enabled.
 
> 
>> +	if (kvm_apic_accept_pic_intr(v)) {
>> +		s = pic_irqchip(v->kvm);	/* PIC */
>> +		return s->output;
>> +	} else
>> +		return 0;
> This is code duplication from kvm_cpu_has_interrupt(). Write a common
> function and call it from kvm_cpu_has_interrupt(), but even that is
> not needed, see below.

Why is it not needed?
 
>> +}
>> +
>> +/*
>>   * Read pending interrupt vector and intack.
>>   */
>>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
>> @@ -82,6 +105,27 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
>> +/*
>> + * Read pending interrupt vector and intack.
>> + * Similar to kvm_cpu_has_extint, to get
>> + * interrupts from non-APIC sources.
>> + */
>> +int kvm_cpu_get_extint(struct kvm_vcpu *v)
>> +{
>> +	struct kvm_pic *s;
>> +	int vector = -1;
>> +
>> +	if (!irqchip_in_kernel(v->kvm))
>> +		return v->arch.interrupt.nr;
> Same as above.
> 
>> +
>> +	if (kvm_apic_accept_pic_intr(v)) {
>> +		s = pic_irqchip(v->kvm);
>> +		s->output = 0;		/* PIC */
>> +		vector = kvm_pic_read_irq(v->kvm);
> Ditto about code duplication.
> 
>> +	}
>> +	return vector;
>> +}
>> +
>>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
>>  {
>>  	kvm_inject_apic_timer_irqs(vcpu);
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index a63ffdc..af48361 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -643,6 +643,12 @@ out:
>>  	return ret;
>>  }
>> +void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
>> +		int need_eoi, int global)
>> +{
>> +	kvm_x86_ops->set_eoi_exitmap(vcpu, vector, need_eoi, global);
>> +}
>> +
>>  /*
>>   * Add a pending IRQ into lapic.
>>   * Return 1 if successfully added and 0 if discarded.
>> @@ -664,8 +670,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  		if (trig_mode) {
>>  			apic_debug("level trig mode for vector %d", vector);
>>  			apic_set_vector(vector, apic->regs + APIC_TMR);
>> -		} else
>> +			kvm_set_eoi_exitmap(vcpu, vector, 1, 0);
>> +		} else {
>>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>> +			kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
> Why not use APIC_TMR directly instead of kvm_set_eoi_exitmap() logic?

Good idea. It seems more reasonable. 

>> +		}
>> 
>>  		result = !apic_test_and_set_irr(vector, apic);
>>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>> @@ -769,6 +778,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
>>  	return vector;
>>  }
>> +/*
>> + * this interface assumes a trap-like exit, which has already finished
>> + * desired side effect including vISR and vPPR update.
>> + */
>> +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +	int trigger_mode;
>> +
>> +	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
>> +		trigger_mode = IOAPIC_LEVEL_TRIG;
>> +	else
>> +		trigger_mode = IOAPIC_EDGE_TRIG;
>> +
>> +	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
>> +		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
>> +	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
> More code duplication. Why not call apic_set_eoi() and skip the isr/ppr
> logic there if vid is enabled, or put the logic in a common function and
> call it from both places.

OK, will change it in the next patch.

>> +}
>> +EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
>> +
>>  static void apic_send_ipi(struct kvm_lapic *apic)
>>  {
>>  	u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
>> @@ -1510,6 +1539,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
>>  	kvm_lapic_reset(vcpu);
>>  	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
>> +	if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
>> +		apic->vid_enabled = true;
> What do you have vid_enabled for? This is global, not per-apic, state.
When injecting an interrupt into the guest, we need this to check whether vid is enabled. If not, use the old way to handle the interrupt.
I think putting it in the apic is reasonable. Though all vcpus use the same configuration, the APICv feature is per vcpu too.

> 
>>  	return 0;
>>  nomem_free_apic:
>>  	kfree(apic);
>> @@ -1533,6 +1564,17 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
>>  	return highest_irr;
>>  }
>> +int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +
>> +	if (!apic || !apic_enabled(apic))
>> +		return -1;
>> +
>> +	return apic_find_highest_irr(apic);
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
>> +
>>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
>>  {
>>  	u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
>> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
>> index c42f111..2503a64 100644
>> --- a/arch/x86/kvm/lapic.h
>> +++ b/arch/x86/kvm/lapic.h
>> @@ -20,6 +20,7 @@ struct kvm_lapic {
>>  	u32 divide_count;
>>  	struct kvm_vcpu *vcpu;
>>  	bool irr_pending;
>> +	bool vid_enabled;
>>  	/* Number of bits set in ISR. */
>>  	s16 isr_count;
>>  	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
>> @@ -39,6 +40,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
>>  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
>>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
>>  int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
>> +int kvm_cpu_has_extint(struct kvm_vcpu *v);
>> +int kvm_cpu_get_extint(struct kvm_vcpu *v);
>> +int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
>>  void kvm_lapic_reset(struct kvm_vcpu *vcpu);
>>  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
>>  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
>> @@ -50,6 +54,8 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
>>  int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
>>  int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
>>  int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
>> +void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
>> +		int need_eoi, int global);
>>  int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
>>  
>>  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
>> @@ -65,6 +71,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
>>  void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
>>  
>>  int kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
>> +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
>> 
>>  void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
>>  void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
>> @@ -81,6 +88,12 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
>>  	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
>>  }
>> +static inline bool kvm_apic_vid_enabled(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +	return apic->vid_enabled;
>> +}
> vcpu->arch.apic can be NULL from where this is called.
> 
>> +
>>  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
>>  void kvm_lapic_init(void);
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index d017df3..b290aba 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -3564,6 +3564,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>>  }
>> +static int svm_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
>> +{
>> +	return 0;
>> +}
>> +
>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
>>  {
>>  	struct vcpu_svm *svm = to_svm(vcpu);
>> @@ -4283,6 +4288,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>>  	.enable_nmi_window = enable_nmi_window,
>>  	.enable_irq_window = enable_irq_window,
>>  	.update_cr8_intercept = update_cr8_intercept,
>> +	.has_virtual_interrupt_delivery = svm_has_virtual_interrupt_delivery,
>> 
>>  	.set_tss_addr = svm_set_tss_addr,
>>  	.get_tdp_level = get_npt_level,
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index e9287aa..c0d74ce 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -86,6 +86,9 @@ module_param(fasteoi, bool, S_IRUGO);
>>  static bool __read_mostly enable_apicv_reg = 0;
>>  module_param(enable_apicv_reg, bool, S_IRUGO);
>> +static bool __read_mostly enable_apicv_vid = 0;
>> +module_param(enable_apicv_vid, bool, S_IRUGO);
>> +
>>  /*
>>   * If nested=1, nested virtualization is supported, i.e., guests may use
>>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
>> @@ -432,6 +435,10 @@ struct vcpu_vmx {
>> 
>>  	bool rdtscp_enabled;
>> +	u8 eoi_exitmap_changed;
>> +	u64 eoi_exit_bitmap[4];
>> +	u64 eoi_exit_bitmap_global[4];
>> +
>>  	/* Support for a guest hypervisor (nested VMX) */
>>  	struct nested_vmx nested;
>>  };
>> @@ -770,6 +777,12 @@ static inline bool cpu_has_vmx_apic_register_virt(void)
>>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
>>  }
>> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
>> +{
>> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
>> +		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>> +}
>> +
>>  static inline bool cpu_has_vmx_flexpriority(void)
>>  {
>>  	return cpu_has_vmx_tpr_shadow() &&
>> @@ -2480,7 +2493,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>  			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
>>  			SECONDARY_EXEC_RDTSCP |
>>  			SECONDARY_EXEC_ENABLE_INVPCID |
>> -			SECONDARY_EXEC_APIC_REGISTER_VIRT;
>> +			SECONDARY_EXEC_APIC_REGISTER_VIRT |
>> +			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>>  		if (adjust_vmx_controls(min2, opt2,
>>  					MSR_IA32_VMX_PROCBASED_CTLS2,
>>  					&_cpu_based_2nd_exec_control) < 0)
>> @@ -2494,7 +2508,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>> 
>>  	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
>>  		_cpu_based_2nd_exec_control &= ~(
>> -				SECONDARY_EXEC_APIC_REGISTER_VIRT);
>> +				SECONDARY_EXEC_APIC_REGISTER_VIRT |
>> +				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
>> 
>>  	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
>>  		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
>> @@ -2696,6 +2711,9 @@ static __init int hardware_setup(void)
>>  	if (!cpu_has_vmx_apic_register_virt())
>>  		enable_apicv_reg = 0;
>> +	if (!cpu_has_vmx_virtual_intr_delivery())
>> +		enable_apicv_vid = 0;
>> +
>>  	if (nested)
>>  		nested_vmx_setup_ctls_msrs();
>> @@ -3811,6 +3829,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>>  	if (!enable_apicv_reg)
>>  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
>> +	if (!enable_apicv_vid)
>> +		exec_control &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>>  	return exec_control;
>>  }
>> @@ -3855,6 +3875,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>  				vmx_secondary_exec_control(vmx));
>>  	}
>> +	if (enable_apicv_vid) {
>> +		vmcs_write64(EOI_EXIT_BITMAP0, 0);
>> +		vmcs_write64(EOI_EXIT_BITMAP1, 0);
>> +		vmcs_write64(EOI_EXIT_BITMAP2, 0);
>> +		vmcs_write64(EOI_EXIT_BITMAP3, 0);
>> +
>> +		vmcs_write16(GUEST_INTR_STATUS, 0);
>> +	}
>> +
>>  	if (ple_gap) {
>>  		vmcs_write32(PLE_GAP, ple_gap);
>>  		vmcs_write32(PLE_WINDOW, ple_window);
>> @@ -4770,6 +4799,16 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
>>  	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
>>  }
>> +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
>> +	int vector = exit_qualification & 0xff;
>> +
>> +	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
>> +	kvm_apic_set_eoi_accelerated(vcpu, vector);
>> +	return 1;
>> +}
>> +
>>  static int handle_apic_write(struct kvm_vcpu *vcpu)
>>  {
>>  	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
>> @@ -5719,6 +5758,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>>  	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
>>  	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
>>  	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
>>  +	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
>>  	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
>>  	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
>>  	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
>> @@ -6049,6 +6089,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>> 
>>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>>  {
>> +    /* no need for tpr_threshold update if APIC virtual
>> +     * interrupt delivery is enabled */
>> +	if (!enable_apicv_vid)
>> +		return ;
> 
> Just set kvm_x86_ops->update_cr8_intercept to NULL if !enable_apicv_vid
> and the function will not be called.

Sure.
 
>> +
>>  	if (irr == -1 || tpr < irr) {
>>  		vmcs_write32(TPR_THRESHOLD, 0);
>>  		return;
>> @@ -6057,6 +6102,79 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>>  	vmcs_write32(TPR_THRESHOLD, irr);
>>  }
>> +static int vmx_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
>> +{
>> +	return irqchip_in_kernel(vcpu->kvm) && enable_apicv_vid;
>> +}
>> +
>> +static void vmx_update_irq(struct kvm_vcpu *vcpu)
>> +{
>> +	u16 status;
>> +	u8 old;
>> +	int vector;
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	if (!enable_apicv_vid)
>> +		return ;
> Ditto. Set kvm_x86_ops->update_irq to a function that does nothing if
> !enable_apicv_vid. BTW you do not set this callback in the SVM code but
> call it unconditionally.
> 
>> +
>> +	vector = kvm_apic_get_highest_irr(vcpu);
>> +	if (vector == -1)
>> +		return;
>> +
>> +	status = vmcs_read16(GUEST_INTR_STATUS);
>> +	old = (u8)status & 0xff;
>> +	if ((u8)vector != old) {
>> +		status &= ~0xff;
>> +		status |= (u8)vector;
>> +		vmcs_write16(GUEST_INTR_STATUS, status);
>> +	}
> Please write RVI accessor functions.
Sure.

>> +
>> +	if (vmx->eoi_exitmap_changed) {
>> +#define UPDATE_EOI_EXITMAP(v, e) {				\
>> +	if ((v)->eoi_exitmap_changed & (1 << (e)))	\
>> +		vmcs_write64(EOI_EXIT_BITMAP##e,		\
>> +		(v)->eoi_exit_bitmap[e] | (v)->eoi_exit_bitmap_global[e]); }
> An inline function would do. But why recalculate this on each entry? We
> want EOI exits only for level-triggered IOAPIC interrupts and for
> edge-triggered IOAPIC interrupts with registered notifiers. This
> configuration rarely changes.

eoi_exitmap_changed is used to track whether the trigger mode has changed. As you said, it changes rarely, so this code will seldom be executed.

> 
> 
>> +
>> +		UPDATE_EOI_EXITMAP(vmx, 0);
>> +		UPDATE_EOI_EXITMAP(vmx, 1);
>> +		UPDATE_EOI_EXITMAP(vmx, 2);
>> +		UPDATE_EOI_EXITMAP(vmx, 3);
>> +		vmx->eoi_exitmap_changed = 0;
>> +	}
>> +}
>> +
>> +static void vmx_set_eoi_exitmap(struct kvm_vcpu *vcpu,
>> +				int vector,
>> +				int need_eoi, int global)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	int index, offset, changed;
>> +	unsigned long *eoi_exitmap;
>> +
>> +	if (!enable_apicv_vid)
>> +		return ;
>> +
>> +	if (WARN_ONCE((vector < 0) || (vector > 255),
>> +		"KVM VMX: vector (%d) out of range\n", vector))
>> +		return;
>> +
>> +	index = vector >> 6;
>> +	offset = vector & 63;
>> +	if (global)
>> +		eoi_exitmap =
>> +		    (unsigned long *)&vmx->eoi_exit_bitmap_global[index];
>> +	else
>> +		eoi_exitmap = (unsigned long *)&vmx->eoi_exit_bitmap[index];
>> +
>> +	if (need_eoi)
>> +		changed = !test_and_set_bit(offset, eoi_exitmap);
>> +	else
>> +		changed = test_and_clear_bit(offset, eoi_exitmap);
>> +
>> +	if (changed)
>> +		vmx->eoi_exitmap_changed |= 1 << index;
>> +}
>> +
>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
>>  {
>>  	u32 exit_intr_info;
>> @@ -7320,6 +7438,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
>>  	.enable_nmi_window = enable_nmi_window,
>>  	.enable_irq_window = enable_irq_window,
>>  	.update_cr8_intercept = update_cr8_intercept,
>> +	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
>> +	.update_irq = vmx_update_irq,
> You need to initialize this one in svm.c too.
> 
>> +	.set_eoi_exitmap = vmx_set_eoi_exitmap,
>> 
>>  	.set_tss_addr = vmx_set_tss_addr,
>>  	.get_tdp_level = get_ept_level,
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 4f76417..8b8de3b 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -5190,6 +5190,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
>>  			vcpu->arch.nmi_injected = true;
>>  			kvm_x86_ops->set_nmi(vcpu);
>>  		}
>> +	} else if (kvm_apic_vid_enabled(vcpu)) {
>> +		if (kvm_cpu_has_extint(vcpu) &&
>> +		    kvm_x86_ops->interrupt_allowed(vcpu)) {
>> +			kvm_queue_interrupt(vcpu,
>> +				kvm_cpu_get_extint(vcpu), false);
>> +			kvm_x86_ops->set_irq(vcpu);
>> +		}
> Drop all this and modify kvm_cpu_has_interrupt()/kvm_cpu_get_interrupt()
> to skip apic interrupts when vid is enabled; then the if below will just
> work.
Ok.

> 
>>  	} else if (kvm_cpu_has_interrupt(vcpu)) {
>>  		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
>>  			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
>> @@ -5289,12 +5296,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>  	}
>>  
>>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
>> +		/* update architecture specific hints for APIC
>> +		 * virtual interrupt delivery */
>> +		kvm_x86_ops->update_irq(vcpu);
>> +
>>  		inject_pending_event(vcpu);
>>  
>>  		/* enable NMI/IRQ window open exits if needed */
>>  		if (vcpu->arch.nmi_pending)
>>  			kvm_x86_ops->enable_nmi_window(vcpu);
>> -		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
>> +		else if (kvm_apic_vid_enabled(vcpu)) {
>> +			if (kvm_cpu_has_extint(vcpu))
>> +				kvm_x86_ops->enable_irq_window(vcpu);
> Same as above. With a proper kvm_cpu_has_interrupt() implementation this
> is not needed.
> 
>> +		} else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
>>  			kvm_x86_ops->enable_irq_window(vcpu);
>>  
>>  		if (kvm_lapic_enabled(vcpu)) {
>> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
>> index 166c450..898aa62 100644
>> --- a/virt/kvm/ioapic.c
>> +++ b/virt/kvm/ioapic.c
>> @@ -186,6 +186,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
>>  		/* need to read apic_id from apic register since
>>  		 * it can be rewritten */
>>  		irqe.dest_id = ioapic->kvm->bsp_vcpu_id;
>> +		kvm_set_eoi_exitmap(ioapic->kvm->vcpus[0], irqe.vector, 1, 1);
>>  	}
>>  #endif
>>  	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
>> --
>> 1.7.1
> 
> --
> 			Gleb.


Best regards,
Yang


Gleb Natapov Nov. 25, 2012, 8:53 a.m. UTC | #3
On Fri, Nov 23, 2012 at 11:46:30AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2012-11-22:
> > On Wed, Nov 21, 2012 at 04:09:36PM +0800, Yang Zhang wrote:
> >> Virtual interrupt delivery avoids the need for KVM to inject vAPIC
> >> interrupts manually; that is fully taken care of by the hardware. This
> >> needs some special awareness in the existing interrupt injection path:
> >> 
> >> - for a pending interrupt, instead of direct injection, we may need to
> >>   update architecture-specific indicators before resuming to the guest.
> >> - A pending interrupt that is masked by the ISR should also be
> >>   considered in the above update action, since the hardware will decide
> >>   when to inject it at the right time. The current has_interrupt and
> >>   get_interrupt only return a valid vector from the injection p.o.v.
> >> Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
> >> Signed-off-by: Kevin Tian <kevin.tian@intel.com>
> >> ---
> >>  arch/x86/include/asm/kvm_host.h |    4 +
> >>  arch/x86/include/asm/vmx.h      |   11 ++++
> >>  arch/x86/kvm/irq.c              |   44 ++++++++++++++
> >>  arch/x86/kvm/lapic.c            |   44 +++++++++++++-
> >>  arch/x86/kvm/lapic.h            |   13 ++++
> >>  arch/x86/kvm/svm.c              |    6 ++
> >>  arch/x86/kvm/vmx.c              |  125 ++++++++++++++++++++++++++++++++++++++-
> >>  arch/x86/kvm/x86.c              |   16 +++++-
> >>  virt/kvm/ioapic.c               |    1 +
> >>  9 files changed, 260 insertions(+), 4 deletions(-)
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index b2e11f4..8e07a86 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -682,6 +682,10 @@ struct kvm_x86_ops {
> >>  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
> >>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
> >>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> >> +	int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
> >> +	void (*update_irq)(struct kvm_vcpu *vcpu);
> >> +	void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
> >> +			int need_eoi, int global);
> >>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> >>  	int (*get_tdp_level)(void);
> >>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> >> index 21101b6..1003341 100644
> >> --- a/arch/x86/include/asm/vmx.h
> >> +++ b/arch/x86/include/asm/vmx.h
> >> @@ -62,6 +62,7 @@
> >>  #define EXIT_REASON_MCE_DURING_VMENTRY  41
> >>  #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
> >>  #define EXIT_REASON_APIC_ACCESS         44
> >> +#define EXIT_REASON_EOI_INDUCED         45
> >>  #define EXIT_REASON_EPT_VIOLATION       48
> >>  #define EXIT_REASON_EPT_MISCONFIG       49
> >>  #define EXIT_REASON_WBINVD              54
> >> @@ -143,6 +144,7 @@
> >>  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
> >>  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
> >>  #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
> >> +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
> >>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
> >>  #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
> >> @@ -180,6 +182,7 @@ enum vmcs_field {
> >>  	GUEST_GS_SELECTOR               = 0x0000080a,
> >>  	GUEST_LDTR_SELECTOR             = 0x0000080c,
> >>  	GUEST_TR_SELECTOR               = 0x0000080e,
> >> +	GUEST_INTR_STATUS               = 0x00000810,
> >>  	HOST_ES_SELECTOR                = 0x00000c00,
> >>  	HOST_CS_SELECTOR                = 0x00000c02,
> >>  	HOST_SS_SELECTOR                = 0x00000c04,
> >> @@ -207,6 +210,14 @@ enum vmcs_field {
> >>  	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
> >>  	EPT_POINTER                     = 0x0000201a,
> >>  	EPT_POINTER_HIGH                = 0x0000201b,
> >> +	EOI_EXIT_BITMAP0                = 0x0000201c,
> >> +	EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
> >> +	EOI_EXIT_BITMAP1                = 0x0000201e,
> >> +	EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
> >> +	EOI_EXIT_BITMAP2                = 0x00002020,
> >> +	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
> >> +	EOI_EXIT_BITMAP3                = 0x00002022,
> >> +	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
> >>  	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
> >>  	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
> >>  	VMCS_LINK_POINTER               = 0x00002800,
> >> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> >> index 7e06ba1..c7356a3 100644
> >> --- a/arch/x86/kvm/irq.c
> >> +++ b/arch/x86/kvm/irq.c
> >> @@ -60,6 +60,29 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
> >>  EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
> >>  
> >>  /*
> >> + * check if there is pending interrupt without
> >> + * intack. This _apicv version is used when hardware
> >> + * supports APIC virtualization with virtual interrupt
> >> + * delivery support. In such case, KVM is not required
> >> + * to poll pending APIC interrupt, and thus this
> >> + * interface is used to poll pending interrupts from
> >> + * non-APIC source.
> >> + */
> >> +int kvm_cpu_has_extint(struct kvm_vcpu *v)
> >> +{
> >> +	struct kvm_pic *s;
> >> +
> >> +	if (!irqchip_in_kernel(v->kvm))
> >> +		return v->arch.interrupt.pending;
> >> +
> > This does not belong here. If !irqchip_in_kernel() the function will not
> > be called. Hmm, actually with !irqchip_in_kernel() the kernel will oops in
> > kvm_apic_vid_enabled(), since it dereferences vcpu->arch.apic without
> > checking whether it is NULL.
> 
> Right. Will remove it in the next version and add the check in kvm_apic_vid_enabled.
>  
> > 
> >> +	if (kvm_apic_accept_pic_intr(v)) {
> >> +		s = pic_irqchip(v->kvm);	/* PIC */
> >> +		return s->output;
> >> +	} else
> >> +		return 0;
> > This is code duplication from kvm_cpu_has_interrupt(). Write a common
> > function and call it from kvm_cpu_has_interrupt(), but even that is
> > not needed, see below.
> 
> Why is it not needed?
Because if you change kvm_cpu_has_interrupt() like I described below, the
code path that uses this function will not be needed.

>  
> >> +}
> >> +
> >> +/*
> >>   * Read pending interrupt vector and intack.
> >>   */
> >>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
> >> @@ -82,6 +105,27 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
> >>  }
> >>  EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
> >> +/*
> >> + * Read pending interrupt vector and intack.
> >> + * Similar to kvm_cpu_has_extint, to get
> >> + * interrupts from non-APIC sources.
> >> + */
> >> +int kvm_cpu_get_extint(struct kvm_vcpu *v)
> >> +{
> >> +	struct kvm_pic *s;
> >> +	int vector = -1;
> >> +
> >> +	if (!irqchip_in_kernel(v->kvm))
> >> +		return v->arch.interrupt.nr;
> > Same as above.
> > 
> >> +
> >> +	if (kvm_apic_accept_pic_intr(v)) {
> >> +		s = pic_irqchip(v->kvm);
> >> +		s->output = 0;		/* PIC */
> >> +		vector = kvm_pic_read_irq(v->kvm);
> > Ditto about code duplication.
> > 
> >> +	}
> >> +	return vector;
> >> +}
> >> +
> >>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
> >>  {
> >>  	kvm_inject_apic_timer_irqs(vcpu);
> >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >> index a63ffdc..af48361 100644
> >> --- a/arch/x86/kvm/lapic.c
> >> +++ b/arch/x86/kvm/lapic.c
> >> @@ -643,6 +643,12 @@ out:
> >>  	return ret;
> >>  }
> >> +void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
> >> +		int need_eoi, int global)
> >> +{
> >> +	kvm_x86_ops->set_eoi_exitmap(vcpu, vector, need_eoi, global);
> >> +}
> >> +
> >>  /*
> >>   * Add a pending IRQ into lapic.
> >>   * Return 1 if successfully added and 0 if discarded.
> >> @@ -664,8 +670,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
> >>  		if (trig_mode) {
> >>  			apic_debug("level trig mode for vector %d", vector);
> >>  			apic_set_vector(vector, apic->regs + APIC_TMR);
> >> -		} else
> >> +			kvm_set_eoi_exitmap(vcpu, vector, 1, 0);
> >> +		} else {
> >>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
> >> +			kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
> > Why not use APIC_TMR directly instead of kvm_set_eoi_exitmap() logic?
> 
> Good idea. It seems more reasonable. 
> 
> >> +		}
> >> 
> >>  		result = !apic_test_and_set_irr(vector, apic);
> >>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, @@ -769,6
> >>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
> >> @@ -769,6 +778,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
> >>  	return vector;
> >>  }
> >> + * this interface assumes a trap-like exit, which has already performed
> >> + * the desired side effects, including vISR and vPPR updates.
> >> + */
> >> +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
> >> +{
> >> +	struct kvm_lapic *apic = vcpu->arch.apic;
> >> +	int trigger_mode;
> >> +
> >> +	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
> >> +		trigger_mode = IOAPIC_LEVEL_TRIG;
> >> +	else
> >> +		trigger_mode = IOAPIC_EDGE_TRIG;
> >> +
> >> +	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
> >> +		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
> >> +	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
> > More code duplication. Why not call apic_set_eoi() and skip the isr/ppr
> > logic there if vid is enabled, or put the logic in a common function and
> > call it from both places.
> 
> Ok, will change it in next patch.
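One way to share the logic (a sketch; apic_set_eoi_common is a hypothetical helper that apic_set_eoi() would call after its vISR/vPPR handling, and that the accelerated path would call directly):

static void apic_set_eoi_common(struct kvm_lapic *apic, int vector)
{
	int trigger_mode;

	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
		trigger_mode = IOAPIC_LEVEL_TRIG;
	else
		trigger_mode = IOAPIC_EDGE_TRIG;

	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}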
> 
> >> +}
> >> +EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
> >> +
> >>  static void apic_send_ipi(struct kvm_lapic *apic)
> >>  {
> >>  	u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
> >> @@ -1510,6 +1539,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
> >>  	kvm_lapic_reset(vcpu);
> >>  	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
> >> +	if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
> >> +		apic->vid_enabled = true;
> > What do you have vid_enabled for? This is global, not per-apic, state.
> When injecting an interrupt into the guest, we need this to check whether vid is enabled. If not, we use the old way to handle the interrupt.
> I think putting it in apic is reasonable. Though all vcpus use the same configuration, the APICv feature is per vcpu too.
> 
How is APICv per vcpu? It is global. Just call has_virtual_interrupt_delivery(vcpu)
instead of the vid_enabled thing.
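Dropping the per-apic flag, the helper reduces to something like (sketch):

static inline bool kvm_apic_vid_enabled(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic &&
	       kvm_x86_ops->has_virtual_interrupt_delivery(vcpu);
}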

> > 
> >>  	return 0;
> >>  nomem_free_apic:
> >>  	kfree(apic);
> >> @@ -1533,6 +1564,17 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
> >>  	return highest_irr;
> >>  }
> >> +int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu)
> >> +{
> >> +	struct kvm_lapic *apic = vcpu->arch.apic;
> >> +
> >> +	if (!apic || !apic_enabled(apic))
> >> +		return -1;
> >> +
> >> +	return apic_find_highest_irr(apic);
> >> +}
> >> +EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
> >> +
> >>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
> >>  {
> >>  	u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
> >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> >> index c42f111..2503a64 100644
> >> --- a/arch/x86/kvm/lapic.h
> >> +++ b/arch/x86/kvm/lapic.h
> >> @@ -20,6 +20,7 @@ struct kvm_lapic {
> >>  	u32 divide_count;
> >>  	struct kvm_vcpu *vcpu;
> >>  	bool irr_pending;
> >> +	bool vid_enabled;
> >>  	/* Number of bits set in ISR. */
> >>  	s16 isr_count;
> >>  	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
> >> @@ -39,6 +40,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
> >>  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
> >>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
> >>  int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
> >> +int kvm_cpu_has_extint(struct kvm_vcpu *v);
> >> +int kvm_cpu_get_extint(struct kvm_vcpu *v);
> >> +int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
> >>  void kvm_lapic_reset(struct kvm_vcpu *vcpu);
> >>  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
> >>  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
> >> @@ -50,6 +54,8 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
> >>  int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
> >>  int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
> >>  int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
> >> +void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
> >> +		int need_eoi, int global);
> >>  int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
> >>  
> >>  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> >> @@ -65,6 +71,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
> >>  void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
> >>  
> >>  int kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
> >> +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
> >> 
> >>  void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
> >>  void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
> >> @@ -81,6 +88,12 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
> >>  	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
> >>  }
> >> +static inline bool kvm_apic_vid_enabled(struct kvm_vcpu *vcpu)
> >> +{
> >> +	struct kvm_lapic *apic = vcpu->arch.apic;
> >> +	return apic->vid_enabled;
> >> +}
> > vcpu->arch.apic can be NULL from where this is called.
> > 
> >> +
> >>  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
> >>  void kvm_lapic_init(void);
> >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> >> index d017df3..b290aba 100644
> >> --- a/arch/x86/kvm/svm.c
> >> +++ b/arch/x86/kvm/svm.c
> >> @@ -3564,6 +3564,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
> >>  		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
> >>  }
> >> +static int svm_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
> >>  {
> >>  	struct vcpu_svm *svm = to_svm(vcpu);
> >> @@ -4283,6 +4288,7 @@ static struct kvm_x86_ops svm_x86_ops = {
> >>  	.enable_nmi_window = enable_nmi_window,
> >>  	.enable_irq_window = enable_irq_window,
> >>  	.update_cr8_intercept = update_cr8_intercept,
> >> +	.has_virtual_interrupt_delivery = svm_has_virtual_interrupt_delivery,
> >> 
> >>  	.set_tss_addr = svm_set_tss_addr,
> >>  	.get_tdp_level = get_npt_level,
> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> index e9287aa..c0d74ce 100644
> >> --- a/arch/x86/kvm/vmx.c
> >> +++ b/arch/x86/kvm/vmx.c
> >> @@ -86,6 +86,9 @@ module_param(fasteoi, bool, S_IRUGO);
> >>  static bool __read_mostly enable_apicv_reg = 0;
> >>  module_param(enable_apicv_reg, bool, S_IRUGO);
> >> +static bool __read_mostly enable_apicv_vid = 0;
> >> +module_param(enable_apicv_vid, bool, S_IRUGO);
> >> +
> >>  /*
> >>   * If nested=1, nested virtualization is supported, i.e., guests may use
> >>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
> >> @@ -432,6 +435,10 @@ struct vcpu_vmx {
> >> 
> >>  	bool rdtscp_enabled;
> >> +	u8 eoi_exitmap_changed;
> >> +	u64 eoi_exit_bitmap[4];
> >> +	u64 eoi_exit_bitmap_global[4];
> >> +
> >>  	/* Support for a guest hypervisor (nested VMX) */
> >>  	struct nested_vmx nested;
> >>  };
> >> @@ -770,6 +777,12 @@ static inline bool cpu_has_vmx_apic_register_virt(void)
> >>  		SECONDARY_EXEC_APIC_REGISTER_VIRT;
> >>  }
> >> +static inline bool cpu_has_vmx_virtual_intr_delivery(void)
> >> +{
> >> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> >> +		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
> >> +}
> >> +
> >>  static inline bool cpu_has_vmx_flexpriority(void)
> >>  {
> >>  	return cpu_has_vmx_tpr_shadow() &&
> >> @@ -2480,7 +2493,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
> >>  			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
> >>  			SECONDARY_EXEC_RDTSCP |
> >>  			SECONDARY_EXEC_ENABLE_INVPCID |
> >> -			SECONDARY_EXEC_APIC_REGISTER_VIRT;
> >> +			SECONDARY_EXEC_APIC_REGISTER_VIRT |
> >> +			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
> >>  		if (adjust_vmx_controls(min2, opt2,
> >>  					MSR_IA32_VMX_PROCBASED_CTLS2,
> >>  					&_cpu_based_2nd_exec_control) < 0)
> >> @@ -2494,7 +2508,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
> >> 
> >>  	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
> >>  		_cpu_based_2nd_exec_control &= ~(
> >> -				SECONDARY_EXEC_APIC_REGISTER_VIRT);
> >> +				SECONDARY_EXEC_APIC_REGISTER_VIRT |
> >> +				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
> >> 
> >>  	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
> >>  		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
> >> @@ -2696,6 +2711,9 @@ static __init int hardware_setup(void)
> >>  	if (!cpu_has_vmx_apic_register_virt())
> >>  		enable_apicv_reg = 0;
> >> +	if (!cpu_has_vmx_virtual_intr_delivery())
> >> +		enable_apicv_vid = 0;
> >> +
> >>  	if (nested)
> >>  		nested_vmx_setup_ctls_msrs();
> >> @@ -3811,6 +3829,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
> >>  		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> >>  	if (!enable_apicv_reg)
> >>  		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> >> +	if (!enable_apicv_vid)
> >> +		exec_control &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
> >>  	return exec_control;
> >>  }
> >> @@ -3855,6 +3875,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> >>  				vmx_secondary_exec_control(vmx));
> >>  	}
> >> +	if (enable_apicv_vid) {
> >> +		vmcs_write64(EOI_EXIT_BITMAP0, 0);
> >> +		vmcs_write64(EOI_EXIT_BITMAP1, 0);
> >> +		vmcs_write64(EOI_EXIT_BITMAP2, 0);
> >> +		vmcs_write64(EOI_EXIT_BITMAP3, 0);
> >> +
> >> +		vmcs_write16(GUEST_INTR_STATUS, 0);
> >> +	}
> >> +
> >>  	if (ple_gap) {
> >>  		vmcs_write32(PLE_GAP, ple_gap);
> >>  		vmcs_write32(PLE_WINDOW, ple_window);
> >> @@ -4770,6 +4799,16 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
> >>  	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
> >>  }
> >> +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
> >> +{
> >> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> >> +	int vector = exit_qualification & 0xff;
> >> +
> >> +	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
> >> +	kvm_apic_set_eoi_accelerated(vcpu, vector);
> >> +	return 1;
> >> +}
> >> +
> >>  static int handle_apic_write(struct kvm_vcpu *vcpu)
> >>  {
> >>  	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> >> @@ -5719,6 +5758,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
> >>  	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
> >>  	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
> >>  	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
> >> +	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
> >>  	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
> >>  	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
> >>  	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
> >> @@ -6049,6 +6089,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >> 
> >>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
> >>  {
> >> +	/* no need for tpr_threshold update if APIC virtual
> >> +	 * interrupt delivery is enabled */
> >> +	if (!enable_apicv_vid)
> >> +		return;
> > 
> > Just set kvm_x86_ops->update_cr8_intercept to NULL if !enable_apicv_vid
> > and the function will not be called.
> 
> Sure.
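That is, hardware_setup() would clear the hook once enable_apicv_vid is known, with callers testing the pointer first (a sketch):

	/* in hardware_setup(), after enable_apicv_vid is finalized */
	if (enable_apicv_vid)
		kvm_x86_ops->update_cr8_intercept = NULL;

	/* at the call site */
	if (kvm_x86_ops->update_cr8_intercept)
		kvm_x86_ops->update_cr8_intercept(vcpu, tpr, irr);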
>  
> >> +
> >>  	if (irr == -1 || tpr < irr) {
> >>  		vmcs_write32(TPR_THRESHOLD, 0);
> >>  		return;
> >> @@ -6057,6 +6102,79 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
> >>  	vmcs_write32(TPR_THRESHOLD, irr);
> >>  }
> >> +static int vmx_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
> >> +{
> >> +	return irqchip_in_kernel(vcpu->kvm) && enable_apicv_vid;
> >> +}
> >> +
> >> +static void vmx_update_irq(struct kvm_vcpu *vcpu)
> >> +{
> >> +	u16 status;
> >> +	u8 old;
> >> +	int vector;
> >> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +
> >> +	if (!enable_apicv_vid)
> >> +		return;
> > Ditto. Set kvm_x86_ops->update_irq to a function that does nothing if
> > !enable_apicv_vid. BTW you do not set this callback in SVM code and call
> > it unconditionally.
> > 
> >> +
> >> +	vector = kvm_apic_get_highest_irr(vcpu);
> >> +	if (vector == -1)
> >> +		return;
> >> +
> >> +	status = vmcs_read16(GUEST_INTR_STATUS);
> >> +	old = (u8)status & 0xff;
> >> +	if ((u8)vector != old) {
> >> +		status &= ~0xff;
> >> +		status |= (u8)vector;
> >> +		vmcs_write16(GUEST_INTR_STATUS, status);
> >> +	}
> > Please write RVI accessor functions.
> Sure.
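Possible accessors (a sketch; the names are illustrative):

static u8 vmx_get_rvi(void)
{
	return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
}

static void vmx_set_rvi(int vector)
{
	u16 status = vmcs_read16(GUEST_INTR_STATUS);
	u8 old = status & 0xff;

	if ((u8)vector != old) {
		status &= ~0xff;
		status |= (u8)vector;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}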
> 
> >> +
> >> +	if (vmx->eoi_exitmap_changed) {
> >> +#define UPDATE_EOI_EXITMAP(v, e) {				\
> >> +	if ((v)->eoi_exitmap_changed & (1 << (e)))	\
> >> +		vmcs_write64(EOI_EXIT_BITMAP##e,		\
> >> +		(v)->eoi_exit_bitmap[e] | (v)->eoi_exit_bitmap_global[e]); }
> > An inline function would do. But why calculate this on each entry? We want
> > EOI exits only for level-triggered IOAPIC interrupts and edge-triggered
> > IOAPIC interrupts with registered notifiers. This configuration rarely changes.
> 
> eoi_exitmap_changed is used to track whether the trigger mode has changed. As you said, it changes rarely, so this code will seldom be executed.
> 
But the code still checks whether the bitmap was changed on each interrupt
injection. Recalculate the bitmap when a notifier is added/removed or the
ioapic configuration changes. Use a request bit to reload the new bitmap.
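In outline (a sketch; KVM_REQ_EOIBITMAP and load_eoi_exitmap are hypothetical names for the request bit and the reload hook):

	/* wherever the ioapic routing or a notifier registration changes */
	kvm_make_request(KVM_REQ_EOIBITMAP, vcpu);

	/* in vcpu_enter_guest(), before entering the guest */
	if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
		kvm_x86_ops->load_eoi_exitmap(vcpu);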

> > 
> > 
> >> +
> >> +		UPDATE_EOI_EXITMAP(vmx, 0);
> >> +		UPDATE_EOI_EXITMAP(vmx, 1);
> >> +		UPDATE_EOI_EXITMAP(vmx, 2);
> >> +		UPDATE_EOI_EXITMAP(vmx, 3);
> >> +		vmx->eoi_exitmap_changed = 0;
> >> +	}
> >> +}
> >> +
> >> +static void vmx_set_eoi_exitmap(struct kvm_vcpu *vcpu,
> >> +				int vector,
> >> +				int need_eoi, int global)
> >> +{
> >> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +	int index, offset, changed;
> >> +	unsigned long *eoi_exitmap;
> >> +
> >> +	if (!enable_apicv_vid)
> >> +		return;
> >> +
> >> +	if (WARN_ONCE((vector < 0) || (vector > 255),
> >> +		"KVM VMX: vector (%d) out of range\n", vector))
> >> +		return;
> >> +
> >> +	index = vector >> 6;
> >> +	offset = vector & 63;
> >> +	if (global)
> >> +		eoi_exitmap =
> >> +		    (unsigned long *)&vmx->eoi_exit_bitmap_global[index];
> >> +	else
> >> +		eoi_exitmap = (unsigned long *)&vmx->eoi_exit_bitmap[index];
> >> +
> >> +	if (need_eoi)
> >> +		changed = !test_and_set_bit(offset, eoi_exitmap);
> >> +	else
> >> +		changed = test_and_clear_bit(offset, eoi_exitmap);
> >> +
> >> +	if (changed)
> >> +		vmx->eoi_exitmap_changed |= 1 << index;
> >> +}
> >> +
> >>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
> >>  {
> >>  	u32 exit_intr_info;
> >> @@ -7320,6 +7438,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
> >>  	.enable_nmi_window = enable_nmi_window,
> >>  	.enable_irq_window = enable_irq_window,
> >>  	.update_cr8_intercept = update_cr8_intercept,
> >> +	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
> >> +	.update_irq = vmx_update_irq,
> > You need to initialize this one in svm.c too.
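For svm.c that would mean wiring up no-op stubs, e.g. (sketch):

static void svm_update_irq(struct kvm_vcpu *vcpu)
{
	/* no virtual interrupt delivery on SVM; nothing to update */
}

static void svm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
				int need_eoi, int global)
{
	/* nothing to do without an EOI exit bitmap */
}

and then in svm_x86_ops:

	.update_irq = svm_update_irq,
	.set_eoi_exitmap = svm_set_eoi_exitmap,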
> > 
> >> +	.set_eoi_exitmap = vmx_set_eoi_exitmap,
> >> 
> >>  	.set_tss_addr = vmx_set_tss_addr,
> >>  	.get_tdp_level = get_ept_level,
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> index 4f76417..8b8de3b 100644
> >> --- a/arch/x86/kvm/x86.c
> >> +++ b/arch/x86/kvm/x86.c
> >> @@ -5190,6 +5190,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
> >>  			vcpu->arch.nmi_injected = true;
> >>  			kvm_x86_ops->set_nmi(vcpu);
> >>  		}
> >> +	} else if (kvm_apic_vid_enabled(vcpu)) {
> >> +		if (kvm_cpu_has_extint(vcpu) &&
> >> +		    kvm_x86_ops->interrupt_allowed(vcpu)) {
> >> +			kvm_queue_interrupt(vcpu,
> >> +				kvm_cpu_get_extint(vcpu), false);
> >> +			kvm_x86_ops->set_irq(vcpu);
> >> +		}
> > Drop all this and modify kvm_cpu_has_interrupt()/kvm_cpu_get_interrupt()
> > to consider apic interrupts only if vid is disabled; then the if below
> > will just work.
> Ok.
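The read side would mirror the has_interrupt sketch above, e.g. (sketch):

int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
	int vector = kvm_cpu_get_extint(v);

	if (vector != -1)
		return vector;	/* non-APIC source */

	if (kvm_apic_vid_enabled(v))
		return -1;	/* APIC interrupts are injected by hardware */

	return kvm_get_apic_interrupt(v);	/* APIC */
}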
> 
> > 
> >>  	} else if (kvm_cpu_has_interrupt(vcpu)) {
> >>  		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
> >>  			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
> >> @@ -5289,12 +5296,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >>  	}
> >>  
> >>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> >> +		/* update architecture specific hints for APIC
> >> +		 * virtual interrupt delivery */
> >> +		kvm_x86_ops->update_irq(vcpu);
> >> +
> >>  		inject_pending_event(vcpu);
> >>  
> >>  		/* enable NMI/IRQ window open exits if needed */
> >>  		if (vcpu->arch.nmi_pending)
> >>  			kvm_x86_ops->enable_nmi_window(vcpu);
> >> -		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
> >> +		else if (kvm_apic_vid_enabled(vcpu)) {
> >> +			if (kvm_cpu_has_extint(vcpu))
> >> +				kvm_x86_ops->enable_irq_window(vcpu);
> > Same as above. With a proper kvm_cpu_has_interrupt() implementation
> > this is not needed.
> > 
> >> +		} else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
> >>  			kvm_x86_ops->enable_irq_window(vcpu);
> >>  
> >>  		if (kvm_lapic_enabled(vcpu)) {
> >> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> >> index 166c450..898aa62 100644
> >> --- a/virt/kvm/ioapic.c
> >> +++ b/virt/kvm/ioapic.c
> >> @@ -186,6 +186,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
> >>  		/* need to read apic_id from apic regiest since
> >>  		 * it can be rewritten */
> >>  		irqe.dest_id = ioapic->kvm->bsp_vcpu_id;
> >> +		kvm_set_eoi_exitmap(ioapic->kvm->vcpus[0], irqe.vector, 1, 1);
> >>  	}
> >>  #endif
> >>  	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
> >> --
> >> 1.7.1
> > 
> > --
> > 			Gleb.
> 
> 
> Best regards,
> Yang
> 
> 

--
			Gleb.

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f4..8e07a86 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -682,6 +682,10 @@  struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
+	void (*update_irq)(struct kvm_vcpu *vcpu);
+	void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
+			int need_eoi, int global);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 21101b6..1003341 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -62,6 +62,7 @@ 
 #define EXIT_REASON_MCE_DURING_VMENTRY  41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD              54
@@ -143,6 +144,7 @@ 
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
 #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 
@@ -180,6 +182,7 @@  enum vmcs_field {
 	GUEST_GS_SELECTOR               = 0x0000080a,
 	GUEST_LDTR_SELECTOR             = 0x0000080c,
 	GUEST_TR_SELECTOR               = 0x0000080e,
+	GUEST_INTR_STATUS               = 0x00000810,
 	HOST_ES_SELECTOR                = 0x00000c00,
 	HOST_CS_SELECTOR                = 0x00000c02,
 	HOST_SS_SELECTOR                = 0x00000c04,
@@ -207,6 +210,14 @@  enum vmcs_field {
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
+	EOI_EXIT_BITMAP0                = 0x0000201c,
+	EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
+	EOI_EXIT_BITMAP1                = 0x0000201e,
+	EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
+	EOI_EXIT_BITMAP2                = 0x00002020,
+	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
+	EOI_EXIT_BITMAP3                = 0x00002022,
+	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
 	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
 	VMCS_LINK_POINTER               = 0x00002800,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1..c7356a3 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -60,6 +60,29 @@  int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 
 /*
+ * check if there is pending interrupt without
+ * intack. This _apicv version is used when hardware
+ * supports APIC virtualization with virtual interrupt
+ * delivery support. In such a case, KVM is not required
+ * to poll pending APIC interrupts, and thus this
+ * interface is used to poll pending interrupts from
+ * non-APIC sources.
+ */
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+	struct kvm_pic *s;
+
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.interrupt.pending;
+
+	if (kvm_apic_accept_pic_intr(v)) {
+		s = pic_irqchip(v->kvm);	/* PIC */
+		return s->output;
+	} else
+		return 0;
+}
+
+/*
  * Read pending interrupt vector and intack.
  */
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
@@ -82,6 +105,27 @@  int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
+/*
+ * Read pending interrupt vector and intack.
+ * Similar to kvm_cpu_has_extint, to get
+ * interrupts from non-APIC sources.
+ */
+int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+	struct kvm_pic *s;
+	int vector = -1;
+
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.interrupt.nr;
+
+	if (kvm_apic_accept_pic_intr(v)) {
+		s = pic_irqchip(v->kvm);
+		s->output = 0;		/* PIC */
+		vector = kvm_pic_read_irq(v->kvm);
+	}
+	return vector;
+}
+
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a63ffdc..af48361 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -643,6 +643,12 @@  out:
 	return ret;
 }
 
+void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
+		int need_eoi, int global)
+{
+	kvm_x86_ops->set_eoi_exitmap(vcpu, vector, need_eoi, global);
+}
+
 /*
  * Add a pending IRQ into lapic.
  * Return 1 if successfully added and 0 if discarded.
@@ -664,8 +670,11 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (trig_mode) {
 			apic_debug("level trig mode for vector %d", vector);
 			apic_set_vector(vector, apic->regs + APIC_TMR);
-		} else
+			kvm_set_eoi_exitmap(vcpu, vector, 1, 0);
+		} else {
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
+			kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
+		}
 
 		result = !apic_test_and_set_irr(vector, apic);
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
@@ -769,6 +778,26 @@  static int apic_set_eoi(struct kvm_lapic *apic)
 	return vector;
 }
 
+/*
+ * this interface assumes a trap-like exit, which has already performed
+ * the desired side effects, including vISR and vPPR updates.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	int trigger_mode;
+
+	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
+		trigger_mode = IOAPIC_LEVEL_TRIG;
+	else
+		trigger_mode = IOAPIC_EDGE_TRIG;
+
+	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
+		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
 static void apic_send_ipi(struct kvm_lapic *apic)
 {
 	u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
@@ -1510,6 +1539,8 @@  int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	kvm_lapic_reset(vcpu);
 	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
+	if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
+		apic->vid_enabled = true;
 	return 0;
 nomem_free_apic:
 	kfree(apic);
@@ -1533,6 +1564,17 @@  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 	return highest_irr;
 }
 
+int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!apic || !apic_enabled(apic))
+		return -1;
+
+	return apic_find_highest_irr(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
+
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
 	u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c42f111..2503a64 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -20,6 +20,7 @@  struct kvm_lapic {
 	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	bool irr_pending;
+	bool vid_enabled;
 	/* Number of bits set in ISR. */
 	s16 isr_count;
 	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
@@ -39,6 +40,9 @@  void kvm_free_lapic(struct kvm_vcpu *vcpu);
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_extint(struct kvm_vcpu *v);
+int kvm_cpu_get_extint(struct kvm_vcpu *v);
+int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -50,6 +54,8 @@  void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
+		int need_eoi, int global);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
@@ -65,6 +71,7 @@  u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
 
 int kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
@@ -81,6 +88,12 @@  static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
 	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
 }
 
+static inline bool kvm_apic_vid_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	return apic->vid_enabled;
+}
+
 int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
 void kvm_lapic_init(void);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3..b290aba 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3564,6 +3564,11 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
+static int svm_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4283,6 +4288,7 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.has_virtual_interrupt_delivery = svm_has_virtual_interrupt_delivery,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e9287aa..c0d74ce 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -86,6 +86,9 @@  module_param(fasteoi, bool, S_IRUGO);
 static bool __read_mostly enable_apicv_reg = 0;
 module_param(enable_apicv_reg, bool, S_IRUGO);
 
+static bool __read_mostly enable_apicv_vid = 0;
+module_param(enable_apicv_vid, bool, S_IRUGO);
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -432,6 +435,10 @@  struct vcpu_vmx {
 
 	bool rdtscp_enabled;
 
+	u8 eoi_exitmap_changed;
+	u64 eoi_exit_bitmap[4];
+	u64 eoi_exit_bitmap_global[4];
+
 	/* Support for a guest hypervisor (nested VMX) */
 	struct nested_vmx nested;
 };
@@ -770,6 +777,12 @@  static inline bool cpu_has_vmx_apic_register_virt(void)
 		SECONDARY_EXEC_APIC_REGISTER_VIRT;
 }
 
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -2480,7 +2493,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
 			SECONDARY_EXEC_RDTSCP |
 			SECONDARY_EXEC_ENABLE_INVPCID |
-			SECONDARY_EXEC_APIC_REGISTER_VIRT;
+			SECONDARY_EXEC_APIC_REGISTER_VIRT |
+			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -2494,7 +2508,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 
 	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
 		_cpu_based_2nd_exec_control &= ~(
-				SECONDARY_EXEC_APIC_REGISTER_VIRT);
+				SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 
 	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
 		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
@@ -2696,6 +2711,9 @@  static __init int hardware_setup(void)
 	if (!cpu_has_vmx_apic_register_virt())
 		enable_apicv_reg = 0;
 
+	if (!cpu_has_vmx_virtual_intr_delivery())
+		enable_apicv_vid = 0;
+
 	if (nested)
 		nested_vmx_setup_ctls_msrs();
 
@@ -3811,6 +3829,8 @@  static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 	if (!enable_apicv_reg)
 		exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+	if (!enable_apicv_vid)
+		exec_control &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 	return exec_control;
 }
 
@@ -3855,6 +3875,15 @@  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				vmx_secondary_exec_control(vmx));
 	}
 
+	if (enable_apicv_vid) {
+		vmcs_write64(EOI_EXIT_BITMAP0, 0);
+		vmcs_write64(EOI_EXIT_BITMAP1, 0);
+		vmcs_write64(EOI_EXIT_BITMAP2, 0);
+		vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+		vmcs_write16(GUEST_INTR_STATUS, 0);
+	}
+
 	if (ple_gap) {
 		vmcs_write32(PLE_GAP, ple_gap);
 		vmcs_write32(PLE_WINDOW, ple_window);
@@ -4770,6 +4799,16 @@  static int handle_apic_access(struct kvm_vcpu *vcpu)
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	int vector = exit_qualification & 0xff;
+
+	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+	kvm_apic_set_eoi_accelerated(vcpu, vector);
+	return 1;
+}
+
 static int handle_apic_write(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -5719,6 +5758,7 @@  static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
+	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
 	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
@@ -6049,6 +6089,11 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
+	/* no need for tpr_threshold update if APIC virtual
+	 * interrupt delivery is enabled */
+	if (!enable_apicv_vid)
+		return;
+
 	if (irr == -1 || tpr < irr) {
 		vmcs_write32(TPR_THRESHOLD, 0);
 		return;
@@ -6057,6 +6102,79 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
+static int vmx_has_virtual_interrupt_delivery(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kvm) && enable_apicv_vid;
+}
+
+static void vmx_update_irq(struct kvm_vcpu *vcpu)
+{
+	u16 status;
+	u8 old;
+	int vector;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!enable_apicv_vid)
+		return;
+
+	vector = kvm_apic_get_highest_irr(vcpu);
+	if (vector == -1)
+		return;
+
+	status = vmcs_read16(GUEST_INTR_STATUS);
+	old = (u8)status & 0xff;
+	if ((u8)vector != old) {
+		status &= ~0xff;
+		status |= (u8)vector;
+		vmcs_write16(GUEST_INTR_STATUS, status);
+	}
+
+	if (vmx->eoi_exitmap_changed) {
+#define UPDATE_EOI_EXITMAP(v, e) {				\
+	if ((v)->eoi_exitmap_changed & (1 << (e)))	\
+		vmcs_write64(EOI_EXIT_BITMAP##e,		\
+		(v)->eoi_exit_bitmap[e] | (v)->eoi_exit_bitmap_global[e]); }
+
+		UPDATE_EOI_EXITMAP(vmx, 0);
+		UPDATE_EOI_EXITMAP(vmx, 1);
+		UPDATE_EOI_EXITMAP(vmx, 2);
+		UPDATE_EOI_EXITMAP(vmx, 3);
+		vmx->eoi_exitmap_changed = 0;
+	}
+}
+
+static void vmx_set_eoi_exitmap(struct kvm_vcpu *vcpu,
+				int vector,
+				int need_eoi, int global)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int index, offset, changed;
+	unsigned long *eoi_exitmap;
+
+	if (!enable_apicv_vid)
+		return;
+
+	if (WARN_ONCE((vector < 0) || (vector > 255),
+		"KVM VMX: vector (%d) out of range\n", vector))
+		return;
+
+	index = vector >> 6;
+	offset = vector & 63;
+	if (global)
+		eoi_exitmap =
+		    (unsigned long *)&vmx->eoi_exit_bitmap_global[index];
+	else
+		eoi_exitmap = (unsigned long *)&vmx->eoi_exit_bitmap[index];
+
+	if (need_eoi)
+		changed = !test_and_set_bit(offset, eoi_exitmap);
+	else
+		changed = test_and_clear_bit(offset, eoi_exitmap);
+
+	if (changed)
+		vmx->eoi_exitmap_changed |= 1 << index;
+}
+
 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -7320,6 +7438,9 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
+	.update_irq = vmx_update_irq,
+	.set_eoi_exitmap = vmx_set_eoi_exitmap,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f76417..8b8de3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5190,6 +5190,13 @@  static void inject_pending_event(struct kvm_vcpu *vcpu)
 			vcpu->arch.nmi_injected = true;
 			kvm_x86_ops->set_nmi(vcpu);
 		}
+	} else if (kvm_apic_vid_enabled(vcpu)) {
+		if (kvm_cpu_has_extint(vcpu) &&
+		    kvm_x86_ops->interrupt_allowed(vcpu)) {
+			kvm_queue_interrupt(vcpu,
+				kvm_cpu_get_extint(vcpu), false);
+			kvm_x86_ops->set_irq(vcpu);
+		}
 	} else if (kvm_cpu_has_interrupt(vcpu)) {
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
@@ -5289,12 +5296,19 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		/* update architecture specific hints for APIC
+		 * virtual interrupt delivery */
+		kvm_x86_ops->update_irq(vcpu);
+
 		inject_pending_event(vcpu);
 
 		/* enable NMI/IRQ window open exits if needed */
 		if (vcpu->arch.nmi_pending)
 			kvm_x86_ops->enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		else if (kvm_apic_vid_enabled(vcpu)) {
+			if (kvm_cpu_has_extint(vcpu))
+				kvm_x86_ops->enable_irq_window(vcpu);
+		} else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
 			kvm_x86_ops->enable_irq_window(vcpu);
 
 		if (kvm_lapic_enabled(vcpu)) {
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 166c450..898aa62 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -186,6 +186,7 @@  static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		/* need to read apic_id from apic regiest since
 		 * it can be rewritten */
 		irqe.dest_id = ioapic->kvm->bsp_vcpu_id;
+		kvm_set_eoi_exitmap(ioapic->kvm->vcpus[0], irqe.vector, 1, 1);
 	}
 #endif
 	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);