Message ID | 1359968714-13820-3-git-send-email-yang.z.zhang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, Feb 04, 2013 at 05:05:14PM +0800, Yang Zhang wrote: > From: Yang Zhang <yang.z.zhang@Intel.com> > > Posted Interrupt allows APIC interrupts to inject into guest directly > without any vmexit. > > - When delivering a interrupt to guest, if target vcpu is running, > update Posted-interrupt requests bitmap and send a notification event > to the vcpu. Then the vcpu will handle this interrupt automatically, > without any software involvemnt. > > - If target vcpu is not running or there already a notification event > pending in the vcpu, do nothing. The interrupt will be handled by > next vm entry > > Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com> > --- > arch/x86/include/asm/entry_arch.h | 1 + > arch/x86/include/asm/hw_irq.h | 1 + > arch/x86/include/asm/irq_vectors.h | 4 + > arch/x86/include/asm/kvm_host.h | 3 + > arch/x86/include/asm/vmx.h | 4 + > arch/x86/kernel/entry_64.S | 5 + > arch/x86/kernel/irq.c | 19 ++++ > arch/x86/kernel/irqinit.c | 4 + > arch/x86/kvm/lapic.c | 15 +++- > arch/x86/kvm/lapic.h | 1 + > arch/x86/kvm/svm.c | 6 ++ > arch/x86/kvm/vmx.c | 164 +++++++++++++++++++++++++++++++----- > arch/x86/kvm/x86.c | 4 + > include/linux/kvm_host.h | 1 + > 14 files changed, 208 insertions(+), 24 deletions(-) > > diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h > index 40afa00..7b0a29e 100644 > --- a/arch/x86/include/asm/entry_arch.h > +++ b/arch/x86/include/asm/entry_arch.h > @@ -18,6 +18,7 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) > #endif > > BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) > +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR) Missing CONFIG_HAVE_KVM ifdef. Have you verified that this patch compiles with KVM support disabled? Also give it a name that will associate it with KVM. 
> > /* > * every pentium local APIC has two 'local interrupts', with a > diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h > index eb92a6e..ee61af3 100644 > --- a/arch/x86/include/asm/hw_irq.h > +++ b/arch/x86/include/asm/hw_irq.h > @@ -28,6 +28,7 @@ > /* Interrupt handlers registered during init_IRQ */ > extern void apic_timer_interrupt(void); > extern void x86_platform_ipi(void); > +extern void posted_intr_ipi(void); > extern void error_interrupt(void); > extern void irq_work_interrupt(void); > > diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h > index 1508e51..6421a63 100644 > --- a/arch/x86/include/asm/irq_vectors.h > +++ b/arch/x86/include/asm/irq_vectors.h > @@ -102,6 +102,10 @@ > */ > #define X86_PLATFORM_IPI_VECTOR 0xf7 > > +#ifdef CONFIG_HAVE_KVM > +#define POSTED_INTR_VECTOR 0xf2 > +#endif > + > /* > * IRQ work vector: > */ > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index b8388e9..bab1c0a 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -704,6 +704,9 @@ struct kvm_x86_ops { > void (*hwapic_isr_update)(struct kvm *kvm, int isr); > void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); > void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); > + bool (*send_notification_event)(struct kvm_vcpu *vcpu, > + int vector, int *result); > + bool (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); > int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); > int (*get_tdp_level)(void); > u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index 694586c..f5ec72c 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -153,6 +153,7 @@ > #define PIN_BASED_EXT_INTR_MASK 0x00000001 > #define PIN_BASED_NMI_EXITING 0x00000008 > #define PIN_BASED_VIRTUAL_NMIS 0x00000020 > +#define PIN_BASED_POSTED_INTR 
0x00000080 > > #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 > #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 > @@ -175,6 +176,7 @@ > /* VMCS Encodings */ > enum vmcs_field { > VIRTUAL_PROCESSOR_ID = 0x00000000, > + POSTED_INTR_NV = 0x00000002, > GUEST_ES_SELECTOR = 0x00000800, > GUEST_CS_SELECTOR = 0x00000802, > GUEST_SS_SELECTOR = 0x00000804, > @@ -209,6 +211,8 @@ enum vmcs_field { > VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, > APIC_ACCESS_ADDR = 0x00002014, > APIC_ACCESS_ADDR_HIGH = 0x00002015, > + POSTED_INTR_DESC_ADDR = 0x00002016, > + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, > EPT_POINTER = 0x0000201a, > EPT_POINTER_HIGH = 0x0000201b, > EOI_EXIT_BITMAP0 = 0x0000201c, > diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S > index 70641af..c6c47a3 100644 > --- a/arch/x86/kernel/entry_64.S > +++ b/arch/x86/kernel/entry_64.S > @@ -1177,6 +1177,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \ > apicinterrupt X86_PLATFORM_IPI_VECTOR \ > x86_platform_ipi smp_x86_platform_ipi > > +#ifdef CONFIG_HAVE_KVM > +apicinterrupt POSTED_INTR_VECTOR \ > + posted_intr_ipi smp_posted_intr_ipi > +#endif > + > apicinterrupt THRESHOLD_APIC_VECTOR \ > threshold_interrupt smp_threshold_interrupt > apicinterrupt THERMAL_APIC_VECTOR \ > diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c > index e4595f1..3551cf2 100644 > --- a/arch/x86/kernel/irq.c > +++ b/arch/x86/kernel/irq.c > @@ -228,6 +228,25 @@ void smp_x86_platform_ipi(struct pt_regs *regs) > set_irq_regs(old_regs); > } > > +/* > + * Handler for POSTED_INTERRUPT_VECTOR. > + */ #ifdef CONFIG_HAVE_KVM > +void smp_posted_intr_ipi(struct pt_regs *regs) > +{ > + struct pt_regs *old_regs = set_irq_regs(regs); > + > + ack_APIC_irq(); > + > + irq_enter(); > + > + exit_idle(); > + > + irq_exit(); > + > + set_irq_regs(old_regs); > +} > + > + One blank line is enough. 
> EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); > > #ifdef CONFIG_HOTPLUG_CPU > diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c > index 6e03b0d..f90c5ae 100644 > --- a/arch/x86/kernel/irqinit.c > +++ b/arch/x86/kernel/irqinit.c > @@ -205,6 +205,10 @@ static void __init apic_intr_init(void) > > /* IPI for X86 platform specific use */ > alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); > +#ifdef CONFIG_HAVE_KVM > + /* IPI for posted interrupt use */ > + alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi); > +#endif > > /* IPI vectors for APIC spurious and error interrupts */ > alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 02b51dd..df6b6a3 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -379,6 +379,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) > if (!apic->irr_pending) > return -1; > > + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); > result = apic_search_irr(apic); > ASSERT(result == -1 || result >= 16); > > @@ -685,6 +686,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > { > int result = 0; > struct kvm_vcpu *vcpu = apic->vcpu; > + bool send = false; > > switch (delivery_mode) { > case APIC_DM_LOWEST: > @@ -700,7 +702,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > } else > apic_clear_vector(vector, apic->regs + APIC_TMR); > > - result = !apic_test_and_set_irr(vector, apic); > + if (kvm_x86_ops->vm_has_apicv(vcpu->kvm)) Just call send_notification_event() and do the check inside. And call it deliver_posted_interrupt() or something. It does more than just sends notification event. Actually it may not send it at all. 
> + send = kvm_x86_ops->send_notification_event(vcpu, > + vector, &result); > + else > + result = !apic_test_and_set_irr(vector, apic); > + > trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, > trig_mode, vector, !result); > if (!result) { > @@ -710,8 +717,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > break; > } > > - kvm_make_request(KVM_REQ_EVENT, vcpu); > - kvm_vcpu_kick(vcpu); > + if (!send) { > + kvm_make_request(KVM_REQ_EVENT, vcpu); > + kvm_vcpu_kick(vcpu); > + } > break; > > case APIC_DM_REMRD: > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 1676d34..632111f 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -46,6 +46,7 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); > void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); > u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); > void kvm_apic_set_version(struct kvm_vcpu *vcpu); > +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir); > > int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); > int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index a7d60d7..37f961d 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -3591,6 +3591,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr) > return; > } > > +static bool svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) > +{ > + return false; > +} > + > static int svm_nmi_allowed(struct kvm_vcpu *vcpu) > { > struct vcpu_svm *svm = to_svm(vcpu); > @@ -4319,6 +4324,7 @@ static struct kvm_x86_ops svm_x86_ops = { > .vm_has_apicv = svm_vm_has_apicv, > .load_eoi_exitmap = svm_load_eoi_exitmap, > .hwapic_isr_update = svm_hwapic_isr_update, > + .sync_pir_to_irr = svm_sync_pir_to_irr, > > .set_tss_addr = svm_set_tss_addr, > .get_tdp_level = get_npt_level, > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index e826d29..d2b02f2 100644 > --- a/arch/x86/kvm/vmx.c > +++ 
b/arch/x86/kvm/vmx.c > @@ -84,8 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); > static bool __read_mostly fasteoi = 1; > module_param(fasteoi, bool, S_IRUGO); > > -static bool __read_mostly enable_apicv_reg_vid = 1; > -module_param(enable_apicv_reg_vid, bool, S_IRUGO); > +static bool __read_mostly enable_apicv = 1; > +module_param(enable_apicv, bool, S_IRUGO); > > /* > * If nested=1, nested virtualization is supported, i.e., guests may use > @@ -370,6 +370,41 @@ struct nested_vmx { > struct page *apic_access_page; > }; > > +#define POSTED_INTR_ON 0 > +/* Posted-Interrupt Descriptor */ > +struct pi_desc { > + u32 pir[8]; /* Posted interrupt requested */ > + union { > + struct { > + u8 on:1, > + rsvd:7; > + } control; > + u32 rsvd[8]; > + } u; > +} __aligned(64); > + > +static bool pi_test_on(struct pi_desc *pi_desc) > +{ > + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); > +} > + > +static bool pi_test_and_set_on(struct pi_desc *pi_desc) > +{ > + return test_and_set_bit(POSTED_INTR_ON, > + (unsigned long *)&pi_desc->u.control); > +} > + > +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) > +{ > + return test_and_clear_bit(POSTED_INTR_ON, > + (unsigned long *)&pi_desc->u.control); > +} > + > +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) > +{ > + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); > +} > + > struct vcpu_vmx { > struct kvm_vcpu vcpu; > unsigned long host_rsp; > @@ -434,6 +469,9 @@ struct vcpu_vmx { > > bool rdtscp_enabled; > > + /* Posted interrupt descriptor */ > + struct pi_desc *pi; > + You haven't answered on my previous review why are you trying save 46 bytes here. 
> /* Support for a guest hypervisor (nested VMX) */ > struct nested_vmx nested; > }; > @@ -788,6 +826,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) > SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; > } > > +static inline bool cpu_has_vmx_posted_intr(void) > +{ > + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; > +} > + > +static inline bool cpu_has_vmx_apicv(void) > +{ > + return cpu_has_vmx_apic_register_virt() && > + cpu_has_vmx_virtual_intr_delivery() && > + cpu_has_vmx_posted_intr(); > +} > + > static inline bool cpu_has_vmx_flexpriority(void) > { > return cpu_has_vmx_tpr_shadow() && > @@ -2535,12 +2585,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) > u32 _vmexit_control = 0; > u32 _vmentry_control = 0; > > - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; > - opt = PIN_BASED_VIRTUAL_NMIS; > - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, > - &_pin_based_exec_control) < 0) > - return -EIO; > - > min = CPU_BASED_HLT_EXITING | > #ifdef CONFIG_X86_64 > CPU_BASED_CR8_LOAD_EXITING | > @@ -2617,6 +2661,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) > &_vmexit_control) < 0) > return -EIO; > > + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; > + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; > + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, > + &_pin_based_exec_control) < 0) > + return -EIO; > + > + if (!(_cpu_based_2nd_exec_control & > + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || > + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) > + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; > + > min = 0; > opt = VM_ENTRY_LOAD_IA32_PAT; > if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, > @@ -2795,11 +2850,10 @@ static __init int hardware_setup(void) > if (!cpu_has_vmx_ple()) > ple_gap = 0; > > - if (!cpu_has_vmx_apic_register_virt() || > - !cpu_has_vmx_virtual_intr_delivery()) > - enable_apicv_reg_vid = 0; > + if (!cpu_has_vmx_apicv()) > 
+ enable_apicv = 0; > > - if (enable_apicv_reg_vid) > + if (enable_apicv) > kvm_x86_ops->update_cr8_intercept = NULL; > else > kvm_x86_ops->hwapic_irr_update = NULL; > @@ -3868,6 +3922,61 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) > msr, MSR_TYPE_W); > } > > +static int vmx_vm_has_apicv(struct kvm *kvm) > +{ > + return enable_apicv && irqchip_in_kernel(kvm); > +} > + > +static bool vmx_send_notification_event(struct kvm_vcpu *vcpu, > + int vector, int *result) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + > + *result = !pi_test_and_set_pir(vector, vmx->pi); The problem here is that interrupt may still be pending in IRR so eventually it will be coalesced, but we report it as delivered here. I do not see solution for this yet. > + if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) { > + kvm_make_request(KVM_REQ_PENDING_PIR, vcpu); Why not set KVM_REQ_EVENT here? What this intermediate event is needed for? > + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), > + POSTED_INTR_VECTOR); > + if (!pi_test_on(vmx->pi)) Isn't it too optimistic of you to expect IPI to be delivered and processed by remote CPU by this point? 
> + clear_bit(KVM_REQ_PENDING_PIR, &vcpu->requests) ; > + return true; > + } > + return false; > +} > + > +static bool vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + struct kvm_lapic *apic = vcpu->arch.apic; > + unsigned int i, old, new, ret_val, irr_offset, pir_val; > + bool make_request = false; > + > + if (!vmx_vm_has_apicv(vcpu->kvm) || !pi_test_and_clear_on(vmx->pi)) > + return false; > + > + for (i = 0; i <= 7; i++) { > + pir_val = xchg(&vmx->pi->pir[i], 0); > + if (pir_val) { > + irr_offset = APIC_IRR + i * 0x10; > + do { > + old = kvm_apic_get_reg(apic, irr_offset); > + new = old | pir_val; > + ret_val = cmpxchg((u32 *)(apic->regs + > + irr_offset), old, new); > + } while (unlikely(ret_val != old)); > + make_request = true; > + } > + } > + > + return make_request; > +} > + > +static void free_pi(struct vcpu_vmx *vmx) > +{ > + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) > + kfree(vmx->pi); > +} > + > /* > * Set up the vmcs's constant host-state fields, i.e., host-state fields that > * will not change in the lifetime of the guest. 
> @@ -3928,6 +4037,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) > vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); > } > > +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) > +{ > + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; > + > + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) > + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; > + return pin_based_exec_ctrl; > +} > + > static u32 vmx_exec_control(struct vcpu_vmx *vmx) > { > u32 exec_control = vmcs_config.cpu_based_exec_ctrl; > @@ -3945,11 +4063,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) > return exec_control; > } > > -static int vmx_vm_has_apicv(struct kvm *kvm) > -{ > - return enable_apicv_reg_vid && irqchip_in_kernel(kvm); > -} > - > static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) > { > u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; > @@ -4005,8 +4118,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) > vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ > > /* Control */ > - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, > - vmcs_config.pin_based_exec_ctrl); > + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); > > vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); > > @@ -4015,13 +4127,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) > vmx_secondary_exec_control(vmx)); > } > > - if (enable_apicv_reg_vid) { > + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { > vmcs_write64(EOI_EXIT_BITMAP0, 0); > vmcs_write64(EOI_EXIT_BITMAP1, 0); > vmcs_write64(EOI_EXIT_BITMAP2, 0); > vmcs_write64(EOI_EXIT_BITMAP3, 0); > > vmcs_write16(GUEST_INTR_STATUS, 0); > + > + vmx->pi = kzalloc(sizeof(struct pi_desc), GFP_KERNEL); > + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); > + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi))); > } > > if (ple_gap) { > @@ -4171,6 +4287,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) > vmcs_write64(APIC_ACCESS_ADDR, > page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); 
> > + if (vmx_vm_has_apicv(vcpu->kvm)) > + memset(vmx->pi, 0, sizeof(struct pi_desc)); > + > if (vmx->vpid != 0) > vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); > > @@ -6746,6 +6865,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) > > free_vpid(vmx); > free_nested(vmx); > + free_pi(vmx); > free_loaded_vmcs(vmx->loaded_vmcs); > kfree(vmx->guest_msrs); > kvm_vcpu_uninit(vcpu); > @@ -7647,6 +7767,8 @@ static struct kvm_x86_ops vmx_x86_ops = { > .load_eoi_exitmap = vmx_load_eoi_exitmap, > .hwapic_irr_update = vmx_hwapic_irr_update, > .hwapic_isr_update = vmx_hwapic_isr_update, > + .sync_pir_to_irr = vmx_sync_pir_to_irr, > + .send_notification_event = vmx_send_notification_event, > > .set_tss_addr = vmx_set_tss_addr, > .get_tdp_level = get_ept_level, > @@ -7750,7 +7872,7 @@ static int __init vmx_init(void) > memcpy(vmx_msr_bitmap_longmode_x2apic, > vmx_msr_bitmap_longmode, PAGE_SIZE); > > - if (enable_apicv_reg_vid) { > + if (enable_apicv) { > for (msr = 0x800; msr <= 0x8ff; msr++) > vmx_disable_intercept_msr_read_x2apic(msr); > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 9f25d70..6e1e6e7 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2681,6 +2681,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) > static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, > struct kvm_lapic_state *s) > { > + kvm_x86_ops->sync_pir_to_irr(vcpu); > memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); > > return 0; > @@ -5698,6 +5699,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > kvm_deliver_pmi(vcpu); > if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) > update_eoi_exitmap(vcpu); > + if (kvm_check_request(KVM_REQ_PENDING_PIR, vcpu)) > + if (kvm_x86_ops->sync_pir_to_irr(vcpu)) > + kvm_make_request(KVM_REQ_EVENT, vcpu); > } > > if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 0350e0d..a410819 100644 > --- a/include/linux/kvm_host.h > +++ 
b/include/linux/kvm_host.h > @@ -124,6 +124,7 @@ static inline bool is_error_page(struct page *page) > #define KVM_REQ_MCLOCK_INPROGRESS 20 > #define KVM_REQ_EPR_EXIT 21 > #define KVM_REQ_EOIBITMAP 22 > +#define KVM_REQ_PENDING_PIR 23 > > #define KVM_USERSPACE_IRQ_SOURCE_ID 0 > #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 > -- > 1.7.1 -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote on 2013-02-04: > On Mon, Feb 04, 2013 at 05:05:14PM +0800, Yang Zhang wrote: >> From: Yang Zhang <yang.z.zhang@Intel.com> >> >> Posted Interrupt allows APIC interrupts to inject into guest directly >> without any vmexit. >> >> - When delivering a interrupt to guest, if target vcpu is running, >> update Posted-interrupt requests bitmap and send a notification event >> to the vcpu. Then the vcpu will handle this interrupt automatically, >> without any software involvemnt. >> - If target vcpu is not running or there already a notification event >> pending in the vcpu, do nothing. The interrupt will be handled by >> next vm entry >> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com> >> --- >> arch/x86/include/asm/entry_arch.h | 1 + >> arch/x86/include/asm/hw_irq.h | 1 + >> arch/x86/include/asm/irq_vectors.h | 4 + >> arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/asm/vmx.h >> | 4 + arch/x86/kernel/entry_64.S | 5 + >> arch/x86/kernel/irq.c | 19 ++++ >> arch/x86/kernel/irqinit.c | 4 + arch/x86/kvm/lapic.c >> | 15 +++- arch/x86/kvm/lapic.h | 1 + >> arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c >> | 164 +++++++++++++++++++++++++++++++----- >> arch/x86/kvm/x86.c | 4 + include/linux/kvm_host.h >> | 1 + 14 files changed, 208 insertions(+), 24 deletions(-) >> diff --git a/arch/x86/include/asm/entry_arch.h >> b/arch/x86/include/asm/entry_arch.h index 40afa00..7b0a29e 100644 --- >> a/arch/x86/include/asm/entry_arch.h +++ >> b/arch/x86/include/asm/entry_arch.h @@ -18,6 +18,7 @@ >> BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) >> #endif >> >> BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) >> +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR) > Missing CONFIG_HAVE_KVM ifdef. Have you verified that this patch > compiles with KVM support disabled? Also give it a name that will > associate it with KVM. Yes, but seems it is selected by x86 by default. And it always enabled when building kernel. I will remove the select in Kconfig and try again. 
>> >> /* >> * every pentium local APIC has two 'local interrupts', with a >> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h >> index eb92a6e..ee61af3 100644 >> --- a/arch/x86/include/asm/hw_irq.h >> +++ b/arch/x86/include/asm/hw_irq.h >> @@ -28,6 +28,7 @@ >> /* Interrupt handlers registered during init_IRQ */ extern void >> apic_timer_interrupt(void); extern void x86_platform_ipi(void); >> +extern void posted_intr_ipi(void); extern void error_interrupt(void); >> extern void irq_work_interrupt(void); >> diff --git a/arch/x86/include/asm/irq_vectors.h >> b/arch/x86/include/asm/irq_vectors.h index 1508e51..6421a63 100644 --- >> a/arch/x86/include/asm/irq_vectors.h +++ >> b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,10 @@ >> */ >> #define X86_PLATFORM_IPI_VECTOR 0xf7 >> +#ifdef CONFIG_HAVE_KVM >> +#define POSTED_INTR_VECTOR 0xf2 >> +#endif >> + >> /* >> * IRQ work vector: >> */ >> diff --git a/arch/x86/include/asm/kvm_host.h >> b/arch/x86/include/asm/kvm_host.h index b8388e9..bab1c0a 100644 --- >> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h >> @@ -704,6 +704,9 @@ struct kvm_x86_ops { >> void (*hwapic_isr_update)(struct kvm *kvm, int isr); >> void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); >> void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); >> + bool (*send_notification_event)(struct kvm_vcpu *vcpu, >> + int vector, int *result); >> + bool (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); >> int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); >> int (*get_tdp_level)(void); >> u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h >> index 694586c..f5ec72c 100644 >> --- a/arch/x86/include/asm/vmx.h >> +++ b/arch/x86/include/asm/vmx.h >> @@ -153,6 +153,7 @@ >> #define PIN_BASED_EXT_INTR_MASK 0x00000001 >> #define PIN_BASED_NMI_EXITING 0x00000008 >> #define PIN_BASED_VIRTUAL_NMIS 0x00000020 
>> +#define PIN_BASED_POSTED_INTR 0x00000080 >> >> #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define >> VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 @@ -175,6 +176,7 @@ >> /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID >> = 0x00000000, + POSTED_INTR_NV = 0x00000002, >> GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR >> = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, >> @@ -209,6 +211,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH >> = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, >> APIC_ACCESS_ADDR_HIGH = 0x00002015, >> + POSTED_INTR_DESC_ADDR = 0x00002016, >> + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, >> EPT_POINTER = 0x0000201a, >> EPT_POINTER_HIGH = 0x0000201b, >> EOI_EXIT_BITMAP0 = 0x0000201c, >> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S >> index 70641af..c6c47a3 100644 >> --- a/arch/x86/kernel/entry_64.S >> +++ b/arch/x86/kernel/entry_64.S >> @@ -1177,6 +1177,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \ >> apicinterrupt X86_PLATFORM_IPI_VECTOR \ >> x86_platform_ipi smp_x86_platform_ipi >> +#ifdef CONFIG_HAVE_KVM >> +apicinterrupt POSTED_INTR_VECTOR \ >> + posted_intr_ipi smp_posted_intr_ipi >> +#endif >> + >> apicinterrupt THRESHOLD_APIC_VECTOR \ >> threshold_interrupt smp_threshold_interrupt >> apicinterrupt THERMAL_APIC_VECTOR \ >> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c >> index e4595f1..3551cf2 100644 >> --- a/arch/x86/kernel/irq.c >> +++ b/arch/x86/kernel/irq.c >> @@ -228,6 +228,25 @@ void smp_x86_platform_ipi(struct pt_regs *regs) >> set_irq_regs(old_regs); >> } >> +/* + * Handler for POSTED_INTERRUPT_VECTOR. + */ #ifdef >> CONFIG_HAVE_KVM +void smp_posted_intr_ipi(struct pt_regs *regs) +{ >> + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + >> + irq_enter(); + + exit_idle(); + + irq_exit(); + >> + set_irq_regs(old_regs); +} + + > One blank line is enough. 
> >> EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); >> >> #ifdef CONFIG_HOTPLUG_CPU >> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c >> index 6e03b0d..f90c5ae 100644 >> --- a/arch/x86/kernel/irqinit.c >> +++ b/arch/x86/kernel/irqinit.c >> @@ -205,6 +205,10 @@ static void __init apic_intr_init(void) >> >> /* IPI for X86 platform specific use */ >> alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); >> +#ifdef CONFIG_HAVE_KVM >> + /* IPI for posted interrupt use */ >> + alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi); >> +#endif >> >> /* IPI vectors for APIC spurious and error interrupts */ >> alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >> index 02b51dd..df6b6a3 100644 >> --- a/arch/x86/kvm/lapic.c >> +++ b/arch/x86/kvm/lapic.c >> @@ -379,6 +379,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic > *apic) >> if (!apic->irr_pending) >> return -1; >> + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); >> result = apic_search_irr(apic); >> ASSERT(result == -1 || result >= 16); >> @@ -685,6 +686,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int > delivery_mode, >> { >> int result = 0; >> struct kvm_vcpu *vcpu = apic->vcpu; >> + bool send = false; >> >> switch (delivery_mode) { >> case APIC_DM_LOWEST: >> @@ -700,7 +702,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int > delivery_mode, >> } else >> apic_clear_vector(vector, apic->regs + APIC_TMR); >> - result = !apic_test_and_set_irr(vector, apic); >> + if (kvm_x86_ops->vm_has_apicv(vcpu->kvm)) > Just call send_notification_event() and do the check inside. And call it > deliver_posted_interrupt() or something. It does more than just sends > notification event. Actually it may not send it at all. The code logic is different w/ or w/o apicv. So even put the check inside callee, we still need check it in caller. I think current solution is more clear. 
>> + send = kvm_x86_ops->send_notification_event(vcpu, >> + vector, &result); >> + else >> + result = !apic_test_and_set_irr(vector, apic); >> + >> trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, >> trig_mode, vector, !result); >> if (!result) { >> @@ -710,8 +717,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int > delivery_mode, >> break; >> } >> - kvm_make_request(KVM_REQ_EVENT, vcpu); >> - kvm_vcpu_kick(vcpu); >> + if (!send) { >> + kvm_make_request(KVM_REQ_EVENT, vcpu); >> + kvm_vcpu_kick(vcpu); >> + } >> break; >> >> case APIC_DM_REMRD: >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h >> index 1676d34..632111f 100644 >> --- a/arch/x86/kvm/lapic.h >> +++ b/arch/x86/kvm/lapic.h >> @@ -46,6 +46,7 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); >> void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); >> u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); >> void kvm_apic_set_version(struct kvm_vcpu *vcpu); >> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir); >> >> int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); >> int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >> index a7d60d7..37f961d 100644 >> --- a/arch/x86/kvm/svm.c >> +++ b/arch/x86/kvm/svm.c >> @@ -3591,6 +3591,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, > int isr) >> return; >> } >> +static bool svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) >> +{ >> + return false; >> +} >> + >> static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm >> *svm = to_svm(vcpu); @@ -4319,6 +4324,7 @@ static struct kvm_x86_ops >> svm_x86_ops = { .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap >> = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, >> + .sync_pir_to_irr = svm_sync_pir_to_irr, >> >> .set_tss_addr = svm_set_tss_addr, >> .get_tdp_level = get_npt_level, >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index e826d29..d2b02f2 
100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -84,8 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); >> static bool __read_mostly fasteoi = 1; >> module_param(fasteoi, bool, S_IRUGO); >> -static bool __read_mostly enable_apicv_reg_vid = 1; >> -module_param(enable_apicv_reg_vid, bool, S_IRUGO); >> +static bool __read_mostly enable_apicv = 1; >> +module_param(enable_apicv, bool, S_IRUGO); >> >> /* >> * If nested=1, nested virtualization is supported, i.e., guests may use >> @@ -370,6 +370,41 @@ struct nested_vmx { >> struct page *apic_access_page; >> }; >> +#define POSTED_INTR_ON 0 >> +/* Posted-Interrupt Descriptor */ >> +struct pi_desc { >> + u32 pir[8]; /* Posted interrupt requested */ >> + union { >> + struct { >> + u8 on:1, >> + rsvd:7; >> + } control; >> + u32 rsvd[8]; >> + } u; >> +} __aligned(64); >> + >> +static bool pi_test_on(struct pi_desc *pi_desc) >> +{ >> + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); >> +} >> + >> +static bool pi_test_and_set_on(struct pi_desc *pi_desc) >> +{ >> + return test_and_set_bit(POSTED_INTR_ON, >> + (unsigned long *)&pi_desc->u.control); >> +} >> + >> +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) >> +{ >> + return test_and_clear_bit(POSTED_INTR_ON, >> + (unsigned long *)&pi_desc->u.control); >> +} >> + >> +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) >> +{ >> + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); >> +} >> + >> struct vcpu_vmx { >> struct kvm_vcpu vcpu; >> unsigned long host_rsp; >> @@ -434,6 +469,9 @@ struct vcpu_vmx { >> >> bool rdtscp_enabled; >> + /* Posted interrupt descriptor */ >> + struct pi_desc *pi; >> + > You haven't answered on my previous review why are you trying save 46 > bytes here. Sorry. I cannot get your point. It's just a pointer and only takes 8 bytes. 
>> /* Support for a guest hypervisor (nested VMX) */ >> struct nested_vmx nested; >> }; >> @@ -788,6 +826,18 @@ static inline bool > cpu_has_vmx_virtual_intr_delivery(void) >> SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; >> } >> +static inline bool cpu_has_vmx_posted_intr(void) >> +{ >> + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; >> +} >> + >> +static inline bool cpu_has_vmx_apicv(void) >> +{ >> + return cpu_has_vmx_apic_register_virt() && >> + cpu_has_vmx_virtual_intr_delivery() && >> + cpu_has_vmx_posted_intr(); >> +} >> + >> static inline bool cpu_has_vmx_flexpriority(void) >> { >> return cpu_has_vmx_tpr_shadow() && >> @@ -2535,12 +2585,6 @@ static __init int setup_vmcs_config(struct > vmcs_config *vmcs_conf) >> u32 _vmexit_control = 0; >> u32 _vmentry_control = 0; >> - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; >> - opt = PIN_BASED_VIRTUAL_NMIS; >> - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, >> - &_pin_based_exec_control) < 0) >> - return -EIO; >> - >> min = CPU_BASED_HLT_EXITING | >> #ifdef CONFIG_X86_64 >> CPU_BASED_CR8_LOAD_EXITING | >> @@ -2617,6 +2661,17 @@ static __init int setup_vmcs_config(struct > vmcs_config *vmcs_conf) >> &_vmexit_control) < 0) >> return -EIO; >> + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; >> + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; >> + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, >> + &_pin_based_exec_control) < 0) >> + return -EIO; >> + >> + if (!(_cpu_based_2nd_exec_control & >> + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || >> + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) >> + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; >> + >> min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, >> opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2795,11 +2850,10 @@ static __init >> int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; >> - if (!cpu_has_vmx_apic_register_virt() || >> - !cpu_has_vmx_virtual_intr_delivery()) >> - 
enable_apicv_reg_vid = 0; >> + if (!cpu_has_vmx_apicv()) >> + enable_apicv = 0; >> >> - if (enable_apicv_reg_vid) >> + if (enable_apicv) >> kvm_x86_ops->update_cr8_intercept = NULL; >> else >> kvm_x86_ops->hwapic_irr_update = NULL; >> @@ -3868,6 +3922,61 @@ static void > vmx_disable_intercept_msr_write_x2apic(u32 msr) >> msr, MSR_TYPE_W); >> } >> +static int vmx_vm_has_apicv(struct kvm *kvm) >> +{ >> + return enable_apicv && irqchip_in_kernel(kvm); >> +} >> + >> +static bool vmx_send_notification_event(struct kvm_vcpu *vcpu, >> + int vector, int *result) >> +{ >> + struct vcpu_vmx *vmx = to_vmx(vcpu); >> + >> + *result = !pi_test_and_set_pir(vector, vmx->pi); > The problem here is that interrupt may still be pending in IRR so > eventually it will be coalesced, but we report it as delivered here. I > do not see solution for this yet. Yes, it's true and it may result in the interrupt losing. But even in real hardware, an interrupt also will lost in some cases: for example, cpu doesn't turn on irq in time or there is a high priority interrupt pending in IRR. And since there already an interrupt pending in IRR, so the interrupt still will be handled. >> + if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) { >> + kvm_make_request(KVM_REQ_PENDING_PIR, vcpu); > Why not set KVM_REQ_EVENT here? What this intermediate event is needed > for? see answer in below. >> + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), >> + POSTED_INTR_VECTOR); >> + if (!pi_test_on(vmx->pi)) > Isn't it too optimistic of you to expect IPI to be delivered and > processed by remote CPU by this point? I have collected some data in my box and it shows about 5 percent of the posted interrupt will be handled when calling this check. How about add a unlikely() here? Also it means 5% of check the request is unnecessary. And check KVM_REQ_EVENT is more costly, so I use a more light request to do it. 
>> + clear_bit(KVM_REQ_PENDING_PIR, &vcpu->requests) ; >> + return true; >> + } >> + return false; >> +} >> + >> +static bool vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) >> +{ >> + struct vcpu_vmx *vmx = to_vmx(vcpu); >> + struct kvm_lapic *apic = vcpu->arch.apic; >> + unsigned int i, old, new, ret_val, irr_offset, pir_val; >> + bool make_request = false; >> + >> + if (!vmx_vm_has_apicv(vcpu->kvm) || !pi_test_and_clear_on(vmx->pi)) >> + return false; >> + >> + for (i = 0; i <= 7; i++) { >> + pir_val = xchg(&vmx->pi->pir[i], 0); >> + if (pir_val) { >> + irr_offset = APIC_IRR + i * 0x10; >> + do { >> + old = kvm_apic_get_reg(apic, irr_offset); >> + new = old | pir_val; >> + ret_val = cmpxchg((u32 *)(apic->regs + >> + irr_offset), old, new); >> + } while (unlikely(ret_val != old)); >> + make_request = true; >> + } >> + } >> + >> + return make_request; >> +} >> + >> +static void free_pi(struct vcpu_vmx *vmx) >> +{ >> + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) >> + kfree(vmx->pi); >> +} >> + >> /* >> * Set up the vmcs's constant host-state fields, i.e., host-state fields that >> * will not change in the lifetime of the guest. 
>> @@ -3928,6 +4037,15 @@ static void set_cr4_guest_host_mask(struct > vcpu_vmx *vmx) >> vmcs_writel(CR4_GUEST_HOST_MASK, >> ~vmx->vcpu.arch.cr4_guest_owned_bits); } >> +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) >> +{ >> + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; >> + >> + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) >> + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; >> + return pin_based_exec_ctrl; >> +} >> + >> static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control >> = vmcs_config.cpu_based_exec_ctrl; @@ -3945,11 +4063,6 @@ static u32 >> vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } >> -static int vmx_vm_has_apicv(struct kvm *kvm) >> -{ >> - return enable_apicv_reg_vid && irqchip_in_kernel(kvm); >> -} >> - >> static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 >> exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -4005,8 +4118,7 >> @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) >> vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ >> >> /* Control */ >> - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, >> - vmcs_config.pin_based_exec_ctrl); >> + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); >> >> vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, > vmx_exec_control(vmx)); >> >> @@ -4015,13 +4127,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) >> vmx_secondary_exec_control(vmx)); >> } >> - if (enable_apicv_reg_vid) { >> + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { >> vmcs_write64(EOI_EXIT_BITMAP0, 0); >> vmcs_write64(EOI_EXIT_BITMAP1, 0); >> vmcs_write64(EOI_EXIT_BITMAP2, 0); >> vmcs_write64(EOI_EXIT_BITMAP3, 0); >> >> vmcs_write16(GUEST_INTR_STATUS, 0); >> + >> + vmx->pi = kzalloc(sizeof(struct pi_desc), GFP_KERNEL); >> + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); >> + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi))); >> } >> >> if (ple_gap) { @@ -4171,6 +4287,9 @@ static int vmx_vcpu_reset(struct >> kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, >> 
page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); >> + if (vmx_vm_has_apicv(vcpu->kvm)) >> + memset(vmx->pi, 0, sizeof(struct pi_desc)); >> + >> if (vmx->vpid != 0) >> vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); >> @@ -6746,6 +6865,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) >> >> free_vpid(vmx); free_nested(vmx); + free_pi(vmx); >> free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); >> kvm_vcpu_uninit(vcpu); @@ -7647,6 +7767,8 @@ static struct >> kvm_x86_ops vmx_x86_ops = { .load_eoi_exitmap = vmx_load_eoi_exitmap, >> .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = >> vmx_hwapic_isr_update, >> + .sync_pir_to_irr = vmx_sync_pir_to_irr, >> + .send_notification_event = vmx_send_notification_event, >> >> .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ >> -7750,7 +7872,7 @@ static int __init vmx_init(void) >> memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, >> PAGE_SIZE); >> - if (enable_apicv_reg_vid) { >> + if (enable_apicv) { >> for (msr = 0x800; msr <= 0x8ff; msr++) >> vmx_disable_intercept_msr_read_x2apic(msr); >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index 9f25d70..6e1e6e7 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -2681,6 +2681,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) >> static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, >> struct kvm_lapic_state *s) { + kvm_x86_ops->sync_pir_to_irr(vcpu); >> memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); >> >> return 0; @@ -5698,6 +5699,9 @@ static int vcpu_enter_guest(struct >> kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); if >> (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) >> update_eoi_exitmap(vcpu); >> + if (kvm_check_request(KVM_REQ_PENDING_PIR, vcpu)) >> + if (kvm_x86_ops->sync_pir_to_irr(vcpu)) >> + kvm_make_request(KVM_REQ_EVENT, vcpu); >> } >> >> if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >> index 
0350e0d..a410819 100644 >> --- a/include/linux/kvm_host.h >> +++ b/include/linux/kvm_host.h >> @@ -124,6 +124,7 @@ static inline bool is_error_page(struct page *page) >> #define KVM_REQ_MCLOCK_INPROGRESS 20 >> #define KVM_REQ_EPR_EXIT 21 >> #define KVM_REQ_EOIBITMAP 22 >> +#define KVM_REQ_PENDING_PIR 23 >> >> #define KVM_USERSPACE_IRQ_SOURCE_ID 0 >> #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 >> -- >> 1.7.1 > > -- > Gleb. Best regards, Yang -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 05, 2013 at 03:13:52AM +0000, Zhang, Yang Z wrote: > Gleb Natapov wrote on 2013-02-04: > > On Mon, Feb 04, 2013 at 05:05:14PM +0800, Yang Zhang wrote: > >> From: Yang Zhang <yang.z.zhang@Intel.com> > >> > >> Posted Interrupt allows APIC interrupts to inject into guest directly > >> without any vmexit. > >> > >> - When delivering a interrupt to guest, if target vcpu is running, > >> update Posted-interrupt requests bitmap and send a notification event > >> to the vcpu. Then the vcpu will handle this interrupt automatically, > >> without any software involvemnt. > >> - If target vcpu is not running or there already a notification event > >> pending in the vcpu, do nothing. The interrupt will be handled by > >> next vm entry > >> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com> > >> --- > >> arch/x86/include/asm/entry_arch.h | 1 + > >> arch/x86/include/asm/hw_irq.h | 1 + > >> arch/x86/include/asm/irq_vectors.h | 4 + > >> arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/asm/vmx.h > >> | 4 + arch/x86/kernel/entry_64.S | 5 + > >> arch/x86/kernel/irq.c | 19 ++++ > >> arch/x86/kernel/irqinit.c | 4 + arch/x86/kvm/lapic.c > >> | 15 +++- arch/x86/kvm/lapic.h | 1 + > >> arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c > >> | 164 +++++++++++++++++++++++++++++++----- > >> arch/x86/kvm/x86.c | 4 + include/linux/kvm_host.h > >> | 1 + 14 files changed, 208 insertions(+), 24 deletions(-) > >> diff --git a/arch/x86/include/asm/entry_arch.h > >> b/arch/x86/include/asm/entry_arch.h index 40afa00..7b0a29e 100644 --- > >> a/arch/x86/include/asm/entry_arch.h +++ > >> b/arch/x86/include/asm/entry_arch.h @@ -18,6 +18,7 @@ > >> BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) > >> #endif > >> > >> BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) > >> +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR) > > Missing CONFIG_HAVE_KVM ifdef. Have you verified that this patch > > compiles with KVM support disabled? Also give it a name that will > > associate it with KVM. 
> Yes, but seems it is selected by x86 by default. And it always enabled when building kernel. > I will remove the select in Kconfig and try again. > It is very probably true, but still we want it to be de-selectable some day. Juts add ifdefs where they are missing. > >> > >> /* > >> * every pentium local APIC has two 'local interrupts', with a > >> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h > >> index eb92a6e..ee61af3 100644 > >> --- a/arch/x86/include/asm/hw_irq.h > >> +++ b/arch/x86/include/asm/hw_irq.h > >> @@ -28,6 +28,7 @@ > >> /* Interrupt handlers registered during init_IRQ */ extern void > >> apic_timer_interrupt(void); extern void x86_platform_ipi(void); > >> +extern void posted_intr_ipi(void); extern void error_interrupt(void); > >> extern void irq_work_interrupt(void); > >> diff --git a/arch/x86/include/asm/irq_vectors.h > >> b/arch/x86/include/asm/irq_vectors.h index 1508e51..6421a63 100644 --- > >> a/arch/x86/include/asm/irq_vectors.h +++ > >> b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,10 @@ > >> */ > >> #define X86_PLATFORM_IPI_VECTOR 0xf7 > >> +#ifdef CONFIG_HAVE_KVM > >> +#define POSTED_INTR_VECTOR 0xf2 > >> +#endif > >> + > >> /* > >> * IRQ work vector: > >> */ > >> diff --git a/arch/x86/include/asm/kvm_host.h > >> b/arch/x86/include/asm/kvm_host.h index b8388e9..bab1c0a 100644 --- > >> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h > >> @@ -704,6 +704,9 @@ struct kvm_x86_ops { > >> void (*hwapic_isr_update)(struct kvm *kvm, int isr); > >> void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); > >> void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); > >> + bool (*send_notification_event)(struct kvm_vcpu *vcpu, > >> + int vector, int *result); > >> + bool (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); > >> int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); > >> int (*get_tdp_level)(void); > >> u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
is_mmio); > >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > >> index 694586c..f5ec72c 100644 > >> --- a/arch/x86/include/asm/vmx.h > >> +++ b/arch/x86/include/asm/vmx.h > >> @@ -153,6 +153,7 @@ > >> #define PIN_BASED_EXT_INTR_MASK 0x00000001 > >> #define PIN_BASED_NMI_EXITING 0x00000008 > >> #define PIN_BASED_VIRTUAL_NMIS 0x00000020 > >> +#define PIN_BASED_POSTED_INTR 0x00000080 > >> > >> #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define > >> VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 @@ -175,6 +176,7 @@ > >> /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID > >> = 0x00000000, + POSTED_INTR_NV = 0x00000002, > >> GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR > >> = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, > >> @@ -209,6 +211,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH > >> = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, > >> APIC_ACCESS_ADDR_HIGH = 0x00002015, > >> + POSTED_INTR_DESC_ADDR = 0x00002016, > >> + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, > >> EPT_POINTER = 0x0000201a, > >> EPT_POINTER_HIGH = 0x0000201b, > >> EOI_EXIT_BITMAP0 = 0x0000201c, > >> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S > >> index 70641af..c6c47a3 100644 > >> --- a/arch/x86/kernel/entry_64.S > >> +++ b/arch/x86/kernel/entry_64.S > >> @@ -1177,6 +1177,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \ > >> apicinterrupt X86_PLATFORM_IPI_VECTOR \ > >> x86_platform_ipi smp_x86_platform_ipi > >> +#ifdef CONFIG_HAVE_KVM > >> +apicinterrupt POSTED_INTR_VECTOR \ > >> + posted_intr_ipi smp_posted_intr_ipi > >> +#endif > >> + > >> apicinterrupt THRESHOLD_APIC_VECTOR \ > >> threshold_interrupt smp_threshold_interrupt > >> apicinterrupt THERMAL_APIC_VECTOR \ > >> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c > >> index e4595f1..3551cf2 100644 > >> --- a/arch/x86/kernel/irq.c > >> +++ b/arch/x86/kernel/irq.c > >> @@ -228,6 +228,25 @@ void smp_x86_platform_ipi(struct pt_regs *regs) > >> set_irq_regs(old_regs); > >> } > >> 
+/* + * Handler for POSTED_INTERRUPT_VECTOR. + */ #ifdef > >> CONFIG_HAVE_KVM +void smp_posted_intr_ipi(struct pt_regs *regs) +{ > >> + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + > >> + irq_enter(); + + exit_idle(); + + irq_exit(); + > >> + set_irq_regs(old_regs); +} + + > > One blank line is enough. > > > >> EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); > >> > >> #ifdef CONFIG_HOTPLUG_CPU > >> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c > >> index 6e03b0d..f90c5ae 100644 > >> --- a/arch/x86/kernel/irqinit.c > >> +++ b/arch/x86/kernel/irqinit.c > >> @@ -205,6 +205,10 @@ static void __init apic_intr_init(void) > >> > >> /* IPI for X86 platform specific use */ > >> alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); > >> +#ifdef CONFIG_HAVE_KVM > >> + /* IPI for posted interrupt use */ > >> + alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi); > >> +#endif > >> > >> /* IPI vectors for APIC spurious and error interrupts */ > >> alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); > >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > >> index 02b51dd..df6b6a3 100644 > >> --- a/arch/x86/kvm/lapic.c > >> +++ b/arch/x86/kvm/lapic.c > >> @@ -379,6 +379,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic > > *apic) > >> if (!apic->irr_pending) > >> return -1; > >> + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); > >> result = apic_search_irr(apic); > >> ASSERT(result == -1 || result >= 16); > >> @@ -685,6 +686,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int > > delivery_mode, > >> { > >> int result = 0; > >> struct kvm_vcpu *vcpu = apic->vcpu; > >> + bool send = false; > >> > >> switch (delivery_mode) { > >> case APIC_DM_LOWEST: > >> @@ -700,7 +702,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int > > delivery_mode, > >> } else > >> apic_clear_vector(vector, apic->regs + APIC_TMR); > >> - result = !apic_test_and_set_irr(vector, apic); > >> + if 
(kvm_x86_ops->vm_has_apicv(vcpu->kvm)) > > Just call send_notification_event() and do the check inside. And call it > > deliver_posted_interrupt() or something. It does more than just sends > > notification event. Actually it may not send it at all. > The code logic is different w/ or w/o apicv. So even put the check inside callee, we still need check it in caller. I think current solution is more clear. > It is not. It also requires two callbacks in the very fast path in case of pir enabled. > >> + send = kvm_x86_ops->send_notification_event(vcpu, > >> + vector, &result); > >> + else > >> + result = !apic_test_and_set_irr(vector, apic); > >> + > >> trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, > >> trig_mode, vector, !result); > >> if (!result) { > >> @@ -710,8 +717,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int > > delivery_mode, > >> break; > >> } > >> - kvm_make_request(KVM_REQ_EVENT, vcpu); > >> - kvm_vcpu_kick(vcpu); > >> + if (!send) { > >> + kvm_make_request(KVM_REQ_EVENT, vcpu); > >> + kvm_vcpu_kick(vcpu); > >> + } > >> break; > >> > >> case APIC_DM_REMRD: > >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > >> index 1676d34..632111f 100644 > >> --- a/arch/x86/kvm/lapic.h > >> +++ b/arch/x86/kvm/lapic.h > >> @@ -46,6 +46,7 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); > >> void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); > >> u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); > >> void kvm_apic_set_version(struct kvm_vcpu *vcpu); > >> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir); > >> > >> int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); > >> int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); > >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > >> index a7d60d7..37f961d 100644 > >> --- a/arch/x86/kvm/svm.c > >> +++ b/arch/x86/kvm/svm.c > >> @@ -3591,6 +3591,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, > > int isr) > >> return; > >> } > 
>> +static bool svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) > >> +{ > >> + return false; > >> +} > >> + > >> static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm > >> *svm = to_svm(vcpu); @@ -4319,6 +4324,7 @@ static struct kvm_x86_ops > >> svm_x86_ops = { .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap > >> = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, > >> + .sync_pir_to_irr = svm_sync_pir_to_irr, > >> > >> .set_tss_addr = svm_set_tss_addr, > >> .get_tdp_level = get_npt_level, > >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > >> index e826d29..d2b02f2 100644 > >> --- a/arch/x86/kvm/vmx.c > >> +++ b/arch/x86/kvm/vmx.c > >> @@ -84,8 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); > >> static bool __read_mostly fasteoi = 1; > >> module_param(fasteoi, bool, S_IRUGO); > >> -static bool __read_mostly enable_apicv_reg_vid = 1; > >> -module_param(enable_apicv_reg_vid, bool, S_IRUGO); > >> +static bool __read_mostly enable_apicv = 1; > >> +module_param(enable_apicv, bool, S_IRUGO); > >> > >> /* > >> * If nested=1, nested virtualization is supported, i.e., guests may use > >> @@ -370,6 +370,41 @@ struct nested_vmx { > >> struct page *apic_access_page; > >> }; > >> +#define POSTED_INTR_ON 0 > >> +/* Posted-Interrupt Descriptor */ > >> +struct pi_desc { > >> + u32 pir[8]; /* Posted interrupt requested */ > >> + union { > >> + struct { > >> + u8 on:1, > >> + rsvd:7; > >> + } control; > >> + u32 rsvd[8]; > >> + } u; > >> +} __aligned(64); > >> + > >> +static bool pi_test_on(struct pi_desc *pi_desc) > >> +{ > >> + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); > >> +} > >> + > >> +static bool pi_test_and_set_on(struct pi_desc *pi_desc) > >> +{ > >> + return test_and_set_bit(POSTED_INTR_ON, > >> + (unsigned long *)&pi_desc->u.control); > >> +} > >> + > >> +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) > >> +{ > >> + return test_and_clear_bit(POSTED_INTR_ON, > >> + (unsigned long 
*)&pi_desc->u.control); > >> +} > >> + > >> +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) > >> +{ > >> + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); > >> +} > >> + > >> struct vcpu_vmx { > >> struct kvm_vcpu vcpu; > >> unsigned long host_rsp; > >> @@ -434,6 +469,9 @@ struct vcpu_vmx { > >> > >> bool rdtscp_enabled; > >> + /* Posted interrupt descriptor */ > >> + struct pi_desc *pi; > >> + > > You haven't answered on my previous review why are you trying save 46 > > bytes here. > Sorry. I cannot get your point. It's just a pointer and only takes 8 bytes. And embedded structure will take 64 bytes, so by allocating it dynamically you are trying to save 46 bytes for !pir case per vcpu. Just embed pi_desc here. > > >> /* Support for a guest hypervisor (nested VMX) */ > >> struct nested_vmx nested; > >> }; > >> @@ -788,6 +826,18 @@ static inline bool > > cpu_has_vmx_virtual_intr_delivery(void) > >> SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; > >> } > >> +static inline bool cpu_has_vmx_posted_intr(void) > >> +{ > >> + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; > >> +} > >> + > >> +static inline bool cpu_has_vmx_apicv(void) > >> +{ > >> + return cpu_has_vmx_apic_register_virt() && > >> + cpu_has_vmx_virtual_intr_delivery() && > >> + cpu_has_vmx_posted_intr(); > >> +} > >> + > >> static inline bool cpu_has_vmx_flexpriority(void) > >> { > >> return cpu_has_vmx_tpr_shadow() && > >> @@ -2535,12 +2585,6 @@ static __init int setup_vmcs_config(struct > > vmcs_config *vmcs_conf) > >> u32 _vmexit_control = 0; > >> u32 _vmentry_control = 0; > >> - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; > >> - opt = PIN_BASED_VIRTUAL_NMIS; > >> - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, > >> - &_pin_based_exec_control) < 0) > >> - return -EIO; > >> - > >> min = CPU_BASED_HLT_EXITING | > >> #ifdef CONFIG_X86_64 > >> CPU_BASED_CR8_LOAD_EXITING | > >> @@ -2617,6 +2661,17 @@ static __init int 
setup_vmcs_config(struct > > vmcs_config *vmcs_conf) > >> &_vmexit_control) < 0) > >> return -EIO; > >> + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; > >> + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; > >> + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, > >> + &_pin_based_exec_control) < 0) > >> + return -EIO; > >> + > >> + if (!(_cpu_based_2nd_exec_control & > >> + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || > >> + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) > >> + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; > >> + > >> min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, > >> opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2795,11 +2850,10 @@ static __init > >> int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; > >> - if (!cpu_has_vmx_apic_register_virt() || > >> - !cpu_has_vmx_virtual_intr_delivery()) > >> - enable_apicv_reg_vid = 0; > >> + if (!cpu_has_vmx_apicv()) > >> + enable_apicv = 0; > >> > >> - if (enable_apicv_reg_vid) > >> + if (enable_apicv) > >> kvm_x86_ops->update_cr8_intercept = NULL; > >> else > >> kvm_x86_ops->hwapic_irr_update = NULL; > >> @@ -3868,6 +3922,61 @@ static void > > vmx_disable_intercept_msr_write_x2apic(u32 msr) > >> msr, MSR_TYPE_W); > >> } > >> +static int vmx_vm_has_apicv(struct kvm *kvm) > >> +{ > >> + return enable_apicv && irqchip_in_kernel(kvm); > >> +} > >> + > >> +static bool vmx_send_notification_event(struct kvm_vcpu *vcpu, > >> + int vector, int *result) > >> +{ > >> + struct vcpu_vmx *vmx = to_vmx(vcpu); > >> + > >> + *result = !pi_test_and_set_pir(vector, vmx->pi); > > The problem here is that interrupt may still be pending in IRR so > > eventually it will be coalesced, but we report it as delivered here. I > > do not see solution for this yet. > Yes, it's true and it may result in the interrupt losing. 
But even in real hardware, an interrupt also will lost in some cases: for example, cpu doesn't turn on irq in time or there is a high priority interrupt pending in IRR. > And since there already an interrupt pending in IRR, so the interrupt still will be handled. > On the real HW we do not need to do interrupt de-coalescing, so the point of the code here is exactly to not be like real HW. This is why "real HW" justification, which is perfectly valid usually, does not work in this case. In ideal world we wouldn't be even tracking "result" here. > >> + if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) { > >> + kvm_make_request(KVM_REQ_PENDING_PIR, vcpu); > > Why not set KVM_REQ_EVENT here? What this intermediate event is needed > > for? > see answer in below. > > >> + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), > >> + POSTED_INTR_VECTOR); > >> + if (!pi_test_on(vmx->pi)) > > Isn't it too optimistic of you to expect IPI to be delivered and > > processed by remote CPU by this point? > I have collected some data in my box and it shows about 5 percent of the posted interrupt will be handled when calling this check. How about add a unlikely() here? > Also it means 5% of check the request is unnecessary. And check KVM_REQ_EVENT is more costly, so I use a more light request to do it. > This is premature optimization. Drop it. If later you can back up it with impressive number it can be re-added. 
> >> + clear_bit(KVM_REQ_PENDING_PIR, &vcpu->requests) ; > >> + return true; > >> + } > >> + return false; > >> +} > >> + > >> +static bool vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) > >> +{ > >> + struct vcpu_vmx *vmx = to_vmx(vcpu); > >> + struct kvm_lapic *apic = vcpu->arch.apic; > >> + unsigned int i, old, new, ret_val, irr_offset, pir_val; > >> + bool make_request = false; > >> + > >> + if (!vmx_vm_has_apicv(vcpu->kvm) || !pi_test_and_clear_on(vmx->pi)) > >> + return false; > >> + > >> + for (i = 0; i <= 7; i++) { > >> + pir_val = xchg(&vmx->pi->pir[i], 0); > >> + if (pir_val) { > >> + irr_offset = APIC_IRR + i * 0x10; > >> + do { > >> + old = kvm_apic_get_reg(apic, irr_offset); > >> + new = old | pir_val; > >> + ret_val = cmpxchg((u32 *)(apic->regs + > >> + irr_offset), old, new); > >> + } while (unlikely(ret_val != old)); > >> + make_request = true; > >> + } > >> + } > >> + > >> + return make_request; > >> +} > >> + > >> +static void free_pi(struct vcpu_vmx *vmx) > >> +{ > >> + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) > >> + kfree(vmx->pi); > >> +} > >> + > >> /* > >> * Set up the vmcs's constant host-state fields, i.e., host-state fields that > >> * will not change in the lifetime of the guest. 
> >> @@ -3928,6 +4037,15 @@ static void set_cr4_guest_host_mask(struct > > vcpu_vmx *vmx) > >> vmcs_writel(CR4_GUEST_HOST_MASK, > >> ~vmx->vcpu.arch.cr4_guest_owned_bits); } > >> +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) > >> +{ > >> + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; > >> + > >> + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) > >> + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; > >> + return pin_based_exec_ctrl; > >> +} > >> + > >> static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control > >> = vmcs_config.cpu_based_exec_ctrl; @@ -3945,11 +4063,6 @@ static u32 > >> vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } > >> -static int vmx_vm_has_apicv(struct kvm *kvm) > >> -{ > >> - return enable_apicv_reg_vid && irqchip_in_kernel(kvm); > >> -} > >> - > >> static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 > >> exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -4005,8 +4118,7 > >> @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) > >> vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ > >> > >> /* Control */ > >> - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, > >> - vmcs_config.pin_based_exec_ctrl); > >> + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); > >> > >> vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, > > vmx_exec_control(vmx)); > >> > >> @@ -4015,13 +4127,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) > >> vmx_secondary_exec_control(vmx)); > >> } > >> - if (enable_apicv_reg_vid) { > >> + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { > >> vmcs_write64(EOI_EXIT_BITMAP0, 0); > >> vmcs_write64(EOI_EXIT_BITMAP1, 0); > >> vmcs_write64(EOI_EXIT_BITMAP2, 0); > >> vmcs_write64(EOI_EXIT_BITMAP3, 0); > >> > >> vmcs_write16(GUEST_INTR_STATUS, 0); > >> + > >> + vmx->pi = kzalloc(sizeof(struct pi_desc), GFP_KERNEL); > >> + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); > >> + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi))); > >> } > >> > >> if (ple_gap) { @@ -4171,6 
+4287,9 @@ static int vmx_vcpu_reset(struct > >> kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, > >> page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); > >> + if (vmx_vm_has_apicv(vcpu->kvm)) > >> + memset(vmx->pi, 0, sizeof(struct pi_desc)); > >> + > >> if (vmx->vpid != 0) > >> vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); > >> @@ -6746,6 +6865,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) > >> > >> free_vpid(vmx); free_nested(vmx); + free_pi(vmx); > >> free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); > >> kvm_vcpu_uninit(vcpu); @@ -7647,6 +7767,8 @@ static struct > >> kvm_x86_ops vmx_x86_ops = { .load_eoi_exitmap = vmx_load_eoi_exitmap, > >> .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = > >> vmx_hwapic_isr_update, > >> + .sync_pir_to_irr = vmx_sync_pir_to_irr, > >> + .send_notification_event = vmx_send_notification_event, > >> > >> .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ > >> -7750,7 +7872,7 @@ static int __init vmx_init(void) > >> memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, > >> PAGE_SIZE); > >> - if (enable_apicv_reg_vid) { > >> + if (enable_apicv) { > >> for (msr = 0x800; msr <= 0x8ff; msr++) > >> vmx_disable_intercept_msr_read_x2apic(msr); > >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >> index 9f25d70..6e1e6e7 100644 > >> --- a/arch/x86/kvm/x86.c > >> +++ b/arch/x86/kvm/x86.c > >> @@ -2681,6 +2681,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) > >> static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, > >> struct kvm_lapic_state *s) { + kvm_x86_ops->sync_pir_to_irr(vcpu); > >> memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); > >> > >> return 0; @@ -5698,6 +5699,9 @@ static int vcpu_enter_guest(struct > >> kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); if > >> (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) > >> update_eoi_exitmap(vcpu); > >> + if (kvm_check_request(KVM_REQ_PENDING_PIR, vcpu)) > >> + if (kvm_x86_ops->sync_pir_to_irr(vcpu)) > >> + 
kvm_make_request(KVM_REQ_EVENT, vcpu); > >> } > >> > >> if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { > >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > >> index 0350e0d..a410819 100644 > >> --- a/include/linux/kvm_host.h > >> +++ b/include/linux/kvm_host.h > >> @@ -124,6 +124,7 @@ static inline bool is_error_page(struct page *page) > >> #define KVM_REQ_MCLOCK_INPROGRESS 20 > >> #define KVM_REQ_EPR_EXIT 21 > >> #define KVM_REQ_EOIBITMAP 22 > >> +#define KVM_REQ_PENDING_PIR 23 > >> > >> #define KVM_USERSPACE_IRQ_SOURCE_ID 0 > >> #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 > >> -- > >> 1.7.1 > > > > -- > > Gleb. > > > Best regards, > Yang > -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 05, 2013 at 08:45:01AM +0200, Gleb Natapov wrote: > > >> + /* Posted interrupt descriptor */ > > >> + struct pi_desc *pi; > > >> + > > > You haven't answered on my previous review why are you trying save 46 > > > bytes here. > > Sorry. I cannot get your point. It's just a pointer and only takes 8 bytes. > And embedded structure will take 64 bytes, so by allocating it dynamically > you are trying to save 46 bytes for !pir case per vcpu. Just embed > pi_desc here. > My calculator is broken. It is 56 bytes of course. Still small enough to embed. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 40afa00..7b0a29e 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -18,6 +18,7 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) #endif BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR) /* * every pentium local APIC has two 'local interrupts', with a diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6e..ee61af3 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,6 +28,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); +extern void posted_intr_ipi(void); extern void error_interrupt(void); extern void irq_work_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 1508e51..6421a63 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,10 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * IRQ work vector: */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b8388e9..bab1c0a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -704,6 +704,9 @@ struct kvm_x86_ops { void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + bool (*send_notification_event)(struct kvm_vcpu *vcpu, + int vector, int *result); + bool (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h 
index 694586c..f5ec72c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -153,6 +153,7 @@ #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 +#define PIN_BASED_POSTED_INTR 0x00000080 #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 @@ -175,6 +176,7 @@ /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, + POSTED_INTR_NV = 0x00000002, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -209,6 +211,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, + POSTED_INTR_DESC_ADDR = 0x00002016, + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, EOI_EXIT_BITMAP0 = 0x0000201c, diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 70641af..c6c47a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1177,6 +1177,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi +#ifdef CONFIG_HAVE_KVM +apicinterrupt POSTED_INTR_VECTOR \ + posted_intr_ipi smp_posted_intr_ipi +#endif + apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e4595f1..3551cf2 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -228,6 +228,25 @@ void smp_x86_platform_ipi(struct pt_regs *regs) set_irq_regs(old_regs); } +/* + * Handler for POSTED_INTERRUPT_VECTOR. 
+ */ +void smp_posted_intr_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + irq_enter(); + + exit_idle(); + + irq_exit(); + + set_irq_regs(old_regs); +} + + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6e03b0d..f90c5ae 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -205,6 +205,10 @@ static void __init apic_intr_init(void) /* IPI for X86 platform specific use */ alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); +#ifdef CONFIG_HAVE_KVM + /* IPI for posted interrupt use */ + alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi); +#endif /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 02b51dd..df6b6a3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -379,6 +379,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -685,6 +686,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; + bool send = false; switch (delivery_mode) { case APIC_DM_LOWEST: @@ -700,7 +702,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - result = !apic_test_and_set_irr(vector, apic); + if (kvm_x86_ops->vm_has_apicv(vcpu->kvm)) + send = kvm_x86_ops->send_notification_event(vcpu, + vector, &result); + else + result = !apic_test_and_set_irr(vector, apic); + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); if (!result) { @@ -710,8 +717,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 
break; } - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); + if (!send) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } break; case APIC_DM_REMRD: diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1676d34..632111f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -46,6 +46,7 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a7d60d7..37f961d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3591,6 +3591,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr) return; } +static bool svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + return false; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4319,6 +4324,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, + .sync_pir_to_irr = svm_sync_pir_to_irr, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e826d29..d2b02f2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,8 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -static bool __read_mostly enable_apicv_reg_vid = 1; -module_param(enable_apicv_reg_vid, bool, S_IRUGO); +static bool __read_mostly enable_apicv = 1; +module_param(enable_apicv, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use @@ -370,6 +370,41 @@ 
struct nested_vmx { struct page *apic_access_page; }; +#define POSTED_INTR_ON 0 +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + u8 on:1, + rsvd:7; + } control; + u32 rsvd[8]; + } u; +} __aligned(64); + +static bool pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); +} + +static bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->u.control); +} + +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->u.control); +} + +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -434,6 +469,9 @@ struct vcpu_vmx { bool rdtscp_enabled; + /* Posted interrupt descriptor */ + struct pi_desc *pi; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -788,6 +826,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +static inline bool cpu_has_vmx_posted_intr(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + +static inline bool cpu_has_vmx_apicv(void) +{ + return cpu_has_vmx_apic_register_virt() && + cpu_has_vmx_virtual_intr_delivery() && + cpu_has_vmx_posted_intr(); +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -2535,12 +2585,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmexit_control = 0; u32 _vmentry_control = 0; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - min = CPU_BASED_HLT_EXITING | #ifdef 
CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | @@ -2617,6 +2661,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; + min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2795,11 +2850,10 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; - if (!cpu_has_vmx_apic_register_virt() || - !cpu_has_vmx_virtual_intr_delivery()) - enable_apicv_reg_vid = 0; + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; - if (enable_apicv_reg_vid) + if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; else kvm_x86_ops->hwapic_irr_update = NULL; @@ -3868,6 +3922,61 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } +static int vmx_vm_has_apicv(struct kvm *kvm) +{ + return enable_apicv && irqchip_in_kernel(kvm); +} + +static bool vmx_send_notification_event(struct kvm_vcpu *vcpu, + int vector, int *result) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + *result = !pi_test_and_set_pir(vector, vmx->pi); + if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) { + kvm_make_request(KVM_REQ_PENDING_PIR, vcpu); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + if (!pi_test_on(vmx->pi)) + clear_bit(KVM_REQ_PENDING_PIR, &vcpu->requests); + return true; + } + return false; +} + +static bool vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + unsigned int i, old, new, ret_val, irr_offset, pir_val; + bool make_request =
false; + + if (!vmx_vm_has_apicv(vcpu->kvm) || !pi_test_and_clear_on(vmx->pi)) + return false; + + for (i = 0; i <= 7; i++) { + pir_val = xchg(&vmx->pi->pir[i], 0); + if (pir_val) { + irr_offset = APIC_IRR + i * 0x10; + do { + old = kvm_apic_get_reg(apic, irr_offset); + new = old | pir_val; + ret_val = cmpxchg((u32 *)(apic->regs + + irr_offset), old, new); + } while (unlikely(ret_val != old)); + make_request = true; + } + } + + return make_request; +} + +static void free_pi(struct vcpu_vmx *vmx) +{ + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + kfree(vmx->pi); +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -3928,6 +4037,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +{ + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; + + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + return pin_based_exec_ctrl; +} + static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -3945,11 +4063,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } -static int vmx_vm_has_apicv(struct kvm *kvm) -{ - return enable_apicv_reg_vid && irqchip_in_kernel(kvm); -} - static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -4005,8 +4118,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -4015,13 +4127,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } - if 
(enable_apicv_reg_vid) { + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); vmcs_write64(EOI_EXIT_BITMAP3, 0); vmcs_write16(GUEST_INTR_STATUS, 0); + + vmx->pi = kzalloc(sizeof(struct pi_desc), GFP_KERNEL); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi))); } if (ple_gap) { @@ -4171,6 +4287,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx_vm_has_apicv(vcpu->kvm)) + memset(vmx->pi, 0, sizeof(struct pi_desc)); + if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); @@ -6746,6 +6865,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_vpid(vmx); free_nested(vmx); + free_pi(vmx); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); @@ -7647,6 +7767,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .send_notification_event = vmx_send_notification_event, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -7750,7 +7872,7 @@ static int __init vmx_init(void) memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); - if (enable_apicv_reg_vid) { + if (enable_apicv) { for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f25d70..6e1e6e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2681,6 +2681,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + kvm_x86_ops->sync_pir_to_irr(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); return 0; @@ -5698,6 +5699,9 @@ static int
vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) update_eoi_exitmap(vcpu); + if (kvm_check_request(KVM_REQ_PENDING_PIR, vcpu)) + if (kvm_x86_ops->sync_pir_to_irr(vcpu)) + kvm_make_request(KVM_REQ_EVENT, vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0350e0d..a410819 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -124,6 +124,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MCLOCK_INPROGRESS 20 #define KVM_REQ_EPR_EXIT 21 #define KVM_REQ_EOIBITMAP 22 +#define KVM_REQ_PENDING_PIR 23 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1