From patchwork Wed Nov 21 08:09:39 2012
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Zhang, Yang Z" <yang.z.zhang@intel.com>
X-Patchwork-Id: 1778811
Return-Path: <kvm-owner@vger.kernel.org>
X-Original-To: patchwork-kvm@patchwork.kernel.org
Delivered-To: patchwork-process-083081@patchwork2.kernel.org
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by patchwork2.kernel.org (Postfix) with ESMTP id 6E309DF288
	for <patchwork-kvm@patchwork.kernel.org>;
	Wed, 21 Nov 2012 08:14:14 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1753512Ab2KUIN4 (ORCPT
	<rfc822;patchwork-kvm@patchwork.kernel.org>);
	Wed, 21 Nov 2012 03:13:56 -0500
Received: from mga03.intel.com ([143.182.124.21]:64721 "EHLO mga03.intel.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1753427Ab2KUINz (ORCPT <rfc822;kvm@vger.kernel.org>);
	Wed, 21 Nov 2012 03:13:55 -0500
Received: from azsmga002.ch.intel.com ([10.2.17.35])
	by azsmga101.ch.intel.com with ESMTP; 21 Nov 2012 00:13:54 -0800
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="4.83,291,1352102400"; d="scan'208";a="170896295"
Received: from yang-desktop.sh.intel.com ([10.239.13.107])
	by AZSMGA002.ch.intel.com with ESMTP; 21 Nov 2012 00:13:53 -0800
From: Yang Zhang <yang.z.zhang@intel.com>
To: kvm@vger.kernel.org
Cc: mtosatti@redhat.com, gleb@redhat.com, Yang Zhang <yang.z.zhang@intel.com>
Subject: [PATCH v2 6/6] x86, apicv: Add Posted Interrupt supporting
Date: Wed, 21 Nov 2012 16:09:39 +0800
Message-Id: <1353485379-6823-7-git-send-email-yang.z.zhang@intel.com>
X-Mailer: git-send-email 1.7.1.1
In-Reply-To: <1353485379-6823-1-git-send-email-yang.z.zhang@intel.com>
References: <1353485379-6823-1-git-send-email-yang.z.zhang@intel.com>
Sender: kvm-owner@vger.kernel.org
Precedence: bulk
List-ID: <kvm.vger.kernel.org>
X-Mailing-List: kvm@vger.kernel.org

Posted Interrupt allows vAPICV interrupts to be injected into the guest
directly, without any vmexit.

- When delivering an interrupt to the guest, if the target vcpu is running,
  update the Posted-interrupt requests bitmap and send a notification event
  to the vcpu. The vcpu will then handle this interrupt automatically,
  without any software involvement.

- If the target vcpu is not running, or there is already a notification
  event pending in the vcpu, do nothing. The interrupt will be handled the
  old way.

Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
---
 arch/x86/include/asm/kvm_host.h |    3 +
 arch/x86/include/asm/vmx.h      |    4 +
 arch/x86/kernel/apic/io_apic.c  |  138 ++++++++++++++++++++++++++++
 arch/x86/kvm/lapic.c            |   31 ++++++-
 arch/x86/kvm/lapic.h            |    8 ++
 arch/x86/kvm/vmx.c              |  192 +++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/x86.c              |    2 +
 include/linux/kvm_host.h        |    1 +
 virt/kvm/kvm_main.c             |    2 +
 9 files changed, 372 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e07a86..1145894 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -683,9 +683,12 @@ struct kvm_x86_ops {
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 	int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
+	int (*has_posted_interrupt)(struct kvm_vcpu *vcpu);
 	void (*update_irq)(struct kvm_vcpu *vcpu);
 	void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
 			int need_eoi, int global);
+	int (*send_nv)(struct kvm_vcpu *vcpu, int vector);
+	void (*pi_migrate)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 1003341..7b9e1d0 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -152,6 +152,7 @@
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+#define PIN_BASED_POSTED_INTR                   0x00000080
 
 #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
@@ -174,6 +175,7 @@
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID            = 0x00000000,
+	POSTED_INTR_NV                  = 0x00000002,
 	GUEST_ES_SELECTOR               = 0x00000800,
 	GUEST_CS_SELECTOR               = 0x00000802,
 	GUEST_SS_SELECTOR               = 0x00000804,
@@ -208,6 +210,8 @@ enum vmcs_field {
 	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
 	APIC_ACCESS_ADDR		= 0x00002014,
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+	POSTED_INTR_DESC_ADDR           = 0x00002016,
+	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
 	EOI_EXIT_BITMAP0                = 0x0000201c,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1817fa9..97cb8ee 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3277,6 +3277,144 @@ int arch_setup_dmar_msi(unsigned int irq)
 }
 #endif
 
+/* irq_chip affinity hook installed in pi_chip. */
+static int pi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+			   bool force)
+{
+	struct irq_cfg *cfg = data->chip_data;
+	unsigned int dest;
+
+	/* The domain already equals the requested mask: nothing to change. */
+	if (cpumask_equal(cfg->domain, mask))
+		return IRQ_SET_MASK_OK;
+
+	/* A non-zero result from __ioapic_set_affinity() is reported as -1. */
+	return __ioapic_set_affinity(data, mask, &dest) ? -1 : IRQ_SET_MASK_OK;
+}
+
+/* .irq_mask callback of pi_chip; it does nothing. */
+static void pi_mask(struct irq_data *data)
+{
+}
+
+/* .irq_unmask callback of pi_chip; it does nothing. */
+static void pi_unmask(struct irq_data *data)
+{
+}
+
+static struct irq_chip pi_chip = {	/* installed by arch_pi_alloc_irq() */
+	.name       = "POSTED-INTR",
+	.irq_ack    = ack_apic_edge,
+	.irq_unmask = pi_unmask,	/* empty callback */
+	.irq_mask   = pi_mask,		/* empty callback */
+	.irq_set_affinity   = pi_set_affinity,
+};
+
+/* Retarget @irq at @cpu if needed; returns its vector, or -EINVAL. */
+int arch_pi_migrate(int irq, int cpu)
+{
+	const struct cpumask *target = cpumask_of(cpu);
+	struct irq_data *data = irq_get_irq_data(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg;
+	unsigned long flags;
+
+	if (!desc)
+		return -EINVAL;
+
+	cfg = irq_cfg(irq);
+	if (!cpumask_equal(cfg->domain, target)) {
+		irq_set_affinity(irq, target);
+		raw_spin_lock_irqsave(&desc->lock, flags);
+		irq_move_irq(data);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
+		if (cfg->move_in_progress)
+			send_cleanup_vector(cfg);
+	}
+	return cfg->vector;
+}
+EXPORT_SYMBOL_GPL(arch_pi_migrate);
+
+/* Create an IRQ with a vector in @mask; returns the IRQ, or 0 on failure. */
+static int arch_pi_create_irq(const struct cpumask *mask)
+{
+	int node = cpu_to_node(0);
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int ret = 0;
+	int irq;
+
+	irq = alloc_irq_from(nr_irqs_gsi, node);
+	if (irq < 0)
+		return 0;
+
+	/* Use the cfg of the IRQ actually allocated, not of nr_irqs_gsi. */
+	cfg = alloc_irq_cfg(irq, node);
+	if (!cfg) {
+		free_irq_at(irq, NULL);
+		return 0;
+	}
+
+	raw_spin_lock_irqsave(&vector_lock, flags);
+	if (!__assign_irq_vector(irq, cfg, mask))
+		ret = irq;
+	raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+	if (ret) {
+		irq_set_chip_data(irq, cfg);
+		irq_clear_status_flags(irq, IRQ_NOREQUEST);
+	} else {
+		free_irq_at(irq, cfg);
+	}
+	return ret;
+}
+
+/* Allocate a posted-interrupt IRQ bound to the calling CPU. */
+int arch_pi_alloc_irq(void *vmx)
+{
+	const struct cpumask *self = cpumask_of(smp_processor_id());
+	int irq = arch_pi_create_irq(self);
+	struct irq_cfg *cfg;
+
+	if (!irq) {
+		pr_err("Posted Interrupt: no free irq\n");
+		return -EINVAL;
+	}
+	irq_set_handler_data(irq, vmx);
+	irq_set_chip_and_handler_name(irq, &pi_chip, handle_edge_irq, "edge");
+	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+	irq_set_affinity(irq, self);
+
+	cfg = irq_cfg(irq);
+	if (cfg->move_in_progress)
+		send_cleanup_vector(cfg);
+	return irq;
+}
+EXPORT_SYMBOL_GPL(arch_pi_alloc_irq);
+
+/* Free a PI IRQ; 0 (none allocated) and -1 (setup failed) are skipped. */
+void arch_pi_free_irq(unsigned int irq, void *vmx)
+{
+	if (!irq || irq == (unsigned int)-1)
+		return;
+	irq_set_handler_data(irq, NULL);
+	free_irq(irq, vmx);	/* This will mask the irq */
+	destroy_irq(irq);
+}
+EXPORT_SYMBOL_GPL(arch_pi_free_irq);
+
+/*
+ * Return the vector assigned to @irq, or -EINVAL when @irq is 0.
+ */
+int arch_pi_get_vector(unsigned int irq)
+{
+	if (irq == 0)
+		return -EINVAL;
+
+	return irq_cfg(irq)->vector;
+}
+EXPORT_SYMBOL_GPL(arch_pi_get_vector);
+
 #ifdef CONFIG_HPET_TIMER
 
 static int hpet_msi_set_affinity(struct irq_data *data,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index af48361..04220de 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -656,7 +656,7 @@ void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode)
 {
-	int result = 0;
+	int result = 0, send;
 	struct kvm_vcpu *vcpu = apic->vcpu;
 
 	switch (delivery_mode) {
@@ -674,6 +674,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else {
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 			kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
+			if (kvm_apic_pi_enabled(vcpu)) {
+				send = kvm_x86_ops->send_nv(vcpu, vector);
+				if (send) {
+					result = 1;
+					break;
+				}
+			}
 		}
 
 		result = !apic_test_and_set_irr(vector, apic);
@@ -1541,6 +1548,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 
 	if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
 		apic->vid_enabled = true;
+
+	if (kvm_x86_ops->has_posted_interrupt(vcpu))
+		apic->pi_enabled = true;
+
 	return 0;
 nomem_free_apic:
 	kfree(apic);
@@ -1575,6 +1586,24 @@ int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
 
+/* Move pending vectors from the posted-interrupt bitmap @pir into the IRR. */
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	unsigned int *reg, val, i;
+
+	if (!apic || !apic_enabled(apic))
+		return;
+
+	for (i = 0; i <= 7; i++) {
+		/* Fetch and clear in one step so a concurrent post is not lost. */
+		val = xchg(&pir[i], 0);
+		reg = apic->regs + APIC_IRR + i * 0x10;
+		*reg |= val;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
 	u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2503a64..ad35868 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -21,6 +21,7 @@ struct kvm_lapic {
 	struct kvm_vcpu *vcpu;
 	bool irr_pending;
 	bool vid_enabled;
+	bool pi_enabled;
 	/* Number of bits set in ISR. */
 	s16 isr_count;
 	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
@@ -43,6 +44,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
 int kvm_cpu_get_extint(struct kvm_vcpu *v);
 int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -94,6 +96,12 @@ static inline bool kvm_apic_vid_enabled(struct kvm_vcpu *vcpu)
 	return apic->vid_enabled;
 }
 
+/* Report the pi_enabled flag of @vcpu's local APIC. */
+static inline bool kvm_apic_pi_enabled(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic->pi_enabled;
+}
+
 int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
 void kvm_lapic_init(void);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f6ef090..6448b96 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -31,6 +31,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
+#include <linux/interrupt.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -89,6 +90,8 @@ module_param(enable_apicv_reg, bool, S_IRUGO);
 static bool __read_mostly enable_apicv_vid = 0;
 module_param(enable_apicv_vid, bool, S_IRUGO);
 
+static bool __read_mostly enable_apicv_pi = 0;
+module_param(enable_apicv_pi, bool, S_IRUGO);
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -372,6 +375,44 @@ struct nested_vmx {
 	struct page *apic_access_page;
 };
 
+/* Posted-Interrupt Descriptor; its address goes into POSTED_INTR_DESC_ADDR */
+struct pi_desc {
+	u32 pir[8];     /* Posted interrupt requests: 256 bits, one per vector */
+	union {
+		struct {
+			u8  on:1,	/* accessed as bit POSTED_INTR_ON */
+			    rsvd:7;
+		} control;
+		u32 rsvd[8];	/* sizes the union at 32 bytes */
+	} u;
+} __aligned(64);
+
+#define POSTED_INTR_ON  0	/* bit index of "on" in pi_desc.u.control */
+static inline u8 pi_test_on(struct pi_desc *pi_desc)
+{
+	return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
+}
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+	set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+	clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
+}
+/* Sets the "on" bit and returns its previous value. */
+static inline u8 pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+	return test_and_set_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->u.control);
+}
+/* Marks @vector as requested in the descriptor's pir bitmap. */
+static inline void pi_set_pir(int vector, struct pi_desc *pi_desc)
+{
+	set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
@@ -439,6 +480,11 @@ struct vcpu_vmx {
 	u64 eoi_exit_bitmap[4];
 	u64 eoi_exit_bitmap_global[4];
 
+	/* Posted interrupt descriptor */
+	struct pi_desc *pi;
+	u32 irq;
+	u32 vector;
+
 	/* Support for a guest hypervisor (nested VMX) */
 	struct nested_vmx nested;
 };
@@ -698,6 +744,11 @@ static u64 host_efer;
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
+int arch_pi_get_vector(unsigned int irq);
+int arch_pi_alloc_irq(struct vcpu_vmx *vmx);
+void arch_pi_free_irq(unsigned int irq, struct vcpu_vmx *vmx);
+int arch_pi_migrate(int irq, int cpu);
+
 /*
  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
  * away by decrementing the array size.
@@ -783,6 +834,11 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+	return (vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) != 0;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -1555,6 +1611,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
+		if (enable_apicv_pi && to_vmx(vcpu)->pi)
+			pi_set_on(to_vmx(vcpu)->pi);
+
+		kvm_make_request(KVM_REQ_POSTED_INTR, vcpu);
+
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
 		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
@@ -1582,6 +1643,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 		vcpu->cpu = -1;
 		kvm_cpu_vmxoff();
 	}
+	if (enable_apicv_pi && to_vmx(vcpu)->pi)
+		pi_set_on(to_vmx(vcpu)->pi);
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -2451,12 +2514,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	u32 _vmexit_control = 0;
 	u32 _vmentry_control = 0;
 
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-				&_pin_based_exec_control) < 0)
-		return -EIO;
-
 	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
@@ -2531,6 +2588,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_vmexit_control) < 0)
 		return -EIO;
 
+	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+				&_pin_based_exec_control) < 0)
+		return -EIO;
+
+	if (!(_cpu_based_2nd_exec_control &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
+		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
 	min = 0;
 	opt = VM_ENTRY_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
@@ -2715,6 +2783,9 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_virtual_intr_delivery())
 		enable_apicv_vid = 0;
 
+	if (!cpu_has_vmx_posted_intr() || !x2apic_enabled())
+		enable_apicv_pi = 0;
+
 	if (nested)
 		nested_vmx_setup_ctls_msrs();
 
@@ -3881,6 +3952,93 @@ static void ept_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
 }
 
+/* Handler for the PI IRQ: raise KVM_REQ_EVENT on the vCPU and kick it. */
+static irqreturn_t pi_handler(int irq, void *data)
+{
+	struct vcpu_vmx *vmx = data;
+
+	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+	kvm_vcpu_kick(&vmx->vcpu);
+	return IRQ_HANDLED;
+}
+
+static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+	return enable_apicv_pi && irqchip_in_kernel(vcpu->kvm);
+}
+
+/*
+ * KVM_REQ_POSTED_INTR handler: allocate and request the notification IRQ
+ * on first use, otherwise retarget it at the current CPU, then write its
+ * vector to POSTED_INTR_NV and clear ON.  Failures leave vmx->irq at -1.
+ * NOTE(review): request_irq() is called here with interrupts disabled.
+ */
+static void vmx_pi_migrate(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int vec;
+
+	if (!enable_apicv_pi)
+		return;
+
+	preempt_disable();
+	local_irq_disable();
+	if (vmx->irq) {
+		vec = arch_pi_migrate(vmx->irq, smp_processor_id());
+	} else {
+		vec = arch_pi_alloc_irq(vmx);
+		if (vec < 0)
+			goto fail;
+		vmx->irq = vec;
+
+		if (request_irq(vmx->irq, pi_handler, IRQF_NO_THREAD,
+					"Posted Interrupt", vmx))
+			goto fail;
+		vec = arch_pi_get_vector(vmx->irq);
+	}
+	if (vec < 0)
+		goto fail;
+
+	vmx->vector = vec;
+	vmcs_write16(POSTED_INTR_NV, vmx->vector);
+	pi_clear_on(vmx->pi);
+	goto out;
+fail:
+	vmx->irq = -1;
+out:
+	local_irq_enable();
+	preempt_enable();
+}
+
+/* Try to post @vector to @vcpu; returns 1 only if a notification was sent. */
+static int vmx_send_nv(struct kvm_vcpu *vcpu, int vector)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (unlikely(vmx->irq == -1))
+		return 0;
+
+	if (vcpu->cpu == smp_processor_id()) {
+		pi_set_on(vmx->pi);
+		return 0;
+	}
+
+	pi_set_pir(vector, vmx->pi);
+	if (pi_test_and_set_on(vmx->pi) || vcpu->mode != IN_GUEST_MODE)
+		return 0;
+
+	apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), vmx->vector);
+	return 1;
+}
+
+/* Release the PI IRQ (unless setup failed, irq == -1) and the descriptor. */
+static void free_pi(struct vcpu_vmx *vmx)
+{
+	if (enable_apicv_pi && vmx->irq != -1)
+		arch_pi_free_irq(vmx->irq, vmx);
+	kfree(vmx->pi);	/* never set when PI is disabled; kfree(NULL) is a no-op */
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -3890,6 +4048,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	unsigned long a;
 #endif
 	int i;
+	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 
 	/* I/O */
 	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -3901,8 +4060,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
 	/* Control */
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-		vmcs_config.pin_based_exec_ctrl);
+	if (!enable_apicv_pi)
+		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl);
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -3920,6 +4081,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write16(GUEST_INTR_STATUS, 0);
 	}
 
+	if (enable_apicv_pi) {
+		vmx->pi = kmalloc(sizeof(struct pi_desc),
+				GFP_KERNEL | __GFP_ZERO);
+		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
+	}
+
 	if (ple_gap) {
 		vmcs_write32(PLE_GAP, ple_gap);
 		vmcs_write32(PLE_WINDOW, ple_window);
@@ -6161,6 +6328,11 @@ static void vmx_update_irq(struct kvm_vcpu *vcpu)
 	if (!enable_apicv_vid)
 		return ;
 
+	if (enable_apicv_pi) {
+		kvm_apic_update_irr(vcpu, (unsigned int *)vmx->pi->pir);
+		pi_clear_on(vmx->pi);
+	}
+
 	vector = kvm_apic_get_highest_irr(vcpu);
 	if (vector == -1)
 		return;
@@ -6586,6 +6758,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 
 	free_vpid(vmx);
 	free_nested(vmx);
+	free_pi(vmx);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
@@ -7483,8 +7656,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
 	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
+	.has_posted_interrupt = vmx_has_posted_interrupt,
 	.update_irq = vmx_update_irq,
 	.set_eoi_exitmap = vmx_set_eoi_exitmap,
+	.send_nv = vmx_send_nv,
+	.pi_migrate = vmx_pi_migrate,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b8de3b..f035267 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5250,6 +5250,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_immediate_exit = 0;
 
 	if (vcpu->requests) {
+		if (kvm_check_request(KVM_REQ_POSTED_INTR, vcpu))
+			kvm_x86_ops->pi_migrate(vcpu);
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ecc5543..f8d8d34 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -107,6 +107,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_IMMEDIATE_EXIT    15
 #define KVM_REQ_PMU               16
 #define KVM_REQ_PMI               17
+#define KVM_REQ_POSTED_INTR       18
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index be70035..05baf1c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1625,6 +1625,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 			smp_send_reschedule(cpu);
 	put_cpu();
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
+
 #endif /* !CONFIG_S390 */
 
 void kvm_resched(struct kvm_vcpu *vcpu)