diff mbox

[v4,2/2] KVM: VMX: Add Posted Interrupt supporting

Message ID 1361540552-2016-3-git-send-email-yang.z.zhang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhang, Yang Z Feb. 22, 2013, 1:42 p.m. UTC
From: Yang Zhang <yang.z.zhang@Intel.com>

Posted Interrupt allows APIC interrupts to inject into guest directly
without any vmexit.

- When delivering a interrupt to guest, if target vcpu is running,
  update Posted-interrupt requests bitmap and send a notification event
  to the vcpu. Then the vcpu will handle this interrupt automatically,
  without any software involvemnt.

- If target vcpu is not running or there already a notification event
  pending in the vcpu, do nothing. The interrupt will be handled by
  next vm entry

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
---
 arch/x86/include/asm/entry_arch.h  |    4 +
 arch/x86/include/asm/hw_irq.h      |    1 +
 arch/x86/include/asm/irq_vectors.h |    5 +
 arch/x86/include/asm/kvm_host.h    |    3 +
 arch/x86/include/asm/vmx.h         |    4 +
 arch/x86/kernel/entry_64.S         |    5 +
 arch/x86/kernel/irq.c              |   20 +++++
 arch/x86/kernel/irqinit.c          |    4 +
 arch/x86/kvm/lapic.c               |   32 +++++++-
 arch/x86/kvm/lapic.h               |    2 +
 arch/x86/kvm/svm.c                 |   13 +++
 arch/x86/kvm/vmx.c                 |  161 +++++++++++++++++++++++++++++++-----
 arch/x86/kvm/x86.c                 |    1 +
 13 files changed, 232 insertions(+), 23 deletions(-)

Comments

Marcelo Tosatti Feb. 23, 2013, 1:43 p.m. UTC | #1
On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
> From: Yang Zhang <yang.z.zhang@Intel.com>
> 
> Posted Interrupt allows APIC interrupts to inject into guest directly
> without any vmexit.
> 
> - When delivering a interrupt to guest, if target vcpu is running,
>   update Posted-interrupt requests bitmap and send a notification event
>   to the vcpu. Then the vcpu will handle this interrupt automatically,
>   without any software involvemnt.
> 
> - If target vcpu is not running or there already a notification event
>   pending in the vcpu, do nothing. The interrupt will be handled by
>   next vm entry
> 
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> ---

> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> index e4595f1..64616cc 100644
> --- a/arch/x86/kernel/irq.c
> +++ b/arch/x86/kernel/irq.c
> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>  	set_irq_regs(old_regs);
>  }
>  
> +#ifdef CONFIG_HAVE_KVM
> +/*
> + * Handler for POSTED_INTERRUPT_VECTOR.
> + */
> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
> +{
> +	struct pt_regs *old_regs = set_irq_regs(regs);
> +
> +	ack_APIC_irq();
> +
> +	irq_enter();
> +
> +	exit_idle();
> +
> +	irq_exit();
> +
> +	set_irq_regs(old_regs);
> +}
> +#endif
> +

Add per-cpu counters, similarly to other handlers in the same file.

> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
>  	if (!apic->irr_pending)
>  		return -1;
>  
> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>  	result = apic_search_irr(apic);
>  	ASSERT(result == -1 || result >= 16);

kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
before inject_pending_event.

        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
		->
		inject_pending_event
		...
	}


> @@ -685,6 +705,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  {
>  	int result = 0;
>  	struct kvm_vcpu *vcpu = apic->vcpu;
> +	bool delivered = false;
>  
>  	switch (delivery_mode) {
>  	case APIC_DM_LOWEST:
> @@ -700,7 +721,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  		} else
>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>  
> -		result = !apic_test_and_set_irr(vector, apic);
> +		if (!kvm_x86_ops->deliver_posted_interrupt(vcpu, vector,
> +						&result, &delivered))
> +			result = !apic_test_and_set_irr(vector, apic);
> +
>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>  					  trig_mode, vector, !result);
>  		if (!result) {
> @@ -710,8 +734,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  			break;
>  		}
>  
> -		kvm_make_request(KVM_REQ_EVENT, vcpu);
> -		kvm_vcpu_kick(vcpu);
> +		if (!delivered) {
> +			kvm_make_request(KVM_REQ_EVENT, vcpu);
> +			kvm_vcpu_kick(vcpu);
> +		}

This set bit / kick pair should be for non-APICv only (please
check for it directly).

> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
> +}
> +
> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
> +{
> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> +}

This must be locked atomic operation (set_bit).

> +
>  struct vcpu_vmx {
>  	struct kvm_vcpu       vcpu;
>  	unsigned long         host_rsp;
> @@ -429,6 +465,10 @@ struct vcpu_vmx {
>  
>  	bool rdtscp_enabled;
>  
> +	/* Posted interrupt descriptor */
> +	struct pi_desc pi_desc;
> +	spinlock_t pi_lock;

Don't see why the lock is necessary. Please use the following
pseudocode for vmx_deliver_posted_interrupt:

vmx_deliver_posted_interrupt(), returns 'bool injected'.

1. orig_irr = read irr from vapic page
2. if (orig_irr != 0)
3.	return false;
4. kvm_make_request(KVM_REQ_EVENT)
5. bool injected = !test_and_set_bit(PIR)
6. if (vcpu->guest_mode && injected)
7. 	if (test_and_set_bit(PIR notification bit))
8.		send PIR IPI
9. return injected

On vcpu_enter_guest:

if (kvm_check_request(KVM_REQ_EVENT))  {
	pir->virr sync			(*)
	...
}
...
vcpu->mode = IN_GUEST_MODE;
local_irq_disable
clear pir notification bit unconditionally (*)

Right after local_irq_disable.

> +static bool vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
> +		int vector, int *result, bool *delivered)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long flags;
> +
> +	if (!vmx_vm_has_apicv(vcpu->kvm))
> +		return false;
> +
> +	spin_lock_irqsave(&vmx->pi_lock, flags);
> +	if (pi_test_pir(vector, &vmx->pi_desc) ||
> +		kvm_apic_test_irr(vector, vcpu->arch.apic)) {
> +		spin_unlock_irqrestore(&vmx->pi_lock, flags);
> +		return true;
> +	}
> +
> +	pi_set_pir(vector, &vmx->pi_desc);
> +	*result = 1;
> +	if (!pi_test_and_set_on(&vmx->pi_desc) &&
> +			(vcpu->mode == IN_GUEST_MODE)) {
> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
> +		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> +				    POSTED_INTR_VECTOR);
> +		*delivered = true;
> +	}
> +	spin_unlock_irqrestore(&vmx->pi_lock, flags);
> +
> +	return true;
> +}

Please confirm whether a spinlock is necessary with the pseudocode 
above.

> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long flags;
> +
> +	if (!vmx_vm_has_apicv(vcpu->kvm))
> +		return;
> +
> +	spin_lock_irqsave(&vmx->pi_lock, flags);


> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {

This needs to move to vcpu_enter_guest, after local_irq_disabled,
unconditionally, as noted.

> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
>  				    struct kvm_lapic_state *s)
>  {
> +	kvm_x86_ops->sync_pir_to_irr(vcpu);
>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>  
>  	return 0;
> -- 
> 1.7.1


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 23, 2013, 2:05 p.m. UTC | #2
Marcelo Tosatti wrote on 2013-02-23:
> 
> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
>> From: Yang Zhang <yang.z.zhang@Intel.com>
>> 
>> Posted Interrupt allows APIC interrupts to inject into guest directly
>> without any vmexit.
>> 
>> - When delivering a interrupt to guest, if target vcpu is running,
>>   update Posted-interrupt requests bitmap and send a notification event
>>   to the vcpu. Then the vcpu will handle this interrupt automatically,
>>   without any software involvemnt.
>> - If target vcpu is not running or there already a notification event
>>   pending in the vcpu, do nothing. The interrupt will be handled by
>>   next vm entry
>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>> ---
>> 
>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>> index e4595f1..64616cc 100644
>> --- a/arch/x86/kernel/irq.c
>> +++ b/arch/x86/kernel/irq.c
>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>>  	set_irq_regs(old_regs);
>>  }
>> +#ifdef CONFIG_HAVE_KVM
>> +/*
>> + * Handler for POSTED_INTERRUPT_VECTOR.
>> + */
>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>> +{
>> +	struct pt_regs *old_regs = set_irq_regs(regs);
>> +
>> +	ack_APIC_irq();
>> +
>> +	irq_enter();
>> +
>> +	exit_idle();
>> +
>> +	irq_exit();
>> +
>> +	set_irq_regs(old_regs);
>> +}
>> +#endif
>> +
> 
> Add per-cpu counters, similarly to other handlers in the same file.
Sure.

>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
> *apic)
>>  	if (!apic->irr_pending)
>>  		return -1;
>> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>>  	result = apic_search_irr(apic);
>>  	ASSERT(result == -1 || result >= 16);
> 
> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
> before inject_pending_event.
> 
>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> 		->
> 		inject_pending_event
> 		...
> 	}
Some other places will search irr to do something, like kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch irr, not just before enter guest.

> 
>> @@ -685,6 +705,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>  {
>>  	int result = 0;
>>  	struct kvm_vcpu *vcpu = apic->vcpu;
>> +	bool delivered = false;
>> 
>>  	switch (delivery_mode) {
>>  	case APIC_DM_LOWEST:
>> @@ -700,7 +721,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>  		} else
>>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>> -		result = !apic_test_and_set_irr(vector, apic);
>> +		if (!kvm_x86_ops->deliver_posted_interrupt(vcpu, vector,
>> +						&result, &delivered))
>> +			result = !apic_test_and_set_irr(vector, apic);
>> +
>>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>>  					  trig_mode, vector, !result);
>>  		if (!result) {
>> @@ -710,8 +734,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>  			break;
>>  		}
>> -		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> -		kvm_vcpu_kick(vcpu);
>> +		if (!delivered) {
>> +			kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +			kvm_vcpu_kick(vcpu);
>> +		}
> 
> This set bit / kick pair should be for non-APICv only (please
> check for it directly).
Sure
 
>> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
>> +}
>> +
>> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
>> +{
>> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
>> +}
> 
> This must be locked atomic operation (set_bit).
If using spinlock, pi_set_pir is not called concurrently. It safe to use _set_bit.

 
>> +
>>  struct vcpu_vmx {
>>  	struct kvm_vcpu       vcpu;
>>  	unsigned long         host_rsp;
>> @@ -429,6 +465,10 @@ struct vcpu_vmx {
>> 
>>  	bool rdtscp_enabled;
>> +	/* Posted interrupt descriptor */
>> +	struct pi_desc pi_desc;
>> +	spinlock_t pi_lock;
> 
> Don't see why the lock is necessary. Please use the following
> pseudocode for vmx_deliver_posted_interrupt:
> 
> vmx_deliver_posted_interrupt(), returns 'bool injected'.
> 
> 1. orig_irr = read irr from vapic page
> 2. if (orig_irr != 0)
> 3.	return false;
> 4. kvm_make_request(KVM_REQ_EVENT)
> 5. bool injected = !test_and_set_bit(PIR)
> 6. if (vcpu->guest_mode && injected)
> 7. 	if (test_and_set_bit(PIR notification bit))
> 8.		send PIR IPI
> 9. return injected

Consider follow case:
vcpu 0                      |                vcpu1
send intr to vcpu1
check irr
                                            receive a posted intr
										pir->irr(pir is cleared, irr is set)
injected=test_and_set_bit=true
pir=set

Then both irr and pir have the interrupt pending, they may merge to one later, but injected reported as true. Wrong.

> On vcpu_enter_guest:
> 
> if (kvm_check_request(KVM_REQ_EVENT))  {
> 	pir->virr sync			(*)
> 	...
> }
> ...
> vcpu->mode = IN_GUEST_MODE;
> local_irq_disable
> clear pir notification bit unconditionally (*)
Why clear ON bit here? If there is no pir->irr operation in this vmentry, clear on here is redundant. 
 
> Right after local_irq_disable.
> 
>> +static bool vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
>> +		int vector, int *result, bool *delivered)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	unsigned long flags;
>> +
>> +	if (!vmx_vm_has_apicv(vcpu->kvm))
>> +		return false;
>> +
>> +	spin_lock_irqsave(&vmx->pi_lock, flags);
>> +	if (pi_test_pir(vector, &vmx->pi_desc) ||
>> +		kvm_apic_test_irr(vector, vcpu->arch.apic)) {
>> +		spin_unlock_irqrestore(&vmx->pi_lock, flags);
>> +		return true;
>> +	}
>> +
>> +	pi_set_pir(vector, &vmx->pi_desc);
>> +	*result = 1;
>> +	if (!pi_test_and_set_on(&vmx->pi_desc) &&
>> +			(vcpu->mode == IN_GUEST_MODE)) {
>> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>> +				    POSTED_INTR_VECTOR);
>> +		*delivered = true;
>> +	}
>> +	spin_unlock_irqrestore(&vmx->pi_lock, flags);
>> +
>> +	return true;
>> +}
> 
> Please confirm whether a spinlock is necessary with the pseudocode above.
In current implementation, spinlock is necessary to avoid race condition between vcpus when delivery posted interrupt and sync pir->irr.

>> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	unsigned long flags;
>> +
>> +	if (!vmx_vm_has_apicv(vcpu->kvm))
>> +		return;
>> +
>> +	spin_lock_irqsave(&vmx->pi_lock, flags);
> 
> 
>> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
> 
> This needs to move to vcpu_enter_guest, after local_irq_disabled,
> unconditionally, as noted.
> 
>> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 				   
>>  struct kvm_lapic_state *s) { +	kvm_x86_ops->sync_pir_to_irr(vcpu);
>>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>>  
>>  	return 0;
>> --
>> 1.7.1
>


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 23, 2013, 2:35 p.m. UTC | #3
On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> >> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
> >> +}
> >> +
> >> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
> >> +{
> >> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> >> +}
> > 
> > This must be locked atomic operation (set_bit).
> If using spinlock, pi_set_pir is not called concurrently. It safe to use _set_bit.
> 
HW still access it concurrently without bothering taking your lock.

>  
> >> +
> >>  struct vcpu_vmx {
> >>  	struct kvm_vcpu       vcpu;
> >>  	unsigned long         host_rsp;
> >> @@ -429,6 +465,10 @@ struct vcpu_vmx {
> >> 
> >>  	bool rdtscp_enabled;
> >> +	/* Posted interrupt descriptor */
> >> +	struct pi_desc pi_desc;
> >> +	spinlock_t pi_lock;
> > 
> > Don't see why the lock is necessary. Please use the following
> > pseudocode for vmx_deliver_posted_interrupt:
> > 
> > vmx_deliver_posted_interrupt(), returns 'bool injected'.
> > 
> > 1. orig_irr = read irr from vapic page
> > 2. if (orig_irr != 0)
> > 3.	return false;
> > 4. kvm_make_request(KVM_REQ_EVENT)
> > 5. bool injected = !test_and_set_bit(PIR)
> > 6. if (vcpu->guest_mode && injected)
> > 7. 	if (test_and_set_bit(PIR notification bit))
> > 8.		send PIR IPI
> > 9. return injected
> 
> Consider follow case:
> vcpu 0                      |                vcpu1
> send intr to vcpu1
> check irr
>                                             receive a posted intr
> 										pir->irr(pir is cleared, irr is set)
> injected=test_and_set_bit=true
> pir=set
> 
> Then both irr and pir have the interrupt pending, they may merge to one later, but injected reported as true. Wrong.
> 
I and Marcelo discussed the lockless logic that should be used here on
the previous patch submission. All is left for you is to implement it.
We worked hard to make irq injection path lockless, we will not going to
add one now.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 23, 2013, 2:44 p.m. UTC | #4
On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> Marcelo Tosatti wrote on 2013-02-23:
> > 
> > On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
> >> From: Yang Zhang <yang.z.zhang@Intel.com>
> >> 
> >> Posted Interrupt allows APIC interrupts to inject into guest directly
> >> without any vmexit.
> >> 
> >> - When delivering a interrupt to guest, if target vcpu is running,
> >>   update Posted-interrupt requests bitmap and send a notification event
> >>   to the vcpu. Then the vcpu will handle this interrupt automatically,
> >>   without any software involvemnt.
> >> - If target vcpu is not running or there already a notification event
> >>   pending in the vcpu, do nothing. The interrupt will be handled by
> >>   next vm entry
> >> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >> ---
> >> 
> >> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> >> index e4595f1..64616cc 100644
> >> --- a/arch/x86/kernel/irq.c
> >> +++ b/arch/x86/kernel/irq.c
> >> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
> >>  	set_irq_regs(old_regs);
> >>  }
> >> +#ifdef CONFIG_HAVE_KVM
> >> +/*
> >> + * Handler for POSTED_INTERRUPT_VECTOR.
> >> + */
> >> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
> >> +{
> >> +	struct pt_regs *old_regs = set_irq_regs(regs);
> >> +
> >> +	ack_APIC_irq();
> >> +
> >> +	irq_enter();
> >> +
> >> +	exit_idle();
> >> +
> >> +	irq_exit();
> >> +
> >> +	set_irq_regs(old_regs);
> >> +}
> >> +#endif
> >> +
> > 
> > Add per-cpu counters, similarly to other handlers in the same file.
> Sure.
> 
> >> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
> > *apic)
> >>  	if (!apic->irr_pending)
> >>  		return -1;
> >> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
> >>  	result = apic_search_irr(apic);
> >>  	ASSERT(result == -1 || result >= 16);
> > 
> > kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
> > before inject_pending_event.
> > 
> >         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> > 		->
> > 		inject_pending_event
> > 		...
> > 	}
> Some other places will search irr to do something, like kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch irr, not just before enter guest.

I see. The only call site that needs IRR/PIR information with posted
interrupt enabled is kvm_arch_vcpu_runnable, correct?

If so, can we contain reading PIR to only that location?

And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
standard way of event processing and also reduce reading the PIR).

> >> +{
> >> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> >> +}
> > 
> > This must be locked atomic operation (set_bit).
> If using spinlock, pi_set_pir is not called concurrently. It safe to use _set_bit.

The manual demands atomic locked accesses (remember this a remote
access). See the posted interrupt page.

> > 
> > Don't see why the lock is necessary. Please use the following
> > pseudocode for vmx_deliver_posted_interrupt:
> > 
> > vmx_deliver_posted_interrupt(), returns 'bool injected'.
> > 
> > 1. orig_irr = read irr from vapic page
> > 2. if (orig_irr != 0)
> > 3.	return false;
> > 4. kvm_make_request(KVM_REQ_EVENT)
> > 5. bool injected = !test_and_set_bit(PIR)
> > 6. if (vcpu->guest_mode && injected)
> > 7. 	if (test_and_set_bit(PIR notification bit))
> > 8.		send PIR IPI
> > 9. return injected
> 
> Consider follow case:
> vcpu 0                      |                vcpu1
> send intr to vcpu1
> check irr
>                                             receive a posted intr
> 					      pir->irr(pir is cleared, irr is set)
> injected=test_and_set_bit=true
> pir=set
> 
> Then both irr and pir have the interrupt pending, they may merge to one later, but injected reported as true. Wrong.

True. Have a lock around it and at step 1 also verify if PIR is set. That
would do it, yes?

> > On vcpu_enter_guest:
> > 
> > if (kvm_check_request(KVM_REQ_EVENT))  {
> > 	pir->virr sync			(*)
> > 	...
> > }
> > ...
> > vcpu->mode = IN_GUEST_MODE;
> > local_irq_disable
> > clear pir notification bit unconditionally (*)
> Why clear ON bit here? If there is no pir->irr operation in this vmentry, clear on here is redundant. 

A PIR IPI must not be lost. Because if its lost, then interrupt
injection can be delayed while it must be performed immediately.

vcpu entry path has:
1. vcpu->mode = GUEST_MODE
2. local_irq_disable

injection path has:
1. if (vcpu->guest_mode && test_and_set_bit(PIR notif bit))
	send IPI

So it is possible that a PIR IPI is delivered and handled before guest
entry.

By clearing PIR notification bit after local_irq_disable, but before
the last check for vcpu->requests, we guarantee that a PIR IPI is never
lost.

Makes sense? (please check the logic, it might be flawed).

> > 
> > Please confirm whether a spinlock is necessary with the pseudocode above.
> In current implementation, spinlock is necessary to avoid race condition between vcpus when delivery posted interrupt and sync pir->irr.

Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
need for the spinlock there.

About serializing concurrent injectors, yes, you're right. OK then.

Please use the pseucode with a spinlock (the order of setting
KVM_REQ_EVENT etc must be there).

> >> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
> >> +{
> >> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +	unsigned long flags;
> >> +
> >> +	if (!vmx_vm_has_apicv(vcpu->kvm))
> >> +		return;
> >> +
> >> +	spin_lock_irqsave(&vmx->pi_lock, flags);
> > 
> > 
> >> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
> > 
> > This needs to move to vcpu_enter_guest, after local_irq_disabled,
> > unconditionally, as noted.
> > 
> >> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 				   
> >>  struct kvm_lapic_state *s) { +	kvm_x86_ops->sync_pir_to_irr(vcpu);
> >>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
> >>  
> >>  	return 0;
> >> --
> >> 1.7.1
> >
> 
> 
> Best regards,
> Yang
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 23, 2013, 2:48 p.m. UTC | #5
On Sat, Feb 23, 2013 at 04:35:30PM +0200, Gleb Natapov wrote:
> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> > >> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
> > >> +}
> > >> +
> > >> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
> > >> +{
> > >> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> > >> +}
> > > 
> > > This must be locked atomic operation (set_bit).
> > If using spinlock, pi_set_pir is not called concurrently. It safe to use _set_bit.
> > 
> HW still access it concurrently without bothering taking your lock.
> 
> >  
> > >> +
> > >>  struct vcpu_vmx {
> > >>  	struct kvm_vcpu       vcpu;
> > >>  	unsigned long         host_rsp;
> > >> @@ -429,6 +465,10 @@ struct vcpu_vmx {
> > >> 
> > >>  	bool rdtscp_enabled;
> > >> +	/* Posted interrupt descriptor */
> > >> +	struct pi_desc pi_desc;
> > >> +	spinlock_t pi_lock;
> > > 
> > > Don't see why the lock is necessary. Please use the following
> > > pseudocode for vmx_deliver_posted_interrupt:
> > > 
> > > vmx_deliver_posted_interrupt(), returns 'bool injected'.
> > > 
> > > 1. orig_irr = read irr from vapic page
> > > 2. if (orig_irr != 0)
> > > 3.	return false;
> > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > 5. bool injected = !test_and_set_bit(PIR)
> > > 6. if (vcpu->guest_mode && injected)
> > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > 8.		send PIR IPI
> > > 9. return injected
> > 
> > Consider follow case:
> > vcpu 0                      |                vcpu1
> > send intr to vcpu1
> > check irr
> >                                             receive a posted intr
> > 										pir->irr(pir is cleared, irr is set)
> > injected=test_and_set_bit=true
> > pir=set
> > 
> > Then both irr and pir have the interrupt pending, they may merge to one later, but injected reported as true. Wrong.
> > 
> I and Marcelo discussed the lockless logic that should be used here on
> the previous patch submission. All is left for you is to implement it.
> We worked hard to make irq injection path lockless, we will not going to
> add one now.

He is right, the scheme is still flawed (because of concurrent injectors
along with HW in VMX non-root). I'd said lets add a spinlock think about
lockless scheme in the meantime.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 23, 2013, 3:16 p.m. UTC | #6
Marcelo Tosatti wrote on 2013-02-23:
> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
>> Marcelo Tosatti wrote on 2013-02-23:
>>> 
>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>> 
>>>> Posted Interrupt allows APIC interrupts to inject into guest directly
>>>> without any vmexit.
>>>> 
>>>> - When delivering a interrupt to guest, if target vcpu is running,
>>>>   update Posted-interrupt requests bitmap and send a notification
>>>>   event to the vcpu. Then the vcpu will handle this interrupt
>>>>   automatically, without any software involvemnt. - If target vcpu is
>>>>   not running or there already a notification event pending in the
>>>>   vcpu, do nothing. The interrupt will be handled by next vm entry
>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>> ---
>>>> 
>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>>>> index e4595f1..64616cc 100644
>>>> --- a/arch/x86/kernel/irq.c
>>>> +++ b/arch/x86/kernel/irq.c
>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>>>>  	set_irq_regs(old_regs);
>>>>  }
>>>> +#ifdef CONFIG_HAVE_KVM
>>>> +/*
>>>> + * Handler for POSTED_INTERRUPT_VECTOR.
>>>> + */
>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>>>> +{
>>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
>>>> +
>>>> +	ack_APIC_irq();
>>>> +
>>>> +	irq_enter();
>>>> +
>>>> +	exit_idle();
>>>> +
>>>> +	irq_exit();
>>>> +
>>>> +	set_irq_regs(old_regs);
>>>> +}
>>>> +#endif
>>>> +
>>> 
>>> Add per-cpu counters, similarly to other handlers in the same file.
>> Sure.
>> 
>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
>>> *apic)
>>>>  	if (!apic->irr_pending) 		return -1;
>>>>  +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu); 	result =
>>>>  apic_search_irr(apic); 	ASSERT(result == -1 || result >= 16);
>>> 
>>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
>>> before inject_pending_event.
>>> 
>>>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
>>> 		->
>>> 		inject_pending_event
>>> 		...
>>> 	}
>> Some other places will search irr to do something, like
> kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch irr, not just
> before enter guest.
> 
> I see. The only call site that needs IRR/PIR information with posted
> interrupt enabled is kvm_arch_vcpu_runnable, correct?
Yes.

> If so, can we contain reading PIR to only that location?
Yes, we can do it. 
Actually, why I do pir->irr here is to avoid the wrong usage in future of check pending interrupt just by call kvm_vcpu_has_interrupt().Also, there may need an extra callback to check pir.
So you think your suggestions is better?

> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
> standard way of event processing and also reduce reading the PIR).
Since we will check ON bit before each reading PIR, the cost can be ignored.

>>>> +{
>>>> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
>>>> +}
>>> 
>>> This must be locked atomic operation (set_bit).
>> If using spinlock, pi_set_pir is not called concurrently. It safe to use _set_bit.
> 
> The manual demands atomic locked accesses (remember this a remote
> access). See the posted interrupt page.
You are right.

>>> 
>>> Don't see why the lock is necessary. Please use the following
>>> pseudocode for vmx_deliver_posted_interrupt:
>>> 
>>> vmx_deliver_posted_interrupt(), returns 'bool injected'.
>>> 
>>> 1. orig_irr = read irr from vapic page
>>> 2. if (orig_irr != 0)
>>> 3.	return false;
>>> 4. kvm_make_request(KVM_REQ_EVENT)
>>> 5. bool injected = !test_and_set_bit(PIR)
>>> 6. if (vcpu->guest_mode && injected)
>>> 7. 	if (test_and_set_bit(PIR notification bit))
>>> 8.		send PIR IPI
>>> 9. return injected
>> 
>> Consider follow case:
>> vcpu 0                      |                vcpu1
>> send intr to vcpu1
>> check irr
>>                                             receive a posted intr
>> 					      pir->irr(pir is cleared, irr is set)
>> injected=test_and_set_bit=true
>> pir=set
>> 
>> Then both irr and pir have the interrupt pending, they may merge to one later,
> but injected reported as true. Wrong.
> 
> True. Have a lock around it and at step 1 also verify if PIR is set. That
> would do it, yes?
Yes.
 
>>> On vcpu_enter_guest:
>>> 
>>> if (kvm_check_request(KVM_REQ_EVENT))  {
>>> 	pir->virr sync			(*)
>>> 	...
>>> }
>>> ...
>>> vcpu->mode = IN_GUEST_MODE;
>>> local_irq_disable
>>> clear pir notification bit unconditionally (*)
>> Why clear ON bit here? If there is no pir->irr operation in this vmentry, clear on
> here is redundant.
> 
> A PIR IPI must not be lost. Because if its lost, then interrupt
> injection can be delayed while it must be performed immediately.
> 
> vcpu entry path has:
> 1. vcpu->mode = GUEST_MODE
> 2. local_irq_disable
> 
> injection path has:
> 1. if (vcpu->guest_mode && test_and_set_bit(PIR notif bit))
> 	send IPI
> 
> So it is possible that a PIR IPI is delivered and handled before guest
> entry.
> 
> By clearing PIR notification bit after local_irq_disable, but before the
> last check for vcpu->requests, we guarantee that a PIR IPI is never lost.
> 
> Makes sense? (please check the logic, it might be flawed).
I am not sure whether I understand you. But I don't think the IPI will lost in current patch:

if (!pi_test_and_set_on(&vmx->pi_desc) && (vcpu->mode == IN_GUEST_MODE)) {
                kvm_make_request(KVM_REQ_EVENT, vcpu);
                apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
                                    POSTED_INTR_VECTOR);
                *delivered = true;
}

vcpu entry has: 
vcpu->mode = GUEST_MODE
local irq disable
check request

We will send the IPI after making request, if IPI is received after set guest_mode before disable irq, then it still will be handled by the following check request.
Please correct me if I am wrong.
 
>>> 
>>> Please confirm whether a spinlock is necessary with the pseudocode above.
>> In current implementation, spinlock is necessary to avoid race condition
> between vcpus when delivery posted interrupt and sync pir->irr.
> 
> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
> need for the spinlock there.
In function sync_pir_to_irr():
tmp_pir=xchg(pir,0)
xchg(tmp_pir, irr)

It is not atomically, the lock still is needed.
 
> About serializing concurrent injectors, yes, you're right. OK then.
> 
> Please use the pseucode with a spinlock (the order of setting
> KVM_REQ_EVENT etc must be there).
> 
>>>> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>> +	unsigned long flags;
>>>> +
>>>> +	if (!vmx_vm_has_apicv(vcpu->kvm))
>>>> +		return;
>>>> +
>>>> +	spin_lock_irqsave(&vmx->pi_lock, flags);
>>> 
>>> 
>>>> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
>>> 
>>> This needs to move to vcpu_enter_guest, after local_irq_disabled,
>>> unconditionally, as noted.
>>> 
>>>> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
>>>>  
>>>>  struct kvm_lapic_state *s) { +	kvm_x86_ops->sync_pir_to_irr(vcpu);
>>>>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>>>>  
>>>>  	return 0;
>>>> --
>>>> 1.7.1
>>> 
>> 
>> 
>> Best regards,
>> Yang
>> 
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 23, 2013, 3:31 p.m. UTC | #7
On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
> > > > 1. orig_irr = read irr from vapic page
> > > > 2. if (orig_irr != 0)
> > > > 3.	return false;
> > > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > > 5. bool injected = !test_and_set_bit(PIR)
> > > > 6. if (vcpu->guest_mode && injected)
> > > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > > 8.		send PIR IPI
> > > > 9. return injected
> > > 
> > > Consider follow case:
> > > vcpu 0                      |                vcpu1
> > > send intr to vcpu1
> > > check irr
> > >                                             receive a posted intr
> > > 										pir->irr(pir is cleared, irr is set)
> > > injected=test_and_set_bit=true
> > > pir=set
> > > 
> > > Then both irr and pir have the interrupt pending, they may merge to one later, but injected reported as true. Wrong.
> > > 
> > I and Marcelo discussed the lockless logic that should be used here on
> > the previous patch submission. All is left for you is to implement it.
> > We worked hard to make irq injection path lockless, we will not going to
> > add one now.
> 
> He is right, the scheme is still flawed (because of concurrent injectors
> along with HW in VMX non-root). I'd said lets add a spinlock think about
> lockless scheme in the meantime.
The logic proposed was (from that thread):
 apic_accept_interrupt() {
  if (PIR || IRR)
    return coalesced;
  else
    set PIR;
 }

Which should map to something like:
if (!test_and_set_bit(PIR))
	return coalesced;
if (irr on vapic page is set)
        return coalesced;

I do not see how the race above can happen this way. Other can though if
vcpu is outside a guest. My be we should deliver interrupt differently
depending on whether vcpu is in guest or not.

I'd rather think about proper way to do lockless injection before
committing anything. The case where we care about correct injection
status is rare, but we always care about scalability and since we
violate the spec by reading vapic page while vcpu is in non-root
operation anyway the result may be incorrect with or without the lock.
Our use can was not in HW designers mind when they designed this thing
obviously :(

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 23, 2013, 4:50 p.m. UTC | #8
On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote:
> Marcelo Tosatti wrote on 2013-02-23:
> > On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> >> Marcelo Tosatti wrote on 2013-02-23:
> >>> 
> >>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
> >>>> From: Yang Zhang <yang.z.zhang@Intel.com>
> >>>> 
> >>>> Posted Interrupt allows APIC interrupts to inject into guest directly
> >>>> without any vmexit.
> >>>> 
> >>>> - When delivering a interrupt to guest, if target vcpu is running,
> >>>>   update Posted-interrupt requests bitmap and send a notification
> >>>>   event to the vcpu. Then the vcpu will handle this interrupt
> >>>>   automatically, without any software involvemnt. - If target vcpu is
> >>>>   not running or there already a notification event pending in the
> >>>>   vcpu, do nothing. The interrupt will be handled by next vm entry
> >>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >>>> ---
> >>>> 
> >>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> >>>> index e4595f1..64616cc 100644
> >>>> --- a/arch/x86/kernel/irq.c
> >>>> +++ b/arch/x86/kernel/irq.c
> >>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
> >>>>  	set_irq_regs(old_regs);
> >>>>  }
> >>>> +#ifdef CONFIG_HAVE_KVM
> >>>> +/*
> >>>> + * Handler for POSTED_INTERRUPT_VECTOR.
> >>>> + */
> >>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
> >>>> +{
> >>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
> >>>> +
> >>>> +	ack_APIC_irq();
> >>>> +
> >>>> +	irq_enter();
> >>>> +
> >>>> +	exit_idle();
> >>>> +
> >>>> +	irq_exit();
> >>>> +
> >>>> +	set_irq_regs(old_regs);
> >>>> +}
> >>>> +#endif
> >>>> +
> >>> 
> >>> Add per-cpu counters, similarly to other handlers in the same file.
> >> Sure.
> >> 
> >>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
> >>> *apic)
> >>>>  	if (!apic->irr_pending) 		return -1;
> >>>>  +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu); 	result =
> >>>>  apic_search_irr(apic); 	ASSERT(result == -1 || result >= 16);
> >>> 
> >>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
> >>> before inject_pending_event.
> >>> 
> >>>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> >>> 		->
> >>> 		inject_pending_event
> >>> 		...
> >>> 	}
> >> Some other places will search irr to do something, like
> > kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch irr, not just
> > before enter guest.
> > 
> > I see. The only call site that needs IRR/PIR information with posted
> > interrupt enabled is kvm_arch_vcpu_runnable, correct?
> Yes.
> 
> > If so, can we contain reading PIR to only that location?
> Yes, we can do it. 
> Actually, why I do pir->irr here is to avoid the wrong usage in future of check pending interrupt just by
> call kvm_vcpu_has_interrupt().

Yes, i see.

> Also, there may need an extra callback to check pir.
> So you think your suggestions is better?

Because it respects standard request processing mechanism, yes.

> > And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
> > standard way of event processing and also reduce reading the PIR).
> Since we will check ON bit before each reading PIR, the cost can be
> ignored.

Note reading ON bit is not OK, because if injection path did not see
vcpu->mode == IN_GUEST_MODE, ON bit will not be set.

> > 
> > So it is possible that a PIR IPI is delivered and handled before guest
> > entry.
> > 
> > By clearing PIR notification bit after local_irq_disable, but before the
> > last check for vcpu->requests, we guarantee that a PIR IPI is never lost.
> > 
> > Makes sense? (please check the logic, it might be flawed).
> I am not sure whether I understand you. But I don't think the IPI will lost in current patch:
> 
> if (!pi_test_and_set_on(&vmx->pi_desc) && (vcpu->mode == IN_GUEST_MODE)) {
>                 kvm_make_request(KVM_REQ_EVENT, vcpu);
>                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>                                     POSTED_INTR_VECTOR);
>                 *delivered = true;
> }
> 
> vcpu entry has: 
> vcpu->mode = GUEST_MODE
> local irq disable
> check request
> 
> We will send the IPI after making request, if IPI is received after set guest_mode before disable irq, then it still will be handled by the following check request.
> Please correct me if I am wrong.

cpu0					cpu1		vcpu0
test_and_set_bit(PIR-A)					
set KVM_REQ_EVENT					
							process REQ_EVENT
							PIR-A->IRR
					
							vcpu->mode=IN_GUEST
					
if (vcpu0->guest_mode)
	if (!t_a_s_bit(PIR notif))			
		send IPI
							linux_pir_handler

					t_a_s_b(PIR-B)=1
					no PIR IPI sent

> > 
> > Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
> > need for the spinlock there.
> In function sync_pir_to_irr():
> tmp_pir=xchg(pir,0)
> xchg(tmp_pir, irr)
> 
> It is not atomically, the lock still is needed.

IRR is only accessed by local vcpu context, only PIR is accessed concurrently,
therefore only PIR accesses must be atomic. xchg instruction is 
locked and atomic.

+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+       u32 i, pir_val;
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       for (i = 0; i <= 7; i++) {
+               pir_val = xchg(&pir[i], 0);
+               if (pir_val)
+                       *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);

Right?


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 23, 2013, 5:05 p.m. UTC | #9
On Sat, Feb 23, 2013 at 05:31:44PM +0200, Gleb Natapov wrote:
> On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
> > > > > 1. orig_irr = read irr from vapic page
> > > > > 2. if (orig_irr != 0)
> > > > > 3.	return false;
> > > > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > > > 5. bool injected = !test_and_set_bit(PIR)
> > > > > 6. if (vcpu->guest_mode && injected)
> > > > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > > > 8.		send PIR IPI
> > > > > 9. return injected
> > > > 
> > > > Consider follow case:
> > > > vcpu 0                      |                vcpu1
> > > > send intr to vcpu1
> > > > check irr
> > > >                                             receive a posted intr
> > > > 										pir->irr(pir is cleared, irr is set)
> > > > injected=test_and_set_bit=true
> > > > pir=set
> > > > 
> > > > Then both irr and pir have the interrupt pending, they may merge to one later, but injected reported as true. Wrong.
> > > > 
> > > I and Marcelo discussed the lockless logic that should be used here on
> > > the previous patch submission. All is left for you is to implement it.
> > > We worked hard to make irq injection path lockless, we will not going to
> > > add one now.
> > 
> > He is right, the scheme is still flawed (because of concurrent injectors
> > along with HW in VMX non-root). I'd said lets add a spinlock think about
> > lockless scheme in the meantime.
> The logic proposed was (from that thread):
>  apic_accept_interrupt() {
>   if (PIR || IRR)
>     return coalesced;
>   else
>     set PIR;
>  }
> 
> Which should map to something like:
> if (!test_and_set_bit(PIR))
> 	return coalesced;

HW transfers PIR to IRR, here. Say due to PIR IPI sent 
due to setting of a different vector.

> if (irr on vapic page is set)
>         return coalesced;

> 
> I do not see how the race above can happen this way. Other can though if
> vcpu is outside a guest. My be we should deliver interrupt differently
> depending on whether vcpu is in guest or not.

Problem is with 3 contexes: two injectors and one vcpu in guest
mode. Earlier on that thread you mentioned

"The point is that we need to check PIR and IRR atomically and this is
impossible."

That would be one way to fix it.

> I'd rather think about proper way to do lockless injection before
> committing anything. The case where we care about correct injection
> status is rare, but we always care about scalability and since we
> violate the spec by reading vapic page while vcpu is in non-root
> operation anyway the result may be incorrect with or without the lock.
> Our use can was not in HW designers mind when they designed this thing
> obviously :(

Zhang, can you comment on whether reading vapic page with CPU in
VMX-non root accessing it is safe?

Gleb, yes, a comment mentioning the race (instead of the spinlock) and
explanation why its believed to be benign (given how the injection
return value is interpreted) could also work. Its ugly, though... murphy
is around.

OTOH spinlock is not the end of the world, can figure out something later 
(we've tried without success so far).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 23, 2013, 7:42 p.m. UTC | #10
On Sat, Feb 23, 2013 at 02:05:13PM -0300, Marcelo Tosatti wrote:
> On Sat, Feb 23, 2013 at 05:31:44PM +0200, Gleb Natapov wrote:
> > On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
> > > > > > 1. orig_irr = read irr from vapic page
> > > > > > 2. if (orig_irr != 0)
> > > > > > 3.	return false;
> > > > > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > > > > 5. bool injected = !test_and_set_bit(PIR)
> > > > > > 6. if (vcpu->guest_mode && injected)
> > > > > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > > > > 8.		send PIR IPI
> > > > > > 9. return injected
> > > > > 
> > > > > Consider follow case:
> > > > > vcpu 0                      |                vcpu1
> > > > > send intr to vcpu1
> > > > > check irr
> > > > >                                             receive a posted intr
> > > > > 										pir->irr(pir is cleared, irr is set)
> > > > > injected=test_and_set_bit=true
> > > > > pir=set
> > > > > 
> > > > > Then both irr and pir have the interrupt pending, they may merge to one later, but injected reported as true. Wrong.
> > > > > 
> > > > I and Marcelo discussed the lockless logic that should be used here on
> > > > the previous patch submission. All is left for you is to implement it.
> > > > We worked hard to make irq injection path lockless, we will not going to
> > > > add one now.
> > > 
> > > He is right, the scheme is still flawed (because of concurrent injectors
> > > along with HW in VMX non-root). I'd said lets add a spinlock think about
> > > lockless scheme in the meantime.
> > The logic proposed was (from that thread):
> >  apic_accept_interrupt() {
> >   if (PIR || IRR)
> >     return coalesced;
> >   else
> >     set PIR;
> >  }
> > 
> > Which should map to something like:
> > if (!test_and_set_bit(PIR))
> > 	return coalesced;
> 
> HW transfers PIR to IRR, here. Say due to PIR IPI sent 
> due to setting of a different vector.
> 
Hmm, yes. Haven't thought about different vector :(

> > if (irr on vapic page is set)
> >         return coalesced;
> 
> > 
> > I do not see how the race above can happen this way. Other can though if
> > vcpu is outside a guest. My be we should deliver interrupt differently
> > depending on whether vcpu is in guest or not.
> 
> Problem is with 3 contexes: two injectors and one vcpu in guest
> mode. Earlier on that thread you mentioned
> 
> "The point is that we need to check PIR and IRR atomically and this is
> impossible."
> 
> That would be one way to fix it.
> 
I do not think it fixes it. There is no guaranty that IPI will be
processed by remote cpu while sending cpu is still in locked section, so
the same race may happen regardless. As you say above there are 3
contexts, but only two use locks.

> > I'd rather think about proper way to do lockless injection before
> > committing anything. The case where we care about correct injection
> > status is rare, but we always care about scalability and since we
> > violate the spec by reading vapic page while vcpu is in non-root
> > operation anyway the result may be incorrect with or without the lock.
> > Our use can was not in HW designers mind when they designed this thing
> > obviously :(
> 
> Zhang, can you comment on whether reading vapic page with CPU in
> VMX-non root accessing it is safe?
> 
> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
> explanation why its believed to be benign (given how the injection
> return value is interpreted) could also work. Its ugly, though... murphy
> is around.
The race above is not benign. It will report interrupt as coalesced
while in reality it is injected. This may cause to many interrupt to be
injected. If this happens rare enough ntp may be able to fix time drift
resulted from this.

> 
> OTOH spinlock is not the end of the world, can figure out something later 
> (we've tried without success so far).
It serializes all injections into vcpu. I do not believe now that even
with lock we are safe for the reason I mention above. We can use pir->on
bit as a lock, but that only emphasise how ridiculous serialization of
injections becomes.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 23, 2013, 7:52 p.m. UTC | #11
On Sat, Feb 23, 2013 at 09:42:14PM +0200, Gleb Natapov wrote:
> > explanation why its believed to be benign (given how the injection
> > return value is interpreted) could also work. Its ugly, though... murphy
> > is around.
> The race above is not benign. It will report interrupt as coalesced
> while in reality it is injected. This may cause to many interrupt to be
> injected. If this happens rare enough ntp may be able to fix time drift
> resulted from this.

OK.

> > OTOH spinlock is not the end of the world, can figure out something later 
> > (we've tried without success so far).
> It serializes all injections into vcpu. I do not believe now that even
> with lock we are safe for the reason I mention above. We can use pir->on
> bit as a lock, but that only emphasise how ridiculous serialization of
> injections becomes.

Please review the 2nd iteration of pseudocode in patchset v4 thread, it
should be good.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 23, 2013, 7:59 p.m. UTC | #12
On Sat, Feb 23, 2013 at 04:52:59PM -0300, Marcelo Tosatti wrote:
> > > OTOH spinlock is not the end of the world, can figure out something later 
> > > (we've tried without success so far).
> > It serializes all injections into vcpu. I do not believe now that even
> > with lock we are safe for the reason I mention above. We can use pir->on
> > bit as a lock, but that only emphasise how ridiculous serialization of
> > injections becomes.
> 
> Please review the 2nd iteration of pseudocode in patchset v4 thread, it
> should be good.
Can you point me to what exactly I should look at? All I see is a
discussion on how to no miss an IPI, but this is not the problem I am talking
about here.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 24, 2013, 1:17 p.m. UTC | #13
Marcelo Tosatti wrote on 2013-02-24:
> On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote:
>> Marcelo Tosatti wrote on 2013-02-23:
>>> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
>>>> Marcelo Tosatti wrote on 2013-02-23:
>>>>> 
>>>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
>>>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>> 
>>>>>> Posted Interrupt allows APIC interrupts to inject into guest directly
>>>>>> without any vmexit.
>>>>>> 
>>>>>> - When delivering a interrupt to guest, if target vcpu is running,
>>>>>>   update Posted-interrupt requests bitmap and send a notification
>>>>>>   event to the vcpu. Then the vcpu will handle this interrupt
>>>>>>   automatically, without any software involvemnt. - If target vcpu is
>>>>>>   not running or there already a notification event pending in the
>>>>>>   vcpu, do nothing. The interrupt will be handled by next vm entry
>>>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>> ---
>>>>>> 
>>>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>>>>>> index e4595f1..64616cc 100644
>>>>>> --- a/arch/x86/kernel/irq.c
>>>>>> +++ b/arch/x86/kernel/irq.c
>>>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>>>>>>  	set_irq_regs(old_regs);
>>>>>>  }
>>>>>> +#ifdef CONFIG_HAVE_KVM
>>>>>> +/*
>>>>>> + * Handler for POSTED_INTERRUPT_VECTOR.
>>>>>> + */
>>>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>>>>>> +{
>>>>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
>>>>>> +
>>>>>> +	ack_APIC_irq();
>>>>>> +
>>>>>> +	irq_enter();
>>>>>> +
>>>>>> +	exit_idle();
>>>>>> +
>>>>>> +	irq_exit();
>>>>>> +
>>>>>> +	set_irq_regs(old_regs);
>>>>>> +}
>>>>>> +#endif
>>>>>> +
>>>>> 
>>>>> Add per-cpu counters, similarly to other handlers in the same file.
>>>> Sure.
>>>> 
>>>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct
> kvm_lapic
>>>>> *apic)
>>>>>>  	if (!apic->irr_pending) 		return -1;
>>>>>>  +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu); 	result =
>>>>>>  apic_search_irr(apic); 	ASSERT(result == -1 || result >= 16);
>>>>> 
>>>>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
>>>>> before inject_pending_event.
>>>>> 
>>>>>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win)
> {
>>>>> 		->
>>>>> 		inject_pending_event
>>>>> 		...
>>>>> 	}
>>>> Some other places will search irr to do something, like
>>> kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch
>>> irr, not just before enter guest.
>>> 
>>> I see. The only call site that needs IRR/PIR information with posted
>>> interrupt enabled is kvm_arch_vcpu_runnable, correct?
>> Yes.
>> 
>>> If so, can we contain reading PIR to only that location?
>> Yes, we can do it. Actually, why I do pir->irr here is to avoid the
>> wrong usage in future of check pending interrupt just by call
>> kvm_vcpu_has_interrupt().
> 
> Yes, i see.
> 
>> Also, there may need an extra callback to check pir.
>> So you think your suggestions is better?
> 
> Because it respects standard request processing mechanism, yes.
Sure. Will change it in next patch.
 
>>> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
>>> standard way of event processing and also reduce reading the PIR).
>> Since we will check ON bit before each reading PIR, the cost can be
>> ignored.
> 
> Note reading ON bit is not OK, because if injection path did not see
> vcpu->mode == IN_GUEST_MODE, ON bit will not be set.
In my patch, It will set ON bit before check vcpu->mode. So it's ok.
Actually, I think the ON bit must be set unconditionally after change PIR regardless of vcpu->mode.
 
>>> 
>>> So it is possible that a PIR IPI is delivered and handled before guest
>>> entry.
>>> 
>>> By clearing PIR notification bit after local_irq_disable, but before the
>>> last check for vcpu->requests, we guarantee that a PIR IPI is never lost.
>>> 
>>> Makes sense? (please check the logic, it might be flawed).
>> I am not sure whether I understand you. But I don't think the IPI will
>> lost in current patch:
>> 
>> if (!pi_test_and_set_on(&vmx->pi_desc) && (vcpu->mode ==
> IN_GUEST_MODE)) {
>>                 kvm_make_request(KVM_REQ_EVENT, vcpu);
>>                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>>                                     POSTED_INTR_VECTOR);
>>                 *delivered = true;
>> }
>> 
>> vcpu entry has:
>> vcpu->mode = GUEST_MODE
>> local irq disable
>> check request
>> 
>> We will send the IPI after making request, if IPI is received after set
> guest_mode before disable irq, then it still will be handled by the following check
> request.
>> Please correct me if I am wrong.
> 
> cpu0					cpu1		vcpu0
> test_and_set_bit(PIR-A)
> set KVM_REQ_EVENT
> 							process REQ_EVENT
> 							PIR-A->IRR
> 
> 							vcpu->mode=IN_GUEST
> 
> if (vcpu0->guest_mode)
> 	if (!t_a_s_bit(PIR notif))
> 		send IPI
> 							linux_pir_handler
> 
> 					t_a_s_b(PIR-B)=1
> 					no PIR IPI sent
No, this exists with your implementation not mine.
As I said, in this patch, it will make request after vcpu ==guest mode, then send ipi. Even the ipi is lost, but the request still will be tracked. Because we have the follow check:
vcpu->mode = GUEST_MODE
(ipi may arrived here and lost)
local irq disable
check request (this will ensure the pir modification will be picked up by vcpu before vmentry)

>>> 
>>> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
>>> need for the spinlock there.
>> In function sync_pir_to_irr():
>> tmp_pir=xchg(pir,0)
>> xchg(tmp_pir, irr)
>> 
>> It is not atomically, the lock still is needed.
> 
> IRR is only accessed by local vcpu context, only PIR is accessed concurrently,
> therefore only PIR accesses must be atomic. xchg instruction is
> locked and atomic.
There has same problem as we discussed before. Consider this:
Before delivery poste ipi, the irr is 0. then:

cpu0                           cpu1  vcpu0
delivery_posted_intr()
                               sync_pir_toirr():
                                  tmp_pir=xchg(pir,0)(pir is cleared)
check pir(pass)
check irr(pass)
							   xchg(tmp_pir, irr)
injected= true
set pir  

Same problem: both pir and irr is set, but it reported as injected. Still need the lock to protect it.                                

> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ +      
> u32 i, pir_val; +       struct kvm_lapic *apic = vcpu->arch.apic; + +   
>    for (i = 0; i <= 7; i++) { +               pir_val = xchg(&pir[i],
> 0); +               if (pir_val) +                       *((u32
> *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; +       } +}
> +EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
> 
> Right?
>


Best regards,
Yang

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 24, 2013, 1:55 p.m. UTC | #14
Gleb Natapov wrote on 2013-02-24:
> On Sat, Feb 23, 2013 at 02:05:13PM -0300, Marcelo Tosatti wrote:
>> On Sat, Feb 23, 2013 at 05:31:44PM +0200, Gleb Natapov wrote:
>>> On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
>>>>>>> 1. orig_irr = read irr from vapic page
>>>>>>> 2. if (orig_irr != 0)
>>>>>>> 3.	return false;
>>>>>>> 4. kvm_make_request(KVM_REQ_EVENT)
>>>>>>> 5. bool injected = !test_and_set_bit(PIR)
>>>>>>> 6. if (vcpu->guest_mode && injected)
>>>>>>> 7. 	if (test_and_set_bit(PIR notification bit))
>>>>>>> 8.		send PIR IPI
>>>>>>> 9. return injected
>>>>>> 
>>>>>> Consider follow case:
>>>>>> vcpu 0                      |                vcpu1
>>>>>> send intr to vcpu1
>>>>>> check irr
>>>>>>                                             receive a posted intr
>>>>>> 										pir->irr(pir is cleared, irr is set)
>>>>>> injected=test_and_set_bit=true pir=set
>>>>>> 
>>>>>> Then both irr and pir have the interrupt pending, they may merge to
>>>>>> one later, but injected reported as true. Wrong.
>>>>>> 
>>>>> I and Marcelo discussed the lockless logic that should be used here on
>>>>> the previous patch submission. All is left for you is to implement it.
>>>>> We worked hard to make irq injection path lockless, we will not going to
>>>>> add one now.
>>>> 
>>>  He is right, the scheme is still flawed (because of concurrent
>>>  injectors along with HW in VMX non-root). I'd said lets add a
>>>  spinlock think about lockless scheme in the meantime. The logic
>>>  proposed was (from that thread): apic_accept_interrupt() {
>>>   if (PIR || IRR)
>>>     return coalesced; else set PIR;
>>>  }
>>> Which should map to something like:
>>> if (!test_and_set_bit(PIR))
>>> 	return coalesced;
>> 
>> HW transfers PIR to IRR, here. Say due to PIR IPI sent
>> due to setting of a different vector.
>> 
> Hmm, yes. Haven't thought about different vector :(
> 
>>> if (irr on vapic page is set)
>>>         return coalesced;
>> 
>>> 
>>> I do not see how the race above can happen this way. Other can though if
>>> vcpu is outside a guest. My be we should deliver interrupt differently
>>> depending on whether vcpu is in guest or not.
>> 
>> Problem is with 3 contexes: two injectors and one vcpu in guest
>> mode. Earlier on that thread you mentioned
>> 
>> "The point is that we need to check PIR and IRR atomically and this is
>> impossible."
>> 
>> That would be one way to fix it.
>> 
> I do not think it fixes it. There is no guaranty that IPI will be
> processed by remote cpu while sending cpu is still in locked section, so
> the same race may happen regardless. As you say above there are 3
> contexts, but only two use locks.
See following logic, I think the problem you mentioned will not happened with current logic.

get lock
if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for same interrupt.)
   return coalesced
if test_irr
   return coalesced
set_pir
injected=true
if t_a_s(on) && in guest mode
   send ipi
unlock

 
>>> I'd rather think about proper way to do lockless injection before
>>> committing anything. The case where we care about correct injection
>>> status is rare, but we always care about scalability and since we
>>> violate the spec by reading vapic page while vcpu is in non-root
>>> operation anyway the result may be incorrect with or without the lock.
>>> Our use can was not in HW designers mind when they designed this thing
>>> obviously :(
>> 
>> Zhang, can you comment on whether reading vapic page with CPU in
>> VMX-non root accessing it is safe?
See 24.11.4
In addition to data in the VMCS region itself, VMX non-root operation can be controlled by data structures that are
referenced by pointers in a VMCS (for example, the I/O bitmaps). While the pointers to these data structures are
parts of the VMCS, the data structures themselves are not. They are not accessible using VMREAD and VMWRITE
but by ordinary memory writes.
Software should ensure that each such data structure is modified only when no logical processor with a current
VMCS that references it is in VMX non-root operation. Doing otherwise may lead to unpredictable behavior.

This means the initial design of KVM is wrong. It should not to modify vIRR directly.

The good thing is that reading is ok.

>> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
>> explanation why its believed to be benign (given how the injection
>> return value is interpreted) could also work. Its ugly, though... murphy
>> is around.
> The race above is not benign. It will report interrupt as coalesced
> while in reality it is injected. This may cause to many interrupt to be
> injected. If this happens rare enough ntp may be able to fix time drift
> resulted from this.
Please check the above logic. Does it have same problem? If yes, please point out.

>> 
>> OTOH spinlock is not the end of the world, can figure out something later
>> (we've tried without success so far).
> It serializes all injections into vcpu. I do not believe now that even
> with lock we are safe for the reason I mention above. We can use pir->on
> bit as a lock, but that only emphasise how ridiculous serialization of
> injections becomes.
> 
> --
> 			Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 24, 2013, 2:19 p.m. UTC | #15
On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
> > I do not think it fixes it. There is no guaranty that IPI will be
> > processed by remote cpu while sending cpu is still in locked section, so
> > the same race may happen regardless. As you say above there are 3
> > contexts, but only two use locks.
> See following logic, I think the problem you mentioned will not happened with current logic.
> 
> get lock
> if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for same interrupt.)
I do not see where those assumptions are coming from. Testing pir does
not guaranty that the IPI is not processed by VCPU right now.

iothread:                           vcpu:
send_irq()
lock(pir)
check pir and irr
set pir
send IPI (*)
unlock(pir)

send_irq()
lock(pir)
                                 receive IPI (*)
                                 atomic {
                                   pir_tmp = pir
                                   pir = 0
                                 }
check pir and irr
                                 irr &= pir_tmp
set pir
send IPI
unlock(pir)

At this point both pir and irr are set and interrupt may be coalesced,
but it is reported as delivered.

So what prevents the scenario above from happening?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 24, 2013, 2:26 p.m. UTC | #16
Gleb Natapov wrote on 2013-02-24:
> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>>> I do not think it fixes it. There is no guaranty that IPI will be
>>> processed by remote cpu while sending cpu is still in locked section, so
>>> the same race may happen regardless. As you say above there are 3
>>> contexts, but only two use locks.
>> See following logic, I think the problem you mentioned will not
>> happened with current logic.
>> 
>> get lock
>> if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we
> are taking the lock now, no IPI will be sent before we release the lock and no
> pir->irr is performed by hardware for same interrupt.)
> I do not see where those assumptions are coming from. Testing pir does
> not guaranty that the IPI is not processed by VCPU right now.
> 
> iothread:                           vcpu:
> send_irq()
> lock(pir)
> check pir and irr
> set pir
> send IPI (*)
> unlock(pir)
> 
> send_irq()
> lock(pir)
>                                  receive IPI (*)
>                                  atomic {
>                                    pir_tmp = pir
>                                    pir = 0
>                                  } check pir and irr irr &= pir_tmp
> set pir
> send IPI
> unlock(pir)
> 
> At this point both pir and irr are set and interrupt may be coalesced,
> but it is reported as delivered.
> 
> So what prevents the scenario above from happening?
Yes, you are right. For this case, it can do nothing. And we cannot solve this problem in current KVM, right?

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 24, 2013, 2:39 p.m. UTC | #17
On Sun, Feb 24, 2013 at 02:26:36PM +0000, Zhang, Yang Z wrote:
> > I do not see where those assumptions are coming from. Testing pir does
> > not guaranty that the IPI is not processed by VCPU right now.
> > 
> > iothread:                           vcpu:
> > send_irq()
> > lock(pir)
> > check pir and irr
> > set pir
> > send IPI (*)
> > unlock(pir)
> > 
> > send_irq()
> > lock(pir)
> >                                  receive IPI (*)
> >                                  atomic {
> >                                    pir_tmp = pir
> >                                    pir = 0
> >                                  } check pir and irr irr &= pir_tmp
> > set pir
> > send IPI
> > unlock(pir)
> > 
> > At this point both pir and irr are set and interrupt may be coalesced,
> > but it is reported as delivered.
> > 
> > So what prevents the scenario above from happening?
> Yes, you are right. For this case, it can do nothing. And we cannot solve this problem in current KVM, right?
> 
Yes, as far as I can see we cannot reliably implement KVM's 
KVM_IRQ_LINE_STATUS interface with given HW interface. Now we should
think about how to minimize the damage.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 24, 2013, 5:39 p.m. UTC | #18
On Sun, Feb 24, 2013 at 01:17:59PM +0000, Zhang, Yang Z wrote:
> Marcelo Tosatti wrote on 2013-02-24:
> > On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote:
> >> Marcelo Tosatti wrote on 2013-02-23:
> >>> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> >>>> Marcelo Tosatti wrote on 2013-02-23:
> >>>>> 
> >>>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
> >>>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
> >>>>>> 
> >>>>>> Posted Interrupt allows APIC interrupts to inject into guest directly
> >>>>>> without any vmexit.
> >>>>>> 
> >>>>>> - When delivering a interrupt to guest, if target vcpu is running,
> >>>>>>   update Posted-interrupt requests bitmap and send a notification
> >>>>>>   event to the vcpu. Then the vcpu will handle this interrupt
> >>>>>>   automatically, without any software involvemnt. - If target vcpu is
> >>>>>>   not running or there already a notification event pending in the
> >>>>>>   vcpu, do nothing. The interrupt will be handled by next vm entry
> >>>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >>>>>> ---
> >>>>>> 
> >>>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> >>>>>> index e4595f1..64616cc 100644
> >>>>>> --- a/arch/x86/kernel/irq.c
> >>>>>> +++ b/arch/x86/kernel/irq.c
> >>>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
> >>>>>>  	set_irq_regs(old_regs);
> >>>>>>  }
> >>>>>> +#ifdef CONFIG_HAVE_KVM
> >>>>>> +/*
> >>>>>> + * Handler for POSTED_INTERRUPT_VECTOR.
> >>>>>> + */
> >>>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
> >>>>>> +{
> >>>>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
> >>>>>> +
> >>>>>> +	ack_APIC_irq();
> >>>>>> +
> >>>>>> +	irq_enter();
> >>>>>> +
> >>>>>> +	exit_idle();
> >>>>>> +
> >>>>>> +	irq_exit();
> >>>>>> +
> >>>>>> +	set_irq_regs(old_regs);
> >>>>>> +}
> >>>>>> +#endif
> >>>>>> +
> >>>>> 
> >>>>> Add per-cpu counters, similarly to other handlers in the same file.
> >>>> Sure.
> >>>> 
> >>>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct
> > kvm_lapic
> >>>>> *apic)
> >>>>>>  	if (!apic->irr_pending) 		return -1;
> >>>>>>  +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu); 	result =
> >>>>>>  apic_search_irr(apic); 	ASSERT(result == -1 || result >= 16);
> >>>>> 
> >>>>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
> >>>>> before inject_pending_event.
> >>>>> 
> >>>>>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win)
> > {
> >>>>> 		->
> >>>>> 		inject_pending_event
> >>>>> 		...
> >>>>> 	}
> >>>> Some other places will search irr to do something, like
> >>> kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch
> >>> irr, not just before enter guest.
> >>> 
> >>> I see. The only call site that needs IRR/PIR information with posted
> >>> interrupt enabled is kvm_arch_vcpu_runnable, correct?
> >> Yes.
> >> 
> >>> If so, can we contain reading PIR to only that location?
> >> Yes, we can do it. Actually, why I do pir->irr here is to avoid the
> >> wrong usage in future of check pending interrupt just by call
> >> kvm_vcpu_has_interrupt().
> > 
> > Yes, i see.
> > 
> >> Also, there may need an extra callback to check pir.
> >> So you think your suggestions is better?
> > 
> > Because it respects standard request processing mechanism, yes.
> Sure. Will change it in next patch.
>  
> >>> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
> >>> standard way of event processing and also reduce reading the PIR).
> >> Since we will check ON bit before each reading PIR, the cost can be
> >> ignored.
> > 
> > Note reading ON bit is not OK, because if injection path did not see
> > vcpu->mode == IN_GUEST_MODE, ON bit will not be set.
> In my patch, It will set ON bit before check vcpu->mode. So it's ok.
> Actually, I think the ON bit must be set unconditionally after change PIR regardless of vcpu->mode.
>  
> >>> 
> >>> So it is possible that a PIR IPI is delivered and handled before guest
> >>> entry.
> >>> 
> >>> By clearing PIR notification bit after local_irq_disable, but before the
> >>> last check for vcpu->requests, we guarantee that a PIR IPI is never lost.
> >>> 
> >>> Makes sense? (please check the logic, it might be flawed).
> >> I am not sure whether I understand you. But I don't think the IPI will
> >> lost in current patch:
> >> 
> >> if (!pi_test_and_set_on(&vmx->pi_desc) && (vcpu->mode ==
> > IN_GUEST_MODE)) {
> >>                 kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> >>                                     POSTED_INTR_VECTOR);
> >>                 *delivered = true;
> >> }
> >> 
> >> vcpu entry has:
> >> vcpu->mode = GUEST_MODE
> >> local irq disable
> >> check request
> >> 
> >> We will send the IPI after making request, if IPI is received after set
> > guest_mode before disable irq, then it still will be handled by the following check
> > request.
> >> Please correct me if I am wrong.

p1)

> > cpu0					cpu1		vcpu0
> > test_and_set_bit(PIR-A)
> > set KVM_REQ_EVENT
> > 							process REQ_EVENT
> > 							PIR-A->IRR
> > 
> > 							vcpu->mode=IN_GUEST
> > 
> > if (vcpu0->guest_mode)
> > 	if (!t_a_s_bit(PIR notif))
> > 		send IPI
> > 							linux_pir_handler
> > 
> > 					t_a_s_b(PIR-B)=1
> > 					no PIR IPI sent

p2)

> No, this exists with your implementation not mine.
> As I said, in this patch, it will make request after vcpu ==guest mode, then send ipi. Even the ipi is lost, but the request still will be tracked. Because we have the follow check:
> vcpu->mode = GUEST_MODE
> (ipi may arrived here and lost)
> local irq disable
> check request (this will ensure the pir modification will be picked up by vcpu before vmentry)

Please read the sequence above again, between p1) and p2). Yes, if the
IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
to be processed, but not the request for another cpu (cpu1).

If in fact the scenario is not possible, then please point out between
p1) and p2) where the error in representation is.

Note there are 3 contexes: CPU0, CPU1, VCPU0 (virtual CPU on some CPU != 0 and !=
1).

> >>> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
> >>> need for the spinlock there.
> >> In function sync_pir_to_irr():
> >> tmp_pir=xchg(pir,0)
> >> xchg(tmp_pir, irr)
> >> 
> >> It is not atomically, the lock still is needed.
> > 
> > IRR is only accessed by local vcpu context, only PIR is accessed concurrently,
> > therefore only PIR accesses must be atomic. xchg instruction is
> > locked and atomic.
> There has same problem as we discussed before. Consider this:
> Before delivery poste ipi, the irr is 0. then:
> 
> cpu0                           cpu1  vcpu0
> delivery_posted_intr()
>                                sync_pir_toirr():
>                                   tmp_pir=xchg(pir,0)(pir is cleared)
> check pir(pass)
> check irr(pass)
> 							   xchg(tmp_pir, irr)
> injected= true
> set pir  
> 
> Same problem: both pir and irr is set, but it reported as injected. Still need the lock to protect it.                                

See:

cpu0

check pir(pass)
check irr(pass)
injected = !test_and_set_bit(pir)

versus 

cpu1 
xchg(pir)

No information can be lost because all accesses to shared data are
atomic.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 24, 2013, 5:44 p.m. UTC | #19
On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
> > contexts, but only two use locks.
> See following logic, I think the problem you mentioned will not happened with current logic.
> 
> get lock
> if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for same interrupt.)
>    return coalesced
> if test_irr
>    return coalesced
> set_pir
> injected=true
> if t_a_s(on) && in guest mode
>    send ipi
> unlock
> 
>  
> >>> I'd rather think about proper way to do lockless injection before
> >>> committing anything. The case where we care about correct injection
> >>> status is rare, but we always care about scalability and since we
> >>> violate the spec by reading vapic page while vcpu is in non-root
> >>> operation anyway the result may be incorrect with or without the lock.
> >>> Our use can was not in HW designers mind when they designed this thing
> >>> obviously :(
> >> 
> >> Zhang, can you comment on whether reading vapic page with CPU in
> >> VMX-non root accessing it is safe?
> See 24.11.4
> In addition to data in the VMCS region itself, VMX non-root operation can be controlled by data structures that are
> referenced by pointers in a VMCS (for example, the I/O bitmaps). While the pointers to these data structures are
> parts of the VMCS, the data structures themselves are not. They are not accessible using VMREAD and VMWRITE
> but by ordinary memory writes.
> Software should ensure that each such data structure is modified only when no logical processor with a current
> VMCS that references it is in VMX non-root operation. Doing otherwise may lead to unpredictable behavior.
> 
> This means the initial design of KVM is wrong. It should not to modify vIRR directly.
> 
> The good thing is that reading is ok.

OK.

> >> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
> >> explanation why its believed to be benign (given how the injection
> >> return value is interpreted) could also work. Its ugly, though... murphy
> >> is around.
> > The race above is not benign. It will report interrupt as coalesced
> > while in reality it is injected. This may cause to many interrupt to be
> > injected. If this happens rare enough ntp may be able to fix time drift
> > resulted from this.
> Please check the above logic. Does it have same problem? If yes, please point out.

1) set_pir must be test_and_set_bit() (so the lock at vcpu entry path can
be dropped).

2)  if t_a_s(on) && in guest mode
      send ipi

should be inverted:

    if guest mode && t_a_s(on)
      send ipi

So you avoid setting ON bit if guest is outside of guest mode.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 24, 2013, 6:08 p.m. UTC | #20
On Sun, Feb 24, 2013 at 04:19:17PM +0200, Gleb Natapov wrote:
> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
> > > I do not think it fixes it. There is no guaranty that IPI will be
> > > processed by remote cpu while sending cpu is still in locked section, so
> > > the same race may happen regardless. As you say above there are 3
> > > contexts, but only two use locks.
> > See following logic, I think the problem you mentioned will not happened with current logic.
> > 
> > get lock
> > if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for same interrupt.)

Does the PIR IPI interrupt returns synchronously _after_ PIR->IRR transfer
is made? Don't think so.

PIR IPI interrupt returns after remote CPU has acked interrupt receival,
not after remote CPU has acked _and_ performed PIR->IRR transfer.

Right? (yes, right, see step 4. below).

Should try to make it easier to drop the lock later, so depend on it as
little as possible (also please document what it protects in detail).

Also note:

"
3. The processor clears the outstanding-notification bit in the
posted-interrupt descriptor. This is done atomically
so as to leave the remainder of the descriptor unmodified (e.g., with a
locked AND operation).
4. The processor writes zero to the EOI register in the local APIC; this
dismisses the interrupt with the postedinterrupt
notification vector from the local APIC.
5. The logical processor performs a logical-OR of PIR into VIRR and
clears PIR. No other agent can read or write a
PIR bit (or group of bits) between the time it is read (to determine
what to OR into VIRR) and when it is cleared.
"

So checking the ON bit does not mean the HW has finished performing
PIR->VIRR transfer (which means ON bit can only be used as an indication 
of whether to send interrupt, not whether PIR->VIRR transfer is
finished).

So its fine to do

-> atomic set pir
-> if (atomic_test_and_set(PIR ON bit))
->	send IPI

But HW can transfer two distinct bits, set by different serialized instances
of kvm_set_irq (protected with a lock), in a single PIR->IRR pass.

> I do not see where those assumptions are coming from. Testing pir does
> not guaranty that the IPI is not processed by VCPU right now.
> 
> iothread:                           vcpu:
> send_irq()
> lock(pir)
> check pir and irr
> set pir
> send IPI (*)
> unlock(pir)
> 
> send_irq()
> lock(pir)
>                                  receive IPI (*)
>                                  atomic {
>                                    pir_tmp = pir
>                                    pir = 0
>                                  }
> check pir and irr
>                                  irr &= pir_tmp
> set pir
> send IPI
> unlock(pir)
> 
> At this point both pir and irr are set and interrupt may be coalesced,
> but it is reported as delivered.

s/"set pir"/"injected = !t_a_s(pir)"/

> So what prevents the scenario above from happening?
> 
> --
> 			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Feb. 24, 2013, 6:59 p.m. UTC | #21
I didn't really follow, but is the root cause the need to keep track
of interrupt coalescing?  If so we can recommend that users use
KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
with irq coalescing support to vcpu context.

It's not pleasant to cause a performance regression, so it should be a
last resort (or maybe do both if it helps).

On Sun, Feb 24, 2013 at 8:08 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Sun, Feb 24, 2013 at 04:19:17PM +0200, Gleb Natapov wrote:
>> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>> > > I do not think it fixes it. There is no guaranty that IPI will be
>> > > processed by remote cpu while sending cpu is still in locked section, so
>> > > the same race may happen regardless. As you say above there are 3
>> > > contexts, but only two use locks.
>> > See following logic, I think the problem you mentioned will not happened with current logic.
>> >
>> > get lock
>> > if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for same interrupt.)
>
> Does the PIR IPI interrupt returns synchronously _after_ PIR->IRR transfer
> is made? Don't think so.
>
> PIR IPI interrupt returns after remote CPU has acked interrupt receival,
> not after remote CPU has acked _and_ performed PIR->IRR transfer.
>
> Right? (yes, right, see step 4. below).
>
> Should try to make it easier to drop the lock later, so depend on it as
> little as possible (also please document what it protects in detail).
>
> Also note:
>
> "
> 3. The processor clears the outstanding-notification bit in the
> posted-interrupt descriptor. This is done atomically
> so as to leave the remainder of the descriptor unmodified (e.g., with a
> locked AND operation).
> 4. The processor writes zero to the EOI register in the local APIC; this
> dismisses the interrupt with the postedinterrupt
> notification vector from the local APIC.
> 5. The logical processor performs a logical-OR of PIR into VIRR and
> clears PIR. No other agent can read or write a
> PIR bit (or group of bits) between the time it is read (to determine
> what to OR into VIRR) and when it is cleared.
> "
>
> So checking the ON bit does not mean the HW has finished performing
> PIR->VIRR transfer (which means ON bit can only be used as an indication
> of whether to send interrupt, not whether PIR->VIRR transfer is
> finished).
>
> So its fine to do
>
> -> atomic set pir
> -> if (atomic_test_and_set(PIR ON bit))
> ->      send IPI
>
> But HW can transfer two distinct bits, set by different serialized instances
> of kvm_set_irq (protected with a lock), in a single PIR->IRR pass.
>
>> I do not see where those assumptions are coming from. Testing pir does
>> not guaranty that the IPI is not processed by VCPU right now.
>>
>> iothread:                           vcpu:
>> send_irq()
>> lock(pir)
>> check pir and irr
>> set pir
>> send IPI (*)
>> unlock(pir)
>>
>> send_irq()
>> lock(pir)
>>                                  receive IPI (*)
>>                                  atomic {
>>                                    pir_tmp = pir
>>                                    pir = 0
>>                                  }
>> check pir and irr
>>                                  irr &= pir_tmp
>> set pir
>> send IPI
>> unlock(pir)
>>
>> At this point both pir and irr are set and interrupt may be coalesced,
>> but it is reported as delivered.
>
> s/"set pir"/"injected = !t_a_s(pir)"/
>
>> So what prevents the scenario above from happening?
>>
>> --
>>                       Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 25, 2013, 6:55 a.m. UTC | #22
Marcelo Tosatti wrote on 2013-02-25:
> On Sun, Feb 24, 2013 at 01:17:59PM +0000, Zhang, Yang Z wrote:
>> Marcelo Tosatti wrote on 2013-02-24:
>>> On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote:
>>>> Marcelo Tosatti wrote on 2013-02-23:
>>>>> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
>>>>>> Marcelo Tosatti wrote on 2013-02-23:
>>>>>>> 
>>>>>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
>>>>>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>>>> 
>>>>>>>> Posted Interrupt allows APIC interrupts to inject into guest directly
>>>>>>>> without any vmexit.
>>>>>>>> 
>>>>>>>> - When delivering a interrupt to guest, if target vcpu is running,
>>>>>>>>   update Posted-interrupt requests bitmap and send a notification
>>>>>>>>   event to the vcpu. Then the vcpu will handle this interrupt
>>>>>>>>   automatically, without any software involvemnt. - If target vcpu is
>>>>>>>>   not running or there already a notification event pending in the
>>>>>>>>   vcpu, do nothing. The interrupt will be handled by next vm entry
>>>>>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>>>> ---
>>>>>>>> 
>>>>>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>>>>>>>> index e4595f1..64616cc 100644
>>>>>>>> --- a/arch/x86/kernel/irq.c
>>>>>>>> +++ b/arch/x86/kernel/irq.c
>>>>>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs
> *regs)
>>>>>>>>  	set_irq_regs(old_regs);
>>>>>>>>  }
>>>>>>>> +#ifdef CONFIG_HAVE_KVM
>>>>>>>> +/*
>>>>>>>> + * Handler for POSTED_INTERRUPT_VECTOR.
>>>>>>>> + */
>>>>>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>>>>>>>> +{
>>>>>>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
>>>>>>>> +
>>>>>>>> +	ack_APIC_irq();
>>>>>>>> +
>>>>>>>> +	irq_enter();
>>>>>>>> +
>>>>>>>> +	exit_idle();
>>>>>>>> +
>>>>>>>> +	irq_exit();
>>>>>>>> +
>>>>>>>> +	set_irq_regs(old_regs);
>>>>>>>> +}
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>> 
>>>>>>> Add per-cpu counters, similarly to other handlers in the same file.
>>>>>> Sure.
>>>>>> 
>>>>>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct
>>> kvm_lapic
>>>>>>> *apic)
>>>>>>>>  	if (!apic->irr_pending) 		return -1;
>>>>>>>>  +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu); 	result =
>>>>>>>>  apic_search_irr(apic); 	ASSERT(result == -1 || result >= 16);
>>>>>>> 
>>>>>>> kvm_x86_ops->sync_pir_to_irr() should be called from
>>>>>>> vcpu_enter_guest, before inject_pending_event.
>>>>>>> 
>>>>>>>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) ||
> req_int_win)
>>> {
>>>>>>> 		->
>>>>>>> 		inject_pending_event
>>>>>>> 		...
>>>>>>> 	}
>>>>>> Some other places will search irr to do something, like
>>>>> kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch
>>>>> irr, not just before enter guest.
>>>>> 
>>>>> I see. The only call site that needs IRR/PIR information with posted
>>>>> interrupt enabled is kvm_arch_vcpu_runnable, correct?
>>>> Yes.
>>>> 
>>>>> If so, can we contain reading PIR to only that location?
>>>> Yes, we can do it. Actually, why I do pir->irr here is to avoid the
>>>> wrong usage in future of check pending interrupt just by call
>>>> kvm_vcpu_has_interrupt().
>>> 
>>> Yes, i see.
>>> 
>>>> Also, there may need an extra callback to check pir.
>>>> So you think your suggestions is better?
>>> 
>>> Because it respects standard request processing mechanism, yes.
>> Sure. Will change it in next patch.
>> 
>>>>> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect
>>>>> the standard way of event processing and also reduce reading the
>>>>> PIR).
>>>> Since we will check ON bit before each reading PIR, the cost can be
>>>> ignored.
>>> 
>>> Note reading ON bit is not OK, because if injection path did not see
>>> vcpu->mode == IN_GUEST_MODE, ON bit will not be set.
>> In my patch, It will set ON bit before check vcpu->mode. So it's ok.
>> Actually, I think the ON bit must be set unconditionally after change
>> PIR regardless of vcpu->mode.
>> 
>>>>> 
>>>>> So it is possible that a PIR IPI is delivered and handled before guest
>>>>> entry.
>>>>> 
>>>>> By clearing PIR notification bit after local_irq_disable, but before the
>>>>> last check for vcpu->requests, we guarantee that a PIR IPI is never lost.
>>>>> 
>>>>> Makes sense? (please check the logic, it might be flawed).
>>>> I am not sure whether I understand you. But I don't think the IPI will
>>>> lost in current patch:
>>>> 
>>>> if (!pi_test_and_set_on(&vmx->pi_desc) && (vcpu->mode ==
>>> IN_GUEST_MODE)) {
>>>>                 kvm_make_request(KVM_REQ_EVENT, vcpu);
>>>>                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>>>>                                     POSTED_INTR_VECTOR);
>>>>                 *delivered = true;
>>>> }
>>>> 
>>>> vcpu entry has:
>>>> vcpu->mode = GUEST_MODE
>>>> local irq disable
>>>> check request
>>>> 
>>>> We will send the IPI after making request, if IPI is received after set
>>> guest_mode before disable irq, then it still will be handled by the
>>> following check request.
>>>> Please correct me if I am wrong.
> 
> p1)
> 
>>> cpu0					cpu1		vcpu0
>>> test_and_set_bit(PIR-A)
>>> set KVM_REQ_EVENT
>>> 							process REQ_EVENT
>>> 							PIR-A->IRR
>>> 
>>> 							vcpu->mode=IN_GUEST
>>> 
>>> if (vcpu0->guest_mode)
>>> 	if (!t_a_s_bit(PIR notif))
>>> 		send IPI
>>> 							linux_pir_handler
>>> 
>>> 					t_a_s_b(PIR-B)=1
>>> 					no PIR IPI sent
> 
> p2)
> 
>> No, this exists with your implementation not mine.
>> As I said, in this patch, it will make request after vcpu ==guest mode, then send
> ipi. Even the ipi is lost, but the request still will be tracked. Because we have the
> follow check:
>> vcpu->mode = GUEST_MODE
>> (ipi may arrived here and lost)
>> local irq disable
>> check request (this will ensure the pir modification will be picked up by vcpu
> before vmentry)
> 
> Please read the sequence above again, between p1) and p2). Yes, if the
> IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
> to be processed, but not the request for another cpu (cpu1).
If no PIR IPI sent in cpu1, the delivered is false, and it will roll back to old logic:
__apic_accept_irq():
if (!delivered) {
             kvm_make_request(KVM_REQ_EVENT, vcpu);
             kvm_vcpu_kick(vcpu);
}
This can solve the problem you mentioned above. Right?

> If in fact the scenario is not possible, then please point out between
> p1) and p2) where the error in representation is.
>
> Note there are 3 contexes: CPU0, CPU1, VCPU0 (virtual CPU on some CPU != 0
> and !=
> 1).

>>>>> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
>>>>> need for the spinlock there.
>>>> In function sync_pir_to_irr():
>>>> tmp_pir=xchg(pir,0)
>>>> xchg(tmp_pir, irr)
>>>> 
>>>> It is not atomically, the lock still is needed.
>>> 
>>> IRR is only accessed by local vcpu context, only PIR is accessed concurrently,
>>> therefore only PIR accesses must be atomic. xchg instruction is
>>> locked and atomic.
>> There has same problem as we discussed before. Consider this:
>> Before delivery poste ipi, the irr is 0. then:
>> 
>> cpu0                           cpu1  vcpu0
>> delivery_posted_intr()
>>                                sync_pir_toirr():
>>                                   tmp_pir=xchg(pir,0)(pir is cleared)
>> check pir(pass)
>> check irr(pass)
>> 							   xchg(tmp_pir, irr)
>> injected= true
>> set pir
>> 
>> Same problem: both pir and irr is set, but it reported as injected. Still need the
> lock to protect it.
> 
> See:
> 
> cpu0
> 
> check pir(pass)
> check irr(pass)
> injected = !test_and_set_bit(pir)
> 
> versus
> 
> cpu1
>xchg(pir)
> 
> No information can be lost because all accesses to shared data are
> atomic.
Sorry, I cannot get your point. Why 'xchg(pir)' can solve the problem I mentioned above? Can you elaborate it?
The spinlock I used is trying to protect the whole procedure of sync pir to irr, not just access pir.

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 25, 2013, 7:24 a.m. UTC | #23
Marcelo Tosatti wrote on 2013-02-25:
> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>>> contexts, but only two use locks.
>> See following logic, I think the problem you mentioned will not
>> happened with current logic.
>> 
>> get lock
>> if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we
> are taking the lock now, no IPI will be sent before we release the lock and no
> pir->irr is performed by hardware for same interrupt.)
>>    return coalesced if test_irr return coalesced
>> set_pir
>> injected=true
>> if t_a_s(on) && in guest mode
>>    send ipi
>> unlock
>> 
>> 
>>>>> I'd rather think about proper way to do lockless injection before
>>>>> committing anything. The case where we care about correct injection
>>>>> status is rare, but we always care about scalability and since we
>>>>> violate the spec by reading vapic page while vcpu is in non-root
>>>>> operation anyway the result may be incorrect with or without the lock.
>>>>> Our use can was not in HW designers mind when they designed this thing
>>>>> obviously :(
>>>> 
>>>> Zhang, can you comment on whether reading vapic page with CPU in
>>>> VMX-non root accessing it is safe?
>> See 24.11.4 In addition to data in the VMCS region itself, VMX non-root
>> operation can be controlled by data structures that are referenced by
>> pointers in a VMCS (for example, the I/O bitmaps). While the pointers
>> to these data structures are parts of the VMCS, the data structures
>> themselves are not. They are not accessible using VMREAD and VMWRITE
>> but by ordinary memory writes. Software should ensure that each such
>> data structure is modified only when no logical processor with a
>> current VMCS that references it is in VMX non-root operation. Doing
>> otherwise may lead to unpredictable behavior.
>> 
>> This means the initial design of KVM is wrong. It should not to modify
>> vIRR directly.
>> 
>> The good thing is that reading is ok.
> 
> OK.
> 
>>>> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
>>>> explanation why its believed to be benign (given how the injection
>>>> return value is interpreted) could also work. Its ugly, though... murphy
>>>> is around.
>>> The race above is not benign. It will report interrupt as coalesced
>>> while in reality it is injected. This may cause to many interrupt to be
>>> injected. If this happens rare enough ntp may be able to fix time drift
>>> resulted from this.
>> Please check the above logic. Does it have same problem? If yes, please point
> out.
> 
> 1) set_pir must be test_and_set_bit() (so the lock at vcpu entry path can
> be dropped).
> 
> 2)  if t_a_s(on) && in guest mode
>       send ipi
> should be inverted:
> 
>     if guest mode && t_a_s(on)
>       send ipi
> So you avoid setting ON bit if guest is outside of guest mode.
Need the ON bit to track whether there are pending bit in pir. Or else, must traverse pir even it is empty when calling sync_pir_to_irr.

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 25, 2013, 8:42 a.m. UTC | #24
Avi Kivity wrote on 2013-02-25:
> I didn't really follow, but is the root cause the need to keep track
> of interrupt coalescing?  If so we can recommend that users use
> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
> with irq coalescing support to vcpu context.
So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted interrupt is enabled to force users doesn't to use KVM_IRQ_LINE_STATUS. Does this acceptable?

The only case in KVM that need to know the interrupt injection status is vlapic timer. But since vlapic timer and vcpu are always in same pcpu, so there is no problem.

> It's not pleasant to cause a performance regression, so it should be a
> last resort (or maybe do both if it helps).
> 
> On Sun, Feb 24, 2013 at 8:08 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Sun, Feb 24, 2013 at 04:19:17PM +0200, Gleb Natapov wrote:
>>> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>>>>> I do not think it fixes it. There is no guaranty that IPI will be
>>>>> processed by remote cpu while sending cpu is still in locked section, so
>>>>> the same race may happen regardless. As you say above there are 3
>>>>> contexts, but only two use locks.
>>>> See following logic, I think the problem you mentioned will not
>>>> happened with current logic.
>>>> 
>>>> get lock
>>>> if test_pir (this will ensure there is no in flight IPI for same interrupt. Since
> we are taking the lock now, no IPI will be sent before we release the lock and no
> pir->irr is performed by hardware for same interrupt.)
>> 
>> Does the PIR IPI interrupt returns synchronously _after_ PIR->IRR transfer
>> is made? Don't think so.
>> 
>> PIR IPI interrupt returns after remote CPU has acked interrupt receival,
>> not after remote CPU has acked _and_ performed PIR->IRR transfer.
>> 
>> Right? (yes, right, see step 4. below).
>> 
>> Should try to make it easier to drop the lock later, so depend on it as
>> little as possible (also please document what it protects in detail).
>> 
>> Also note:
>> 
>> "
>> 3. The processor clears the outstanding-notification bit in the
>> posted-interrupt descriptor. This is done atomically
>> so as to leave the remainder of the descriptor unmodified (e.g., with a
>> locked AND operation).
>> 4. The processor writes zero to the EOI register in the local APIC; this
>> dismisses the interrupt with the postedinterrupt
>> notification vector from the local APIC.
>> 5. The logical processor performs a logical-OR of PIR into VIRR and
>> clears PIR. No other agent can read or write a
>> PIR bit (or group of bits) between the time it is read (to determine
>> what to OR into VIRR) and when it is cleared.
>> "
>> 
>> So checking the ON bit does not mean the HW has finished performing
>> PIR->VIRR transfer (which means ON bit can only be used as an indication
>> of whether to send interrupt, not whether PIR->VIRR transfer is
>> finished).
>> 
>> So its fine to do
>> 
>> -> atomic set pir
>> -> if (atomic_test_and_set(PIR ON bit))
>> ->      send IPI
>> 
>> But HW can transfer two distinct bits, set by different serialized instances
>> of kvm_set_irq (protected with a lock), in a single PIR->IRR pass.
>> 
>>> I do not see where those assumptions are coming from. Testing pir does
>>> not guaranty that the IPI is not processed by VCPU right now.
>>> 
>>> iothread:                           vcpu:
>>> send_irq()
>>> lock(pir)
>>> check pir and irr
>>> set pir
>>> send IPI (*)
>>> unlock(pir)
>>> 
>>> send_irq()
>>> lock(pir)
>>>                                  receive IPI (*)
>>>                                  atomic {
>>>                                    pir_tmp = pir
>>>                                    pir = 0
>>>                                  } check pir and irr irr &= pir_tmp
>>> set pir
>>> send IPI
>>> unlock(pir)
>>> 
>>> At this point both pir and irr are set and interrupt may be coalesced,
>>> but it is reported as delivered.
>> 
>> s/"set pir"/"injected = !t_a_s(pir)"/
>> 
>>> So what prevents the scenario above from happening?
>>> 
>>> --
>>>                       Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 25, 2013, 11:01 a.m. UTC | #25
On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> Avi Kivity wrote on 2013-02-25:
> > I didn't really follow, but is the root cause the need to keep track
> > of interrupt coalescing?  If so we can recommend that users use
> > KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
> > with irq coalescing support to vcpu context.
> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted interrupt is enabled to force users doesn't to use KVM_IRQ_LINE_STATUS. Does this acceptable?
> 
> The only case in KVM that need to know the interrupt injection status is vlapic timer. But since vlapic timer and vcpu are always in same pcpu, so there is no problem.
> 
Not really. The primary user of this interface is RTC interrupt
re-injection for Windows guests.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 25, 2013, 11:04 a.m. UTC | #26
Gleb Natapov wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
>> Avi Kivity wrote on 2013-02-25:
>>> I didn't really follow, but is the root cause the need to keep track
>>> of interrupt coalescing?  If so we can recommend that users use
>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
>>> with irq coalescing support to vcpu context.
>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> interrupt is enabled to force users doesn't to use KVM_IRQ_LINE_STATUS. Does
> this acceptable?
>> 
>> The only case in KVM that need to know the interrupt injection status is vlapic
> timer. But since vlapic timer and vcpu are always in same pcpu, so there is no
> problem.
>> 
> Not really. The primary user of this interface is RTC interrupt
> re-injection for Windows guests.
So without KVM_IRQ_LINE_STATUS capability, RTC cannot work well?

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 25, 2013, 11:07 a.m. UTC | #27
On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-02-25:
> > On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> >> Avi Kivity wrote on 2013-02-25:
> >>> I didn't really follow, but is the root cause the need to keep track
> >>> of interrupt coalescing?  If so we can recommend that users use
> >>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
> >>> with irq coalescing support to vcpu context.
> >> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> > interrupt is enabled to force users doesn't to use KVM_IRQ_LINE_STATUS. Does
> > this acceptable?
> >> 
> >> The only case in KVM that need to know the interrupt injection status is vlapic
> > timer. But since vlapic timer and vcpu are always in same pcpu, so there is no
> > problem.
> >> 
> > Not really. The primary user of this interface is RTC interrupt
> > re-injection for Windows guests.
> So without KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
> 
Windows guests may experience timedrift under CPU overcommit scenario.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 25, 2013, 11:13 a.m. UTC | #28
Gleb Natapov wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-02-25:
>>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
>>>> Avi Kivity wrote on 2013-02-25:
>>>>> I didn't really follow, but is the root cause the need to keep track
>>>>> of interrupt coalescing?  If so we can recommend that users use
>>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
>>>>> injection with irq coalescing support to vcpu context.
>>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
>>> interrupt is enabled to force users doesn't to use
>>> KVM_IRQ_LINE_STATUS. Does this acceptable?
>>>> 
>>>> The only case in KVM that need to know the interrupt injection status is
> vlapic
>>> timer. But since vlapic timer and vcpu are always in same pcpu, so there is no
>>> problem.
>>>> 
>>> Not really. The primary user of this interface is RTC interrupt
>>> re-injection for Windows guests.
>> So without KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
>> 
> Windows guests may experience timedrift under CPU overcommit scenario.
Ok, I see. Seems we are stuck. :(
Do you have any suggestion to solve or workaround current problem?

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 25, 2013, 12:49 p.m. UTC | #29
On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-02-25:
> > On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-02-25:
> >>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> >>>> Avi Kivity wrote on 2013-02-25:
> >>>>> I didn't really follow, but is the root cause the need to keep track
> >>>>> of interrupt coalescing?  If so we can recommend that users use
> >>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
> >>>>> injection with irq coalescing support to vcpu context.
> >>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> >>> interrupt is enabled to force users doesn't to use
> >>> KVM_IRQ_LINE_STATUS. Does this acceptable?
> >>>> 
> >>>> The only case in KVM that need to know the interrupt injection status is
> > vlapic
> >>> timer. But since vlapic timer and vcpu are always in same pcpu, so there is no
> >>> problem.
> >>>> 
> >>> Not really. The primary user of this interface is RTC interrupt
> >>> re-injection for Windows guests.
> >> So without KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
> >> 
> > Windows guests may experience timedrift under CPU overcommit scenario.
> Ok, I see. Seems we are stuck. :(
> Do you have any suggestion to solve or workaround current problem?

Depend on knowledge about atomicity (item 5 IIRC) of the sequence 
in the manual.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 25, 2013, 12:52 p.m. UTC | #30
Marcelo Tosatti wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-02-25:
>>> On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-02-25:
>>>>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
>>>>>> Avi Kivity wrote on 2013-02-25:
>>>>>>> I didn't really follow, but is the root cause the need to keep track
>>>>>>> of interrupt coalescing?  If so we can recommend that users use
>>>>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
>>>>>>> injection with irq coalescing support to vcpu context.
>>>>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when
> posted
>>>>> interrupt is enabled to force users doesn't to use
>>>>> KVM_IRQ_LINE_STATUS. Does this acceptable?
>>>>>> 
>>>>>> The only case in KVM that need to know the interrupt injection status is
>>> vlapic
>>>>> timer. But since vlapic timer and vcpu are always in same pcpu, so
>>>>> there is no problem.
>>>>>> 
>>>>> Not really. The primary user of this interface is RTC interrupt
>>>>> re-injection for Windows guests.
>>>> So without KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
>>>> 
>>> Windows guests may experience timedrift under CPU overcommit scenario.
>> Ok, I see. Seems we are stuck. :(
>> Do you have any suggestion to solve or workaround current problem?
> 
> Depend on knowledge about atomicity (item 5 IIRC) of the sequence
> in the manual.
I am sure it is not atomic:
tmp_pir=xhcg(pir, 0)
irr |= tmp_pir

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 25, 2013, 1:01 p.m. UTC | #31
On Mon, Feb 25, 2013 at 06:55:58AM +0000, Zhang, Yang Z wrote:
> > 
> > p1)
> > 
> >>> cpu0					cpu1		vcpu0
> >>> test_and_set_bit(PIR-A)
> >>> set KVM_REQ_EVENT
> >>> 							process REQ_EVENT
> >>> 							PIR-A->IRR
> >>> 
> >>> 							vcpu->mode=IN_GUEST
> >>> 
> >>> if (vcpu0->guest_mode)
> >>> 	if (!t_a_s_bit(PIR notif))
> >>> 		send IPI
> >>> 							linux_pir_handler
> >>> 
> >>> 					t_a_s_b(PIR-B)=1
> >>> 					no PIR IPI sent
> > 
> > p2)
> > 
> >> No, this exists with your implementation not mine.
> >> As I said, in this patch, it will make request after vcpu ==guest mode, then send
> > ipi. Even the ipi is lost, but the request still will be tracked. Because we have the
> > follow check:
> >> vcpu->mode = GUEST_MODE
> >> (ipi may arrived here and lost)
> >> local irq disable
> >> check request (this will ensure the pir modification will be picked up by vcpu
> > before vmentry)
> > 
> > Please read the sequence above again, between p1) and p2). Yes, if the
> > IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
> > to be processed, but not the request for another cpu (cpu1).
> If no PIR IPI sent in cpu1, the delivered is false, and it will roll back to old logic:
> __apic_accept_irq():
> if (!delivered) {
>              kvm_make_request(KVM_REQ_EVENT, vcpu);
>              kvm_vcpu_kick(vcpu);
> }
> This can solve the problem you mentioned above. Right?

Should not be doing this kick in the first place. One result of it is
that you'll always take a VM-exit if a second injection happens while a VCPU
has not handled the first one.

What is the drawback of the suggestion to unconditionally clear PIR
notification bit on VM-entry?

We can avoid it, but lets get it correct first  (BTW can stick a comment
saying FIXME: optimize) on that PIR clearing.

> > cpu0
> > 
> > check pir(pass)
> > check irr(pass)
> > injected = !test_and_set_bit(pir)
> > 
> > versus
> > 
> > cpu1
> >xchg(pir)

> > 
> > No information can be lost because all accesses to shared data are
> > atomic.
> Sorry, I cannot get your point. Why 'xchg(pir)' can solve the problem I mentioned above? Can you elaborate it?
> The spinlock I used is trying to protect the whole procedure of sync pir to irr, not just access pir.

You're right, its the same problem as with the hardware update.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 25, 2013, 1:34 p.m. UTC | #32
On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-02-25:
> > On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-02-25:
> >>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> >>>> Avi Kivity wrote on 2013-02-25:
> >>>>> I didn't really follow, but is the root cause the need to keep track
> >>>>> of interrupt coalescing?  If so we can recommend that users use
> >>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
> >>>>> injection with irq coalescing support to vcpu context.
> >>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> >>> interrupt is enabled to force users doesn't to use
> >>> KVM_IRQ_LINE_STATUS. Does this acceptable?
> >>>> 
> >>>> The only case in KVM that need to know the interrupt injection status is
> > vlapic
> >>> timer. But since vlapic timer and vcpu are always in same pcpu, so there is no
> >>> problem.
> >>>> 
> >>> Not really. The primary user of this interface is RTC interrupt
> >>> re-injection for Windows guests.
> >> So without KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
> >> 
> > Windows guests may experience timedrift under CPU overcommit scenario.
> Ok, I see. Seems we are stuck. :(
> Do you have any suggestion to solve or workaround current problem?
> 
I see a couple of possible solutions:
1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
will be slow on newer kernels
2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
running during injection. This assumes that if vcpu is running and does
not process interrupt it is guest fault and the same can happen on real
HW too. Coalescing when vcpu is not running though is the result of CPU
overcommit and should be reported. Cons interface definition is kind of
murky.
3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
notifiers for interrupt reinjection. This requires us to add interface
for reporting EOI to userspace. This is not in the scope of this
patchset. Cons: need to introduce new interface (and the one that will
not work on AMD BTW)

Other ideas?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 25, 2013, 2 p.m. UTC | #33
On Mon, Feb 25, 2013 at 03:34:19PM +0200, Gleb Natapov wrote:
> On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
> > Gleb Natapov wrote on 2013-02-25:
> > > On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> > >> Gleb Natapov wrote on 2013-02-25:
> > >>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> > >>>> Avi Kivity wrote on 2013-02-25:
> > >>>>> I didn't really follow, but is the root cause the need to keep track
> > >>>>> of interrupt coalescing?  If so we can recommend that users use
> > >>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
> > >>>>> injection with irq coalescing support to vcpu context.
> > >>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> > >>> interrupt is enabled to force users doesn't to use
> > >>> KVM_IRQ_LINE_STATUS. Does this acceptable?
> > >>>> 
> > >>>> The only case in KVM that need to know the interrupt injection status is
> > > vlapic
> > >>> timer. But since vlapic timer and vcpu are always in same pcpu, so there is no
> > >>> problem.
> > >>>> 
> > >>> Not really. The primary user of this interface is RTC interrupt
> > >>> re-injection for Windows guests.
> > >> So without KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
> > >> 
> > > Windows guests may experience timedrift under CPU overcommit scenario.
> > Ok, I see. Seems we are stuck. :(
> > Do you have any suggestion to solve or workaround current problem?
> > 
> I see a couple of possible solutions:
> 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> will be slow on newer kernels

Can add a capability to QEMU and enable APICv selectively only 
in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on target vcpu
only when necessary (and KVM_IRQ_LINE otherwise).

Even a lock serializing injection is not safe because ON bit is cleared
before XCHG(PIR, 0). Must do something heavier (such as running on
target vcpu context).

> 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> running during injection. This assumes that if vcpu is running and does
> not process interrupt it is guest fault and the same can happen on real
> HW too. Coalescing when vcpu is not running though is the result of CPU
> overcommit and should be reported. Cons interface definition is kind of
> murky.
> 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> notifiers for interrupt reinjection. This requires us to add interface
> for reporting EOI to userspace. This is not in the scope of this
> patchset. Cons: need to introduce new interface (and the one that will
> not work on AMD BTW)

Breaks older userspace?
> 
> Other ideas?

Can HW write a 'finished' bit after 6 in the reserved area? Suppose its
not a KVM-specific problem?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 25, 2013, 2:17 p.m. UTC | #34
On Mon, Feb 25, 2013 at 11:00:21AM -0300, Marcelo Tosatti wrote:
> > I see a couple of possible solutions:
> > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> > current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> > will be slow on newer kernels
> 
> Can add a capability to QEMU and enable APICv selectively only 
> in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on target vcpu
> only when necessary (and KVM_IRQ_LINE otherwise).

Bad idea. What happens with mixed scenarios.

> Even a lock serializing injection is not safe because ON bit is cleared
> before XCHG(PIR, 0). Must do something heavier (such as running on
> target vcpu context).

Note always running on target vcpu is likely to be slower than no-APICv.

So need to do something heavier on the kernel under serialization, if 
firmware cannot be changed (injection from simultaneous CPUs should be
rare so if data to serialize __accept_apic_irq is cache-line aligned 
it should reduce performance impact).

> > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> > running during injection. This assumes that if vcpu is running and does
> > not process interrupt it is guest fault and the same can happen on real
> > HW too. Coalescing when vcpu is not running though is the result of CPU
> > overcommit and should be reported. Cons interface definition is kind of
> > murky.
> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > notifiers for interrupt reinjection. This requires us to add interface
> > for reporting EOI to userspace. This is not in the scope of this
> > patchset. Cons: need to introduce new interface (and the one that will
> > not work on AMD BTW)

Is there no int-ack notification at RTC HW level?

> Breaks older userspace?
> > 
> > Other ideas?
> 
> Can HW write a 'finished' bit after 6 in the reserved area? Suppose its
> not a KVM-specific problem?
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 25, 2013, 2:32 p.m. UTC | #35
Marcelo Tosatti wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 06:55:58AM +0000, Zhang, Yang Z wrote:
>>> 
>>> p1)
>>> 
>>>>> cpu0					cpu1		vcpu0
>>>>> test_and_set_bit(PIR-A)
>>>>> set KVM_REQ_EVENT
>>>>> 							process REQ_EVENT
>>>>> 							PIR-A->IRR
>>>>> 
>>>>> 							vcpu->mode=IN_GUEST
>>>>> 
>>>>> if (vcpu0->guest_mode)
>>>>> 	if (!t_a_s_bit(PIR notif))
>>>>> 		send IPI
>>>>> 							linux_pir_handler
>>>>> 
>>>>> 					t_a_s_b(PIR-B)=1
>>>>> 					no PIR IPI sent
>>> 
>>> p2)
>>> 
>>>> No, this exists with your implementation not mine.
>>>> As I said, in this patch, it will make request after vcpu ==guest mode, then
> send
>>> ipi. Even the ipi is lost, but the request still will be tracked.
>>> Because we have the follow check:
>>>> vcpu->mode = GUEST_MODE
>>>> (ipi may arrived here and lost)
>>>> local irq disable
>>>> check request (this will ensure the pir modification will be picked up by vcpu
>>> before vmentry)
>>> 
>>> Please read the sequence above again, between p1) and p2). Yes, if the
>>> IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
>>> to be processed, but not the request for another cpu (cpu1).
>> If no PIR IPI sent in cpu1, the delivered is false, and it will roll back to old logic:
>> __apic_accept_irq():
>> if (!delivered) {
>>              kvm_make_request(KVM_REQ_EVENT, vcpu);
>>              kvm_vcpu_kick(vcpu);
>> }
>> This can solve the problem you mentioned above. Right?
> 
> Should not be doing this kick in the first place. One result of it is
> that you'll always take a VM-exit if a second injection happens while a VCPU
> has not handled the first one.
You are right. 
 
> What is the drawback of the suggestion to unconditionally clear PIR
> notification bit on VM-entry?
The only thing is we need to traverse the whole pir when calling sync_pir_to_irr even the pir is empty.

>
> We can avoid it, but lets get it correct first  (BTW can stick a comment
> saying FIXME: optimize) on that PIR clearing.
Ok, I will adopt your suggestion. BTW, Where should the comment be add? on sync_pir_to_irr()?

> 
>>> cpu0
>>> 
>>> check pir(pass)
>>> check irr(pass)
>>> injected = !test_and_set_bit(pir)
>>> 
>>> versus
>>> 
>>> cpu1
>>> xchg(pir)
>>> 
>>> 
>>> No information can be lost because all accesses to shared data are
>>> atomic.
>> Sorry, I cannot get your point. Why 'xchg(pir)' can solve the problem I
>> mentioned above? Can you elaborate it? The spinlock I used is trying to
>> protect the whole procedure of sync pir to irr,
> not just access pir.
> 
> You're right, its the same problem as with the hardware update.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Feb. 25, 2013, 4:50 p.m. UTC | #36
>>
> I see a couple of possible solutions:
> 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> will be slow on newer kernels

You could backport the qemu change, verify that it builds, and push it
to stable branches.  It's hardly ideal but if nothing else comes up
then it's a solution.


> 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> running during injection. This assumes that if vcpu is running and does
> not process interrupt it is guest fault and the same can happen on real
> HW too. Coalescing when vcpu is not running though is the result of CPU
> overcommit and should be reported. Cons interface definition is kind of
> murky.

You still have a window where the vcpu is scheduled out with guest
interrupts disabled, then scheduled back in and before it manages to
handle the interrupt, the second one hits.

It's worth testing though.

> 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> notifiers for interrupt reinjection. This requires us to add interface
> for reporting EOI to userspace. This is not in the scope of this
> patchset. Cons: need to introduce new interface (and the one that will
> not work on AMD BTW)
>
> Other ideas?

1. inject RTC interrupt
2. not see EOI
3. inject RTC interrupt
4. due to 2, report coalesced

AMD can still use IRR test-and-set.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 25, 2013, 5:40 p.m. UTC | #37
On Mon, Feb 25, 2013 at 11:17:39AM -0300, Marcelo Tosatti wrote:
> On Mon, Feb 25, 2013 at 11:00:21AM -0300, Marcelo Tosatti wrote:
> > > I see a couple of possible solutions:
> > > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> > > current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> > > will be slow on newer kernels
> > 
> > Can add a capability to QEMU and enable APICv selectively only 
> > in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on target vcpu
> > only when necessary (and KVM_IRQ_LINE otherwise).
> 
> Bad idea. What happens with mixed scenarios.
> 
Exactly.

> > Even a lock serializing injection is not safe because ON bit is cleared
> > before XCHG(PIR, 0). Must do something heavier (such as running on
> > target vcpu context).
> 
> Note always running on target vcpu is likely to be slower than no-APICv.
> 
Yes, but we will do it only for RTC interrupt. Still performance hit
should be very visible when RTC is in 1000Hz mode.

> So need to do something heavier on the kernel under serialization, if 
> firmware cannot be changed (injection from simultaneous CPUs should be
> rare so if data to serialize __accept_apic_irq is cache-line aligned 
> it should reduce performance impact).
> 
> > > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> > > running during injection. This assumes that if vcpu is running and does
> > > not process interrupt it is guest fault and the same can happen on real
> > > HW too. Coalescing when vcpu is not running though is the result of CPU
> > > overcommit and should be reported. Cons interface definition is kind of
> > > murky.
> > > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > > notifiers for interrupt reinjection. This requires us to add interface
> > > for reporting EOI to userspace. This is not in the scope of this
> > > patchset. Cons: need to introduce new interface (and the one that will
> > > not work on AMD BTW)
> 
> Is there no int-ack notification at RTC HW level?
There is, but Windows calls it twice for each injected interrupt. I
tried to use it in the past to detect interrupt coalescing, but this
double ack prevented me from doing so. May be revisit this approach with
willingness to be more hacky about it. Also it is possible to disable
RTC ack for HyperV guests. We do not do it and if we will use the ack we
will not be able to.

> 
> > Breaks older userspace?
Older userspace will have timedrif with Windows, yes. Note that we some
version of Windows it has it now too.

> > > 
> > > Other ideas?
> > 
> > Can HW write a 'finished' bit after 6 in the reserved area? Suppose its
> > not a KVM-specific problem?
> > 
I still think adding locking just to obtain correct injection status is bad
trade-off even if HW would allow us to get away with it.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 25, 2013, 5:43 p.m. UTC | #38
On Mon, Feb 25, 2013 at 06:50:40PM +0200, Avi Kivity wrote:
> >>
> > I see a couple of possible solutions:
> > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> > current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> > will be slow on newer kernels
> 
> You could backport the qemu change, verify that it builds, and push it
> to stable branches.  It's hardly ideal but if nothing else comes up
> then it's a solution.
> 
> 
> > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> > running during injection. This assumes that if vcpu is running and does
> > not process interrupt it is guest fault and the same can happen on real
> > HW too. Coalescing when vcpu is not running though is the result of CPU
> > overcommit and should be reported. Cons interface definition is kind of
> > murky.
> 
> You still have a window where the vcpu is scheduled out with guest
> interrupts disabled, then scheduled back in and before it manages to
> handle the interrupt, the second one hits.
> 
Yes, definitely not ideal solution.

> It's worth testing though.
> 
Yes again. Windows almost never disables interrupts and RTC interrupt is
of highest priority.

> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > notifiers for interrupt reinjection. This requires us to add interface
> > for reporting EOI to userspace. This is not in the scope of this
> > patchset. Cons: need to introduce new interface (and the one that will
> > not work on AMD BTW)
> >
> > Other ideas?
> 
> 1. inject RTC interrupt
> 2. not see EOI
> 3. inject RTC interrupt
> 4. due to 2, report coalesced
>
That's the 3 in my list, no?
 
> AMD can still use IRR test-and-set.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Feb. 25, 2013, 6:56 p.m. UTC | #39
On Mon, Feb 25, 2013 at 7:43 PM, Gleb Natapov <gleb@redhat.com> wrote:
>
>> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
>> > notifiers for interrupt reinjection. This requires us to add interface
>> > for reporting EOI to userspace. This is not in the scope of this
>> > patchset. Cons: need to introduce new interface (and the one that will
>> > not work on AMD BTW)
>> >
>> > Other ideas?
>>
>> 1. inject RTC interrupt
>> 2. not see EOI
>> 3. inject RTC interrupt
>> 4. due to 2, report coalesced
>>
> That's the 3 in my list, no?


No, this idea doesn't involve user interface changes.  We still report
through KVM_IRQ_LINE_STATUS or whatever it's called.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 25, 2013, 7:01 p.m. UTC | #40
On Mon, Feb 25, 2013 at 08:56:07PM +0200, Avi Kivity wrote:
> On Mon, Feb 25, 2013 at 7:43 PM, Gleb Natapov <gleb@redhat.com> wrote:
> >
> >> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> >> > notifiers for interrupt reinjection. This requires us to add interface
> >> > for reporting EOI to userspace. This is not in the scope of this
> >> > patchset. Cons: need to introduce new interface (and the one that will
> >> > not work on AMD BTW)
> >> >
> >> > Other ideas?
> >>
> >> 1. inject RTC interrupt
> >> 2. not see EOI
> >> 3. inject RTC interrupt
> >> 4. due to 2, report coalesced
> >>
> > That's the 3 in my list, no?
> 
> 
> No, this idea doesn't involve user interface changes.  We still report
> through KVM_IRQ_LINE_STATUS or whatever it's called.
Interesting idea!

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 25, 2013, 10:29 p.m. UTC | #41
On Mon, Feb 25, 2013 at 07:40:07PM +0200, Gleb Natapov wrote:
> On Mon, Feb 25, 2013 at 11:17:39AM -0300, Marcelo Tosatti wrote:
> > On Mon, Feb 25, 2013 at 11:00:21AM -0300, Marcelo Tosatti wrote:
> > > > I see a couple of possible solutions:
> > > > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> > > > current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> > > > will be slow on newer kernels
> > > 
> > > Can add a capability to QEMU and enable APICv selectively only 
> > > in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on target vcpu
> > > only when necessary (and KVM_IRQ_LINE otherwise).
> > 
> > Bad idea. What happens with mixed scenarios.
> > 
> Exactly.
> 
> > > Even a lock serializing injection is not safe because ON bit is cleared
> > > before XCHG(PIR, 0). Must do something heavier (such as running on
> > > target vcpu context).
> > 
> > Note always running on target vcpu is likely to be slower than no-APICv.
> > 
> Yes, but we will do it only for RTC interrupt. Still performance hit
> should be very visible when RTC is in 1000Hz mode.

So older qemu-kvm on APICv enabled hardware becomes slow? If that is
acceptable, fine.

> > So need to do something heavier on the kernel under serialization, if 
> > firmware cannot be changed (injection from simultaneous CPUs should be
> > rare so if data to serialize __accept_apic_irq is cache-line aligned 
> > it should reduce performance impact).
> > 
> > > > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> > > > running during injection. This assumes that if vcpu is running and does
> > > > not process interrupt it is guest fault and the same can happen on real
> > > > HW too. Coalescing when vcpu is not running though is the result of CPU
> > > > overcommit and should be reported. Cons interface definition is kind of
> > > > murky.
> > > > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > > > notifiers for interrupt reinjection. This requires us to add interface
> > > > for reporting EOI to userspace. This is not in the scope of this
> > > > patchset. Cons: need to introduce new interface (and the one that will
> > > > not work on AMD BTW)
> > 
> > Is there no int-ack notification at RTC HW level?
> There is, but Windows calls it twice for each injected interrupt. I
> tried to use it in the past to detect interrupt coalescing, but this
> double ack prevented me from doing so. May be revisit this approach with
> willingness to be more hacky about it. Also it is possible to disable
> RTC ack for HyperV guests. We do not do it and if we will use the ack we
> will not be able to.

Is it OK to kill the ability to have coalesced information?

> > > Breaks older userspace?
> Older userspace will have timedrif with Windows, yes. Note that we some
> version of Windows it has it now too.

> > > > Other ideas?
> > > 
> > > Can HW write a 'finished' bit after 6 in the reserved area? Suppose its
> > > not a KVM-specific problem?
> > > 
> I still think adding locking just to obtain correct injection status is bad
> trade-off even if HW would allow us to get away with it.

Perhaps with notification at end of copy it could be lockless.

With the current scheme, it is not possible to get it right because
the notification bit disappears temporarily from sight. And it is not
possible to distinguish between 'interrupt injected' and 'interrupt
merged', as discussed here. So would have to serialize completly along
the lines of:

Lock and only inject if can identify that interrupt handler is not
running.

If its possible to drop KVM_IRQ_LINE_STATUS, then demand APICv HW to use
recent software, fine (did not grasp Avi's second idea).


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Feb. 26, 2013, 8:12 a.m. UTC | #42
On Mon, Feb 25, 2013 at 09:01:02PM +0200, Gleb Natapov wrote:
> On Mon, Feb 25, 2013 at 08:56:07PM +0200, Avi Kivity wrote:
> > On Mon, Feb 25, 2013 at 7:43 PM, Gleb Natapov <gleb@redhat.com> wrote:
> > >
> > >> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > >> > notifiers for interrupt reinjection. This requires us to add interface
> > >> > for reporting EOI to userspace. This is not in the scope of this
> > >> > patchset. Cons: need to introduce new interface (and the one that will
> > >> > not work on AMD BTW)
> > >> >
> > >> > Other ideas?
> > >>
> > >> 1. inject RTC interrupt
> > >> 2. not see EOI
> > >> 3. inject RTC interrupt
> > >> 4. due to 2, report coalesced
> > >>
> > > That's the 3 in my list, no?
> > 
> > 
> > No, this idea doesn't involve user interface changes.  We still report
> > through KVM_IRQ_LINE_STATUS or whatever it's called.
> Interesting idea!
> 
But do not see how to implement efficiently without interface change. The
idea is basically to register ACK notifier for RTC interrupt but terminate
it in the kernel instead of reporting to userspace. Kernel should know
somehow what GSI should it register ACK notifier for. There are couple
of solutions:
 1. New interface to tell what GSI to track.
     - Cons: KVM_IRQ_LINE_STATUS will stop working for old QEMUs that do
             not use it. New special purpose API.
 2. Register ACK notifier only for RTC.
     - Cons: KVM_IRQ_LINE_STATUS will work only for RTC. This is the only
             thing that use it currently, but such change will make it one
             trick pony officially.
 3. Register ACK notifiers for all edge interrupts in IOAPIC.
     - Cons: with APICv edge interrupt does not require EOI to do vmexit.
             Registering ACK notifiers for all of them will make them all
             do vmexit.
 4. Combinations of 1 and 3. Register ACK notifiers for all edge interrupts
    in IOAPIC and provide API to drop unneeded notifiers.
     - Cons: New special purpose API.

Thoughts?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Feb. 26, 2013, 4:13 p.m. UTC | #43
On Tue, Feb 26, 2013 at 10:12 AM, Gleb Natapov <gleb@redhat.com> wrote:
>>
> But do not see how to implement efficiently without interface change. The
> idea is basically to register ACK notifier for RTC interrupt but terminate
> it in the kernel instead of reporting to userspace. Kernel should know
> somehow what GSI should it register ACK notifier for.

Right, my idea does not help at all.

> There are couple
> of solutions:
>  1. New interface to tell what GSI to track.
>      - Cons: KVM_IRQ_LINE_STATUS will stop working for old QEMUs that do
>              not use it. New special purpose API.
>  2. Register ACK notifier only for RTC.
>      - Cons: KVM_IRQ_LINE_STATUS will work only for RTC. This is the only
>              thing that use it currently, but such change will make it one
>              trick pony officially.
>  3. Register ACK notifiers for all edge interrupts in IOAPIC.
>      - Cons: with APICv edge interrupt does not require EOI to do vmexit.
>              Registering ACK notifiers for all of them will make them all
>              do vmexit.
>  4. Combinations of 1 and 3. Register ACK notifiers for all edge interrupts
>     in IOAPIC and provide API to drop unneeded notifiers.
>      - Cons: New special purpose API.
>

5. Make KVM_IRQ_LINE_STATUS register an ack notifier, document it as
more expensive than KVM_IRQ_LINE, and change qemu to use KVM_IRQ_LINE
when it is sufficient.

Pros: fully backwards compatible
Cons: to realize full performance, need to patch qemu
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 40afa00..9bd4eca 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -19,6 +19,10 @@  BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 
+#ifdef CONFIG_HAVE_KVM
+BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
+#endif
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eb92a6e..cebef02 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,6 +28,7 @@ 
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
+extern void kvm_posted_intr_ipi(void);
 extern void error_interrupt(void);
 extern void irq_work_interrupt(void);
 
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 1508e51..774dc9f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -102,6 +102,11 @@ 
  */
 #define X86_PLATFORM_IPI_VECTOR		0xf7
 
+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR		0xf2
+#endif
+
 /*
  * IRQ work vector:
  */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b8388e9..79da55e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -704,6 +704,9 @@  struct kvm_x86_ops {
 	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+	bool (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector,
+					int *result, bool *delivered);
+	void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5c9dbad..ce8ac80 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -158,6 +158,7 @@ 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+#define PIN_BASED_POSTED_INTR                   0x00000080
 
 #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
@@ -180,6 +181,7 @@ 
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID            = 0x00000000,
+	POSTED_INTR_NV                  = 0x00000002,
 	GUEST_ES_SELECTOR               = 0x00000800,
 	GUEST_CS_SELECTOR               = 0x00000802,
 	GUEST_SS_SELECTOR               = 0x00000804,
@@ -214,6 +216,8 @@  enum vmcs_field {
 	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
 	APIC_ACCESS_ADDR		= 0x00002014,
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+	POSTED_INTR_DESC_ADDR           = 0x00002016,
+	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
 	EOI_EXIT_BITMAP0                = 0x0000201c,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 70641af..781bc06 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1177,6 +1177,11 @@  apicinterrupt LOCAL_TIMER_VECTOR \
 apicinterrupt X86_PLATFORM_IPI_VECTOR \
 	x86_platform_ipi smp_x86_platform_ipi
 
+#ifdef CONFIG_HAVE_KVM
+apicinterrupt POSTED_INTR_VECTOR \
+	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+#endif
+
 apicinterrupt THRESHOLD_APIC_VECTOR \
 	threshold_interrupt smp_threshold_interrupt
 apicinterrupt THERMAL_APIC_VECTOR \
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e4595f1..64616cc 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -228,6 +228,26 @@  void smp_x86_platform_ipi(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+#ifdef CONFIG_HAVE_KVM
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	irq_enter();
+
+	exit_idle();
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+#endif
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 6e03b0d..2329a54 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -205,6 +205,10 @@  static void __init apic_intr_init(void)
 
 	/* IPI for X86 platform specific use */
 	alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
+#ifdef CONFIG_HAVE_KVM
+	/* IPI for KVM to deliver posted interrupt */
+	alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+#endif
 
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 02b51dd..9e67a9d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -357,6 +357,25 @@  static u8 count_vectors(void *bitmap)
 	return count;
 }
 
+int kvm_apic_test_irr(int vec, struct kvm_lapic *apic)
+{
+	return apic_test_vector(vec, apic->regs + APIC_IRR);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_test_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+	u32 i, pir_val;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	for (i = 0; i <= 7; i++) {
+		pir_val = xchg(&pir[i], 0);
+		if (pir_val)
+			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
 {
 	apic->irr_pending = true;
@@ -379,6 +398,7 @@  static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 	if (!apic->irr_pending)
 		return -1;
 
+	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
 	result = apic_search_irr(apic);
 	ASSERT(result == -1 || result >= 16);
 
@@ -685,6 +705,7 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 {
 	int result = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
+	bool delivered = false;
 
 	switch (delivery_mode) {
 	case APIC_DM_LOWEST:
@@ -700,7 +721,10 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		result = !apic_test_and_set_irr(vector, apic);
+		if (!kvm_x86_ops->deliver_posted_interrupt(vcpu, vector,
+						&result, &delivered))
+			result = !apic_test_and_set_irr(vector, apic);
+
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
 					  trig_mode, vector, !result);
 		if (!result) {
@@ -710,8 +734,10 @@  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			break;
 		}
 
-		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		kvm_vcpu_kick(vcpu);
+		if (!delivered) {
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
+			kvm_vcpu_kick(vcpu);
+		}
 		break;
 
 	case APIC_DM_REMRD:
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34..53b98ac 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -157,5 +157,7 @@  static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
 void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 				struct kvm_lapic_irq *irq,
 				u64 *eoi_bitmap);
+int kvm_apic_test_irr(int vec, struct kvm_lapic *apic);
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a7d60d7..9e705e3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3591,6 +3591,17 @@  static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
 	return;
 }
 
+static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+	return;
+}
+
+static bool svm_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+		int vector, int *result, bool *delivered)
+{
+	return false;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4319,6 +4330,8 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.vm_has_apicv = svm_vm_has_apicv,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.hwapic_isr_update = svm_hwapic_isr_update,
+	.sync_pir_to_irr = svm_sync_pir_to_irr,
+	.deliver_posted_interrupt = svm_deliver_posted_interrupt,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 12a91b0..a06364b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,7 +84,8 @@  module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv_reg_vid;
+static bool __read_mostly enable_apicv = 1;
+module_param(enable_apicv, bool, S_IRUGO);
 
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
@@ -365,6 +366,41 @@  struct nested_vmx {
 	struct page *apic_access_page;
 };
 
+#define POSTED_INTR_ON  0
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+	u32 pir[8];     /* Posted interrupt requested */
+	union {
+		struct {
+			u8  on:1,
+			    rsvd:7;
+		} control;
+		u32 rsvd[8];
+	} u;
+} __aligned(64);
+
+static bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+	return test_and_set_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->u.control);
+}
+
+static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+	return test_and_clear_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->u.control);
+}
+
+static int pi_test_pir(int vector, struct pi_desc *pi_desc)
+{
+	return test_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static void pi_set_pir(int vector, struct pi_desc *pi_desc)
+{
+	__set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
@@ -429,6 +465,10 @@  struct vcpu_vmx {
 
 	bool rdtscp_enabled;
 
+	/* Posted interrupt descriptor */
+	struct pi_desc pi_desc;
+	spinlock_t pi_lock;
+
 	/* Support for a guest hypervisor (nested VMX) */
 	struct nested_vmx nested;
 };
@@ -783,6 +823,18 @@  static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+	return cpu_has_vmx_apic_register_virt() &&
+		cpu_has_vmx_virtual_intr_delivery() &&
+		cpu_has_vmx_posted_intr();
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -2530,12 +2582,6 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	u32 _vmexit_control = 0;
 	u32 _vmentry_control = 0;
 
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-				&_pin_based_exec_control) < 0)
-		return -EIO;
-
 	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
@@ -2612,6 +2658,17 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_vmexit_control) < 0)
 		return -EIO;
 
+	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+				&_pin_based_exec_control) < 0)
+		return -EIO;
+
+	if (!(_cpu_based_2nd_exec_control &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
+		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
 	min = 0;
 	opt = VM_ENTRY_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
@@ -2790,11 +2847,10 @@  static __init int hardware_setup(void)
 	if (!cpu_has_vmx_ple())
 		ple_gap = 0;
 
-	if (!cpu_has_vmx_apic_register_virt() ||
-				!cpu_has_vmx_virtual_intr_delivery())
-		enable_apicv_reg_vid = 0;
+	if (!cpu_has_vmx_apicv())
+		enable_apicv = 0;
 
-	if (enable_apicv_reg_vid)
+	if (enable_apicv)
 		kvm_x86_ops->update_cr8_intercept = NULL;
 	else
 		kvm_x86_ops->hwapic_irr_update = NULL;
@@ -3871,6 +3927,59 @@  static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
 			msr, MSR_TYPE_W);
 }
 
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+	return enable_apicv && irqchip_in_kernel(kvm);
+}
+
+static bool vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+		int vector, int *result, bool *delivered)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long flags;
+
+	if (!vmx_vm_has_apicv(vcpu->kvm))
+		return false;
+
+	spin_lock_irqsave(&vmx->pi_lock, flags);
+	if (pi_test_pir(vector, &vmx->pi_desc) ||
+		kvm_apic_test_irr(vector, vcpu->arch.apic)) {
+		spin_unlock_irqrestore(&vmx->pi_lock, flags);
+		return true;
+	}
+
+	pi_set_pir(vector, &vmx->pi_desc);
+	*result = 1;
+	if (!pi_test_and_set_on(&vmx->pi_desc) &&
+			(vcpu->mode == IN_GUEST_MODE)) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+				    POSTED_INTR_VECTOR);
+		*delivered = true;
+	}
+	spin_unlock_irqrestore(&vmx->pi_lock, flags);
+
+	return true;
+}
+
+static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long flags;
+
+	if (!vmx_vm_has_apicv(vcpu->kvm))
+		return;
+
+	spin_lock_irqsave(&vmx->pi_lock, flags);
+	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
+		spin_unlock_irqrestore(&vmx->pi_lock, flags);
+		return;
+	}
+	kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+
+	spin_unlock_irqrestore(&vmx->pi_lock, flags);
+}
+
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
@@ -3931,6 +4040,15 @@  static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }
 
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+	return pin_based_exec_ctrl;
+}
+
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -3948,11 +4066,6 @@  static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
-static int vmx_vm_has_apicv(struct kvm *kvm)
-{
-	return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
-}
-
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -4008,8 +4121,7 @@  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
 	/* Control */
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-		vmcs_config.pin_based_exec_ctrl);
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -4018,13 +4130,17 @@  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				vmx_secondary_exec_control(vmx));
 	}
 
-	if (enable_apicv_reg_vid) {
+	if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
 		vmcs_write64(EOI_EXIT_BITMAP2, 0);
 		vmcs_write64(EOI_EXIT_BITMAP3, 0);
 
 		vmcs_write16(GUEST_INTR_STATUS, 0);
+
+		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+		spin_lock_init(&vmx->pi_lock);
 	}
 
 	if (ple_gap) {
@@ -4174,6 +4290,9 @@  static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write64(APIC_ACCESS_ADDR,
 			     page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
 
+	if (vmx_vm_has_apicv(vcpu->kvm))
+		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
+
 	if (vmx->vpid != 0)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
@@ -7639,6 +7758,8 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
 	.hwapic_irr_update = vmx_hwapic_irr_update,
 	.hwapic_isr_update = vmx_hwapic_isr_update,
+	.sync_pir_to_irr = vmx_sync_pir_to_irr,
+	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
@@ -7742,7 +7863,7 @@  static int __init vmx_init(void)
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
 
-	if (enable_apicv_reg_vid) {
+	if (enable_apicv) {
 		for (msr = 0x800; msr <= 0x8ff; msr++)
 			vmx_disable_intercept_msr_read_x2apic(msr);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1fa37e..62f8c94 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2679,6 +2679,7 @@  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
+	kvm_x86_ops->sync_pir_to_irr(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
 
 	return 0;