Message ID | 1361540552-2016-3-git-send-email-yang.z.zhang@intel.com (mailing list archive)
---|---
State | New, archived
On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
> From: Yang Zhang <yang.z.zhang@Intel.com>
>
> Posted Interrupt allows APIC interrupts to be injected into the guest
> directly without any vmexit.
>
> - When delivering an interrupt to the guest, if the target vcpu is
>   running, update the Posted-interrupt requests bitmap and send a
>   notification event to the vcpu. Then the vcpu will handle this
>   interrupt automatically, without any software involvement.
>
> - If the target vcpu is not running, or there is already a notification
>   event pending in the vcpu, do nothing. The interrupt will be handled
>   by the next vm entry.
>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> ---
> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> index e4595f1..64616cc 100644
> --- a/arch/x86/kernel/irq.c
> +++ b/arch/x86/kernel/irq.c
> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>  	set_irq_regs(old_regs);
>  }
>
> +#ifdef CONFIG_HAVE_KVM
> +/*
> + * Handler for POSTED_INTERRUPT_VECTOR.
> + */
> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
> +{
> +	struct pt_regs *old_regs = set_irq_regs(regs);
> +
> +	ack_APIC_irq();
> +
> +	irq_enter();
> +
> +	exit_idle();
> +
> +	irq_exit();
> +
> +	set_irq_regs(old_regs);
> +}
> +#endif
> +

Add per-cpu counters, similarly to other handlers in the same file.

> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
>  	if (!apic->irr_pending)
>  		return -1;
>
> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>  	result = apic_search_irr(apic);
>  	ASSERT(result == -1 || result >= 16);

kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
before inject_pending_event:

	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
		-> inject_pending_event
		...
	}

> @@ -685,6 +705,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  {
>  	int result = 0;
>  	struct kvm_vcpu *vcpu = apic->vcpu;
> +	bool delivered = false;
>
>  	switch (delivery_mode) {
>  	case APIC_DM_LOWEST:
> @@ -700,7 +721,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  		} else
>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>
> -		result = !apic_test_and_set_irr(vector, apic);
> +		if (!kvm_x86_ops->deliver_posted_interrupt(vcpu, vector,
> +						&result, &delivered))
> +			result = !apic_test_and_set_irr(vector, apic);
> +
>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>  					  trig_mode, vector, !result);
>  		if (!result) {
> @@ -710,8 +734,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  			break;
>  		}
>
> -		kvm_make_request(KVM_REQ_EVENT, vcpu);
> -		kvm_vcpu_kick(vcpu);
> +		if (!delivered) {
> +			kvm_make_request(KVM_REQ_EVENT, vcpu);
> +			kvm_vcpu_kick(vcpu);
> +		}

This set bit / kick pair should be for non-APICv only (please check for
it directly).

> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
> +}
> +
> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
> +{
> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> +}

This must be a locked atomic operation (set_bit).

> +
>  struct vcpu_vmx {
>  	struct kvm_vcpu       vcpu;
>  	unsigned long         host_rsp;
> @@ -429,6 +465,10 @@ struct vcpu_vmx {
>
>  	bool rdtscp_enabled;
>
> +	/* Posted interrupt descriptor */
> +	struct pi_desc pi_desc;
> +	spinlock_t pi_lock;

Don't see why the lock is necessary. Please use the following pseudocode
for vmx_deliver_posted_interrupt:

vmx_deliver_posted_interrupt(), returns 'bool injected'.

1. orig_irr = read irr from vapic page
2. if (orig_irr != 0)
3. 	return false;
4. kvm_make_request(KVM_REQ_EVENT)
5. bool injected = !test_and_set_bit(PIR)
6. if (vcpu->guest_mode && injected)
7. 	if (test_and_set_bit(PIR notification bit))
8. 		send PIR IPI
9. return injected

On vcpu_enter_guest:

	if (kvm_check_request(KVM_REQ_EVENT)) {
		pir->virr sync			(*)
		...
	}
	...
	vcpu->mode = IN_GUEST_MODE;
	local_irq_disable
	clear pir notification bit unconditionally (*)

Right after local_irq_disable.

> +static bool vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
> +		int vector, int *result, bool *delivered)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long flags;
> +
> +	if (!vmx_vm_has_apicv(vcpu->kvm))
> +		return false;
> +
> +	spin_lock_irqsave(&vmx->pi_lock, flags);
> +	if (pi_test_pir(vector, &vmx->pi_desc) ||
> +		kvm_apic_test_irr(vector, vcpu->arch.apic)) {
> +		spin_unlock_irqrestore(&vmx->pi_lock, flags);
> +		return true;
> +	}
> +
> +	pi_set_pir(vector, &vmx->pi_desc);
> +	*result = 1;
> +	if (!pi_test_and_set_on(&vmx->pi_desc) &&
> +			(vcpu->mode == IN_GUEST_MODE)) {
> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
> +		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> +					POSTED_INTR_VECTOR);
> +		*delivered = true;
> +	}
> +	spin_unlock_irqrestore(&vmx->pi_lock, flags);
> +
> +	return true;
> +}

Please confirm whether a spinlock is necessary with the pseudocode above.

> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long flags;
> +
> +	if (!vmx_vm_has_apicv(vcpu->kvm))
> +		return;
> +
> +	spin_lock_irqsave(&vmx->pi_lock, flags);
> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {

This needs to move to vcpu_enter_guest, after local_irq_disable(),
unconditionally, as noted.

> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
>  		struct kvm_lapic_state *s)
>  {
> +	kvm_x86_ops->sync_pir_to_irr(vcpu);
>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>
>  	return 0;
> --
> 1.7.1
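For readers following the thread: in C, the pseudocode above corresponds roughly to the sketch below. It is illustrative only; helper names (kvm_apic_test_irr, pi_test_and_set_on) are borrowed from the patch under review, and step 7 is rendered as sending the IPI only when the notification (ON) bit transitions from clear to set, which appears to be the intent (it matches the patch's use of pi_test_and_set_on).

/* Sketch of the proposed lockless delivery; returns true if the
 * interrupt was freshly injected, false if coalesced. */
static bool vmx_deliver_posted_interrupt_sketch(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool injected;

	/* 1-3: vector already pending in IRR -> coalesced */
	if (kvm_apic_test_irr(vector, vcpu->arch.apic))
		return false;

	/* 4: make the target vcpu re-evaluate pending events */
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/* 5: locked RMW on the PIR; a set bit means coalesced */
	injected = !test_and_set_bit(vector,
				     (unsigned long *)vmx->pi_desc.pir);

	/* 6-8: notify only a vcpu in guest mode, and only once per
	 * outstanding notification (ON acts as a latch) */
	if (vcpu->mode == IN_GUEST_MODE && injected &&
	    !pi_test_and_set_on(&vmx->pi_desc))
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				    POSTED_INTR_VECTOR);

	/* 9 */
	return injected;
}

The race dissected in the replies below is exactly about the window between steps 1 and 5.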
Marcelo Tosatti wrote on 2013-02-23:
>
> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>
>> Posted Interrupt allows APIC interrupts to be injected into the guest
>> directly without any vmexit.
>>
>> - When delivering an interrupt to the guest, if the target vcpu is
>>   running, update the Posted-interrupt requests bitmap and send a
>>   notification event to the vcpu. Then the vcpu will handle this
>>   interrupt automatically, without any software involvement.
>> - If the target vcpu is not running, or there is already a notification
>>   event pending in the vcpu, do nothing. The interrupt will be handled
>>   by the next vm entry.
>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>> ---
>>
>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>> index e4595f1..64616cc 100644
>> --- a/arch/x86/kernel/irq.c
>> +++ b/arch/x86/kernel/irq.c
>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>>  	set_irq_regs(old_regs);
>>  }
>> +#ifdef CONFIG_HAVE_KVM
>> +/*
>> + * Handler for POSTED_INTERRUPT_VECTOR.
>> + */
>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>> +{
>> +	struct pt_regs *old_regs = set_irq_regs(regs);
>> +
>> +	ack_APIC_irq();
>> +
>> +	irq_enter();
>> +
>> +	exit_idle();
>> +
>> +	irq_exit();
>> +
>> +	set_irq_regs(old_regs);
>> +}
>> +#endif
>> +
>
> Add per-cpu counters, similarly to other handlers in the same file.
Sure.

>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
>>  	if (!apic->irr_pending)
>>  		return -1;
>> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>>  	result = apic_search_irr(apic);
>>  	ASSERT(result == -1 || result >= 16);
>
> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
> before inject_pending_event.
>
> 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> 		-> inject_pending_event
> 		...
> 	}
Some other places search the irr to do something, like
kvm_cpu_has_interrupt(). So we must sync pir to irr before touching the
irr, not just before entering the guest.

>
>> @@ -685,6 +705,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  {
>>  	int result = 0;
>>  	struct kvm_vcpu *vcpu = apic->vcpu;
>> +	bool delivered = false;
>>
>>  	switch (delivery_mode) {
>>  	case APIC_DM_LOWEST:
>> @@ -700,7 +721,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  		} else
>>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>> -		result = !apic_test_and_set_irr(vector, apic);
>> +		if (!kvm_x86_ops->deliver_posted_interrupt(vcpu, vector,
>> +						&result, &delivered))
>> +			result = !apic_test_and_set_irr(vector, apic);
>> +
>>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>>  					  trig_mode, vector, !result);
>>  		if (!result) {
>> @@ -710,8 +734,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  			break;
>>  		}
>> -		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> -		kvm_vcpu_kick(vcpu);
>> +		if (!delivered) {
>> +			kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +			kvm_vcpu_kick(vcpu);
>> +		}
>
> This set bit / kick pair should be for non-APICv only (please
> check for it directly).
Sure.

>> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
>> +}
>> +
>> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
>> +{
>> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
>> +}
>
> This must be a locked atomic operation (set_bit).
If using a spinlock, pi_set_pir is not called concurrently. It is safe
to use __set_bit.

>> +
>>  struct vcpu_vmx {
>>  	struct kvm_vcpu       vcpu;
>>  	unsigned long         host_rsp;
>> @@ -429,6 +465,10 @@ struct vcpu_vmx {
>>
>>  	bool rdtscp_enabled;
>> +	/* Posted interrupt descriptor */
>> +	struct pi_desc pi_desc;
>> +	spinlock_t pi_lock;
>
> Don't see why the lock is necessary. Please use the following
> pseudocode for vmx_deliver_posted_interrupt:
>
> vmx_deliver_posted_interrupt(), returns 'bool injected'.
>
> 1. orig_irr = read irr from vapic page
> 2. if (orig_irr != 0)
> 3. 	return false;
> 4. kvm_make_request(KVM_REQ_EVENT)
> 5. bool injected = !test_and_set_bit(PIR)
> 6. if (vcpu->guest_mode && injected)
> 7. 	if (test_and_set_bit(PIR notification bit))
> 8. 		send PIR IPI
> 9. return injected
Consider the following case:

vcpu 0                                  | vcpu 1
send intr to vcpu1                      |
check irr                               |
                                        | receive a posted intr
                                        | pir->irr (pir is cleared, irr is set)
injected = test_and_set_bit = true      |
pir = set                               |

Then both irr and pir have the interrupt pending, and they may merge
into one later, but injected is reported as true. Wrong.

> On vcpu_enter_guest:
>
> 	if (kvm_check_request(KVM_REQ_EVENT)) {
> 		pir->virr sync			(*)
> 		...
> 	}
> 	...
> 	vcpu->mode = IN_GUEST_MODE;
> 	local_irq_disable
> 	clear pir notification bit unconditionally (*)
Why clear the ON bit here? If there is no pir->irr operation in this
vmentry, clearing ON here is redundant.

> Right after local_irq_disable.
>
>> +static bool vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
>> +		int vector, int *result, bool *delivered)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	unsigned long flags;
>> +
>> +	if (!vmx_vm_has_apicv(vcpu->kvm))
>> +		return false;
>> +
>> +	spin_lock_irqsave(&vmx->pi_lock, flags);
>> +	if (pi_test_pir(vector, &vmx->pi_desc) ||
>> +		kvm_apic_test_irr(vector, vcpu->arch.apic)) {
>> +		spin_unlock_irqrestore(&vmx->pi_lock, flags);
>> +		return true;
>> +	}
>> +
>> +	pi_set_pir(vector, &vmx->pi_desc);
>> +	*result = 1;
>> +	if (!pi_test_and_set_on(&vmx->pi_desc) &&
>> +			(vcpu->mode == IN_GUEST_MODE)) {
>> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>> +					POSTED_INTR_VECTOR);
>> +		*delivered = true;
>> +	}
>> +	spin_unlock_irqrestore(&vmx->pi_lock, flags);
>> +
>> +	return true;
>> +}
>
> Please confirm whether a spinlock is necessary with the pseudocode above.
In the current implementation, the spinlock is necessary to avoid a race
condition between vcpus when delivering a posted interrupt and syncing
pir->irr.

>> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	unsigned long flags;
>> +
>> +	if (!vmx_vm_has_apicv(vcpu->kvm))
>> +		return;
>> +
>> +	spin_lock_irqsave(&vmx->pi_lock, flags);
>
>> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
>
> This needs to move to vcpu_enter_guest, after local_irq_disable(),
> unconditionally, as noted.
>
>> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
>>  		struct kvm_lapic_state *s)
>>  {
>> +	kvm_x86_ops->sync_pir_to_irr(vcpu);
>>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>>
>>  	return 0;
>> --
>> 1.7.1

Best regards,
Yang
On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> >> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
> >> +}
> >> +
> >> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
> >> +{
> >> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> >> +}
> >
> > This must be a locked atomic operation (set_bit).
> If using a spinlock, pi_set_pir is not called concurrently. It is safe
> to use __set_bit.
>
HW still accesses it concurrently without bothering to take your lock.

> >> +
> >> struct vcpu_vmx {
> >> 	struct kvm_vcpu       vcpu;
> >> 	unsigned long         host_rsp;
> >> @@ -429,6 +465,10 @@ struct vcpu_vmx {
> >>
> >> 	bool rdtscp_enabled;
> >> +	/* Posted interrupt descriptor */
> >> +	struct pi_desc pi_desc;
> >> +	spinlock_t pi_lock;
> >
> > Don't see why the lock is necessary. Please use the following
> > pseudocode for vmx_deliver_posted_interrupt:
> >
> > vmx_deliver_posted_interrupt(), returns 'bool injected'.
> >
> > 1. orig_irr = read irr from vapic page
> > 2. if (orig_irr != 0)
> > 3. 	return false;
> > 4. kvm_make_request(KVM_REQ_EVENT)
> > 5. bool injected = !test_and_set_bit(PIR)
> > 6. if (vcpu->guest_mode && injected)
> > 7. 	if (test_and_set_bit(PIR notification bit))
> > 8. 		send PIR IPI
> > 9. return injected
>
> Consider the following case:
>
> vcpu 0                                  | vcpu 1
> send intr to vcpu1                      |
> check irr                               |
>                                         | receive a posted intr
>                                         | pir->irr (pir is cleared, irr is set)
> injected = test_and_set_bit = true      |
> pir = set                               |
>
> Then both irr and pir have the interrupt pending, and they may merge
> into one later, but injected is reported as true. Wrong.
>
I and Marcelo discussed the lockless logic that should be used here on
the previous patch submission. All that is left for you is to implement
it. We worked hard to make the irq injection path lockless; we are not
going to add a lock now.

--
	Gleb.
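To make the point about locked accesses concrete (a generic illustration, not the thread's final code): __set_bit() is a plain read-modify-write, while set_bit() uses a lock-prefixed instruction. The PIR is read and cleared by the CPU itself while the vcpu runs in VMX non-root mode, and no software spinlock can order that hardware access, so the update must look like this:

/* The posted-interrupt descriptor is shared with the CPU in VMX
 * non-root mode, so PIR updates must be locked RMW operations. */
static void pi_set_pir(int vector, struct pi_desc *pi_desc)
{
	set_bit(vector, (unsigned long *)pi_desc->pir);	/* lock bts */
}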
On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> Marcelo Tosatti wrote on 2013-02-23:
> >
> > On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
> >> From: Yang Zhang <yang.z.zhang@Intel.com>
> >>
> >> Posted Interrupt allows APIC interrupts to be injected into the guest
> >> directly without any vmexit.
> >>
> >> - When delivering an interrupt to the guest, if the target vcpu is
> >>   running, update the Posted-interrupt requests bitmap and send a
> >>   notification event to the vcpu. Then the vcpu will handle this
> >>   interrupt automatically, without any software involvement.
> >> - If the target vcpu is not running, or there is already a
> >>   notification event pending in the vcpu, do nothing. The interrupt
> >>   will be handled by the next vm entry.
> >> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >> ---
> >>
> >> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> >> index e4595f1..64616cc 100644
> >> --- a/arch/x86/kernel/irq.c
> >> +++ b/arch/x86/kernel/irq.c
> >> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
> >>  	set_irq_regs(old_regs);
> >>  }
> >> +#ifdef CONFIG_HAVE_KVM
> >> +/*
> >> + * Handler for POSTED_INTERRUPT_VECTOR.
> >> + */
> >> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
> >> +{
> >> +	struct pt_regs *old_regs = set_irq_regs(regs);
> >> +
> >> +	ack_APIC_irq();
> >> +
> >> +	irq_enter();
> >> +
> >> +	exit_idle();
> >> +
> >> +	irq_exit();
> >> +
> >> +	set_irq_regs(old_regs);
> >> +}
> >> +#endif
> >> +
> >
> > Add per-cpu counters, similarly to other handlers in the same file.
> Sure.
>
> >> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
> >>  	if (!apic->irr_pending)
> >>  		return -1;
> >> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
> >>  	result = apic_search_irr(apic);
> >>  	ASSERT(result == -1 || result >= 16);
> >
> > kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
> > before inject_pending_event.
> >
> > 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> > 		-> inject_pending_event
> > 		...
> > 	}
> Some other places search the irr to do something, like
> kvm_cpu_has_interrupt(). So we must sync pir to irr before touching the
> irr, not just before entering the guest.

I see. The only call site that needs IRR/PIR information with posted
interrupt enabled is kvm_arch_vcpu_runnable, correct?

If so, can we contain reading PIR to only that location? And have
PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the standard
way of event processing and also reduce reading the PIR).

> >> +{
> >> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> >> +}
> >
> > This must be a locked atomic operation (set_bit).
> If using a spinlock, pi_set_pir is not called concurrently. It is safe
> to use __set_bit.

The manual demands atomic locked accesses (remember this is a remote
access). See the posted interrupt page.

> >
> > Don't see why the lock is necessary. Please use the following
> > pseudocode for vmx_deliver_posted_interrupt:
> >
> > vmx_deliver_posted_interrupt(), returns 'bool injected'.
> >
> > 1. orig_irr = read irr from vapic page
> > 2. if (orig_irr != 0)
> > 3. 	return false;
> > 4. kvm_make_request(KVM_REQ_EVENT)
> > 5. bool injected = !test_and_set_bit(PIR)
> > 6. if (vcpu->guest_mode && injected)
> > 7. 	if (test_and_set_bit(PIR notification bit))
> > 8. 		send PIR IPI
> > 9. return injected
>
> Consider the following case:
>
> vcpu 0                                  | vcpu 1
> send intr to vcpu1                      |
> check irr                               |
>                                         | receive a posted intr
>                                         | pir->irr (pir is cleared, irr is set)
> injected = test_and_set_bit = true      |
> pir = set                               |
>
> Then both irr and pir have the interrupt pending, and they may merge
> into one later, but injected is reported as true. Wrong.

True. Have a lock around it and at step 1 also verify if PIR is set.
That would do it, yes?

> > On vcpu_enter_guest:
> >
> > 	if (kvm_check_request(KVM_REQ_EVENT)) {
> > 		pir->virr sync			(*)
> > 		...
> > 	}
> > 	...
> > 	vcpu->mode = IN_GUEST_MODE;
> > 	local_irq_disable
> > 	clear pir notification bit unconditionally (*)
> Why clear the ON bit here? If there is no pir->irr operation in this
> vmentry, clearing ON here is redundant.

A PIR IPI must not be lost. Because if it's lost, then interrupt
injection can be delayed while it must be performed immediately.

vcpu entry path has:
1. vcpu->mode = GUEST_MODE
2. local_irq_disable

injection path has:
1. if (vcpu->guest_mode && test_and_set_bit(PIR notif bit))
       send IPI

So it is possible that a PIR IPI is delivered and handled before guest
entry.

By clearing the PIR notification bit after local_irq_disable, but before
the last check for vcpu->requests, we guarantee that a PIR IPI is never
lost.

Makes sense? (please check the logic, it might be flawed).

> >
> > Please confirm whether a spinlock is necessary with the pseudocode above.
> In the current implementation, the spinlock is necessary to avoid a race
> condition between vcpus when delivering a posted interrupt and syncing
> pir->irr.

Sync PIR->IRR uses xchg, which is locked and atomic, so I can't see the
need for the spinlock there.

About serializing concurrent injectors, yes, you're right. OK then.
Please use the pseudocode with a spinlock (the order of setting
KVM_REQ_EVENT etc must be there).

> >> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
> >> +{
> >> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> +	unsigned long flags;
> >> +
> >> +	if (!vmx_vm_has_apicv(vcpu->kvm))
> >> +		return;
> >> +
> >> +	spin_lock_irqsave(&vmx->pi_lock, flags);
> >
> >> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
> >
> > This needs to move to vcpu_enter_guest, after local_irq_disable(),
> > unconditionally, as noted.
>
> >> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
> >>  		struct kvm_lapic_state *s)
> >>  {
> >> +	kvm_x86_ops->sync_pir_to_irr(vcpu);
> >>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
> >>
> >>  	return 0;
> >> --
> >> 1.7.1
>
> Best regards,
> Yang
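Marcelo's ordering argument, sketched as code for clarity. This is illustrative only; he explicitly asks for the logic to be checked, and the follow-ups below probe exactly this ordering. The barrier is an assumption: the real vcpu_enter_guest relies on an equivalent ordering between the mode store and the request check.

/* Injection side (sketch): */
set_bit(vector, (unsigned long *)pi_desc->pir);
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (vcpu->mode == IN_GUEST_MODE && !pi_test_and_set_on(pi_desc))
	apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR);

/* Entry side (sketch of the proposed vcpu_enter_guest ordering): */
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
	kvm_x86_ops->sync_pir_to_irr(vcpu);	/* PIR -> IRR */
	inject_pending_event(vcpu);
}
/* ... */
vcpu->mode = IN_GUEST_MODE;
smp_mb();				/* pair with the injector's mode check */
local_irq_disable();
pi_test_and_clear_on(&vmx->pi_desc);	/* unconditionally, per the proposal */
if (vcpu->requests) {			/* the injector's KVM_REQ_EVENT is seen here */
	vcpu->mode = OUTSIDE_GUEST_MODE;
	local_irq_enable();
	/* redo request/event processing before entering the guest */
}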
On Sat, Feb 23, 2013 at 04:35:30PM +0200, Gleb Natapov wrote:
> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> > >> +	return test_bit(vector, (unsigned long *)pi_desc->pir);
> > >> +}
> > >> +
> > >> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
> > >> +{
> > >> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
> > >> +}
> > >
> > > This must be a locked atomic operation (set_bit).
> > If using a spinlock, pi_set_pir is not called concurrently. It is safe
> > to use __set_bit.
> >
> HW still accesses it concurrently without bothering to take your lock.
>
> > >> +
> > >> struct vcpu_vmx {
> > >> 	struct kvm_vcpu       vcpu;
> > >> 	unsigned long         host_rsp;
> > >> @@ -429,6 +465,10 @@ struct vcpu_vmx {
> > >>
> > >> 	bool rdtscp_enabled;
> > >> +	/* Posted interrupt descriptor */
> > >> +	struct pi_desc pi_desc;
> > >> +	spinlock_t pi_lock;
> > >
> > > Don't see why the lock is necessary. Please use the following
> > > pseudocode for vmx_deliver_posted_interrupt:
> > >
> > > vmx_deliver_posted_interrupt(), returns 'bool injected'.
> > >
> > > 1. orig_irr = read irr from vapic page
> > > 2. if (orig_irr != 0)
> > > 3. 	return false;
> > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > 5. bool injected = !test_and_set_bit(PIR)
> > > 6. if (vcpu->guest_mode && injected)
> > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > 8. 		send PIR IPI
> > > 9. return injected
> >
> > Consider the following case:
> >
> > vcpu 0                                  | vcpu 1
> > send intr to vcpu1                      |
> > check irr                               |
> >                                         | receive a posted intr
> >                                         | pir->irr (pir is cleared, irr is set)
> > injected = test_and_set_bit = true      |
> > pir = set                               |
> >
> > Then both irr and pir have the interrupt pending, and they may merge
> > into one later, but injected is reported as true. Wrong.
> >
> I and Marcelo discussed the lockless logic that should be used here on
> the previous patch submission. All that is left for you is to implement
> it. We worked hard to make the irq injection path lockless; we are not
> going to add a lock now.

He is right, the scheme is still flawed (because of concurrent injectors
along with HW in VMX non-root). As I said, let's add a spinlock and
think about a lockless scheme in the meantime.
Marcelo Tosatti wrote on 2013-02-23:
> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
>> Marcelo Tosatti wrote on 2013-02-23:
>>>
>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>>
>>>> Posted Interrupt allows APIC interrupts to be injected into the guest
>>>> directly without any vmexit.
>>>>
>>>> - When delivering an interrupt to the guest, if the target vcpu is
>>>>   running, update the Posted-interrupt requests bitmap and send a
>>>>   notification event to the vcpu. Then the vcpu will handle this
>>>>   interrupt automatically, without any software involvement.
>>>> - If the target vcpu is not running, or there is already a
>>>>   notification event pending in the vcpu, do nothing. The interrupt
>>>>   will be handled by the next vm entry.
>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>> ---
>>>>
>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>>>> index e4595f1..64616cc 100644
>>>> --- a/arch/x86/kernel/irq.c
>>>> +++ b/arch/x86/kernel/irq.c
>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>>>>  	set_irq_regs(old_regs);
>>>>  }
>>>> +#ifdef CONFIG_HAVE_KVM
>>>> +/*
>>>> + * Handler for POSTED_INTERRUPT_VECTOR.
>>>> + */
>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>>>> +{
>>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
>>>> +
>>>> +	ack_APIC_irq();
>>>> +
>>>> +	irq_enter();
>>>> +
>>>> +	exit_idle();
>>>> +
>>>> +	irq_exit();
>>>> +
>>>> +	set_irq_regs(old_regs);
>>>> +}
>>>> +#endif
>>>> +
>>>
>>> Add per-cpu counters, similarly to other handlers in the same file.
>> Sure.
>>
>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
>>>>  	if (!apic->irr_pending) return -1;
>>>> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>>>>  	result = apic_search_irr(apic);
>>>>  	ASSERT(result == -1 || result >= 16);
>>>
>>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
>>> before inject_pending_event.
>>>
>>> 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
>>> 		-> inject_pending_event
>>> 		...
>>> 	}
>> Some other places search the irr to do something, like
>> kvm_cpu_has_interrupt(). So we must sync pir to irr before touching the
>> irr, not just before entering the guest.
>
> I see. The only call site that needs IRR/PIR information with posted
> interrupt enabled is kvm_arch_vcpu_runnable, correct?
Yes.

> If so, can we contain reading PIR to only that location?
Yes, we can do it. Actually, the reason I do pir->irr here is to avoid
future wrong usage that checks for a pending interrupt just by calling
kvm_vcpu_has_interrupt(). Also, an extra callback may be needed to check
the pir. So you think your suggestion is better?

> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
> standard way of event processing and also reduce reading the PIR).
Since we check the ON bit before each read of the PIR, the cost can be
ignored.

>>>> +{
>>>> +	__set_bit(vector, (unsigned long *)pi_desc->pir);
>>>> +}
>>>
>>> This must be a locked atomic operation (set_bit).
>> If using a spinlock, pi_set_pir is not called concurrently. It is safe
>> to use __set_bit.
>
> The manual demands atomic locked accesses (remember this is a remote
> access). See the posted interrupt page.
You are right.

>>>
>>> Don't see why the lock is necessary. Please use the following
>>> pseudocode for vmx_deliver_posted_interrupt:
>>>
>>> vmx_deliver_posted_interrupt(), returns 'bool injected'.
>>>
>>> 1. orig_irr = read irr from vapic page
>>> 2. if (orig_irr != 0)
>>> 3. 	return false;
>>> 4. kvm_make_request(KVM_REQ_EVENT)
>>> 5. bool injected = !test_and_set_bit(PIR)
>>> 6. if (vcpu->guest_mode && injected)
>>> 7. 	if (test_and_set_bit(PIR notification bit))
>>> 8. 		send PIR IPI
>>> 9. return injected
>>
>> Consider the following case:
>>
>> vcpu 0                                  | vcpu 1
>> send intr to vcpu1                      |
>> check irr                               |
>>                                         | receive a posted intr
>>                                         | pir->irr (pir is cleared, irr is set)
>> injected = test_and_set_bit = true      |
>> pir = set                               |
>>
>> Then both irr and pir have the interrupt pending, and they may merge
>> into one later, but injected is reported as true. Wrong.
>
> True. Have a lock around it and at step 1 also verify if PIR is set.
> That would do it, yes?
Yes.

>>> On vcpu_enter_guest:
>>>
>>> 	if (kvm_check_request(KVM_REQ_EVENT)) {
>>> 		pir->virr sync			(*)
>>> 		...
>>> 	}
>>> 	...
>>> 	vcpu->mode = IN_GUEST_MODE;
>>> 	local_irq_disable
>>> 	clear pir notification bit unconditionally (*)
>> Why clear the ON bit here? If there is no pir->irr operation in this
>> vmentry, clearing ON here is redundant.
>
> A PIR IPI must not be lost. Because if it's lost, then interrupt
> injection can be delayed while it must be performed immediately.
>
> vcpu entry path has:
> 1. vcpu->mode = GUEST_MODE
> 2. local_irq_disable
>
> injection path has:
> 1. if (vcpu->guest_mode && test_and_set_bit(PIR notif bit))
>        send IPI
>
> So it is possible that a PIR IPI is delivered and handled before guest
> entry.
>
> By clearing the PIR notification bit after local_irq_disable, but before
> the last check for vcpu->requests, we guarantee that a PIR IPI is never
> lost.
>
> Makes sense? (please check the logic, it might be flawed).
I am not sure whether I understand you. But I don't think the IPI will
be lost in the current patch:

	if (!pi_test_and_set_on(&vmx->pi_desc) &&
			(vcpu->mode == IN_GUEST_MODE)) {
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
					POSTED_INTR_VECTOR);
		*delivered = true;
	}

vcpu entry has:
	vcpu->mode = GUEST_MODE
	local irq disable
	check request

We send the IPI after making the request; if the IPI is received after
guest_mode is set but before irqs are disabled, it will still be handled
by the following request check. Please correct me if I am wrong.

>>>
>>> Please confirm whether a spinlock is necessary with the pseudocode above.
>> In the current implementation, the spinlock is necessary to avoid a race
>> condition between vcpus when delivering a posted interrupt and syncing
>> pir->irr.
>
> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
> need for the spinlock there.
In sync_pir_to_irr():
	tmp_pir = xchg(pir, 0)
	irr |= tmp_pir

The combination is not atomic, so the lock is still needed.

> About serializing concurrent injectors, yes, you're right. OK then.
>
> Please use the pseudocode with a spinlock (the order of setting
> KVM_REQ_EVENT etc must be there).
>
>>>> +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>> +	unsigned long flags;
>>>> +
>>>> +	if (!vmx_vm_has_apicv(vcpu->kvm))
>>>> +		return;
>>>> +
>>>> +	spin_lock_irqsave(&vmx->pi_lock, flags);
>>>
>>>> +	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
>>>
>>> This needs to move to vcpu_enter_guest, after local_irq_disable(),
>>> unconditionally, as noted.
>>>
>>>> @@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
>>>>  		struct kvm_lapic_state *s)
>>>>  {
>>>> +	kvm_x86_ops->sync_pir_to_irr(vcpu);
>>>>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>>>>
>>>>  	return 0;
>>>> --
>>>> 1.7.1
>>>
>> Best regards,
>> Yang

Best regards,
Yang
On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
> > > > 1. orig_irr = read irr from vapic page
> > > > 2. if (orig_irr != 0)
> > > > 3. 	return false;
> > > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > > 5. bool injected = !test_and_set_bit(PIR)
> > > > 6. if (vcpu->guest_mode && injected)
> > > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > > 8. 		send PIR IPI
> > > > 9. return injected
> > >
> > > Consider the following case:
> > >
> > > vcpu 0                                  | vcpu 1
> > > send intr to vcpu1                      |
> > > check irr                               |
> > >                                         | receive a posted intr
> > >                                         | pir->irr (pir is cleared, irr is set)
> > > injected = test_and_set_bit = true      |
> > > pir = set                               |
> > >
> > > Then both irr and pir have the interrupt pending, and they may merge
> > > into one later, but injected is reported as true. Wrong.
> > >
> > I and Marcelo discussed the lockless logic that should be used here on
> > the previous patch submission. All that is left for you is to implement
> > it. We worked hard to make the irq injection path lockless; we are not
> > going to add a lock now.
>
> He is right, the scheme is still flawed (because of concurrent injectors
> along with HW in VMX non-root). As I said, let's add a spinlock and
> think about a lockless scheme in the meantime.

The logic proposed was (from that thread):

	apic_accept_interrupt() {
		if (PIR || IRR)
			return coalesced;
		else
			set PIR;
	}

Which should map to something like:

	if (!test_and_set_bit(PIR))
		return coalesced;
	if (irr on vapic page is set)
		return coalesced;

I do not see how the race above can happen this way. Others can though,
if the vcpu is outside a guest. Maybe we should deliver interrupts
differently depending on whether the vcpu is in guest mode or not.

I'd rather think about a proper way to do lockless injection before
committing anything. The case where we care about correct injection
status is rare, but we always care about scalability, and since we
violate the spec by reading the vapic page while the vcpu is in non-root
operation anyway, the result may be incorrect with or without the lock.
Our use case was not in the HW designers' minds when they designed this
thing, obviously :(

--
	Gleb.
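Mapped into C, the accept logic reads roughly as follows. A sketch only: as written in the mail the first test appears inverted, so this follows the stated intent (report coalesced when the bit was already set), and kvm_apic_test_irr is an assumed helper for the vapic-page read.

/* Lockless accept: PIR first, then IRR. The window in which the CPU
 * moves PIR into IRR between the two tests (e.g. due to a notification
 * for a different vector) is the hole discussed in the replies. */
static bool posted_intr_accept(struct kvm_vcpu *vcpu, int vector)
{
	struct pi_desc *pi = &to_vmx(vcpu)->pi_desc;

	if (test_and_set_bit(vector, (unsigned long *)pi->pir))
		return false;		/* coalesced: already in PIR */
	if (kvm_apic_test_irr(vector, vcpu->arch.apic))
		return false;		/* coalesced: already in IRR */
	return true;			/* freshly injected */
}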
On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote:
> Marcelo Tosatti wrote on 2013-02-23:
> >
> > On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
> >> Marcelo Tosatti wrote on 2013-02-23:
> >>>
> >>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
> >>>> From: Yang Zhang <yang.z.zhang@Intel.com>
> >>>>
> >>>> Posted Interrupt allows APIC interrupts to be injected into the
> >>>> guest directly without any vmexit.
> >>>>
> >>>> - When delivering an interrupt to the guest, if the target vcpu is
> >>>>   running, update the Posted-interrupt requests bitmap and send a
> >>>>   notification event to the vcpu. Then the vcpu will handle this
> >>>>   interrupt automatically, without any software involvement.
> >>>> - If the target vcpu is not running, or there is already a
> >>>>   notification event pending in the vcpu, do nothing. The interrupt
> >>>>   will be handled by the next vm entry.
> >>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> >>>> ---
> >>>>
> >>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> >>>> index e4595f1..64616cc 100644
> >>>> --- a/arch/x86/kernel/irq.c
> >>>> +++ b/arch/x86/kernel/irq.c
> >>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
> >>>>  	set_irq_regs(old_regs);
> >>>>  }
> >>>> +#ifdef CONFIG_HAVE_KVM
> >>>> +/*
> >>>> + * Handler for POSTED_INTERRUPT_VECTOR.
> >>>> + */
> >>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
> >>>> +{
> >>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
> >>>> +
> >>>> +	ack_APIC_irq();
> >>>> +
> >>>> +	irq_enter();
> >>>> +
> >>>> +	exit_idle();
> >>>> +
> >>>> +	irq_exit();
> >>>> +
> >>>> +	set_irq_regs(old_regs);
> >>>> +}
> >>>> +#endif
> >>>> +
> >>>
> >>> Add per-cpu counters, similarly to other handlers in the same file.
> >> Sure.
> >>
> >>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
> >>>>  	if (!apic->irr_pending)
> >>>>  		return -1;
> >>>> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
> >>>>  	result = apic_search_irr(apic);
> >>>>  	ASSERT(result == -1 || result >= 16);
> >>>
> >>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
> >>> before inject_pending_event.
> >>>
> >>> 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> >>> 		-> inject_pending_event
> >>> 		...
> >>> 	}
> >> Some other places search the irr to do something, like
> >> kvm_cpu_has_interrupt(). So we must sync pir to irr before touching
> >> the irr, not just before entering the guest.
> >
> > I see. The only call site that needs IRR/PIR information with posted
> > interrupt enabled is kvm_arch_vcpu_runnable, correct?
> Yes.
>
> > If so, can we contain reading PIR to only that location?
> Yes, we can do it.
> Actually, the reason I do pir->irr here is to avoid future wrong usage
> that checks for a pending interrupt just by calling
> kvm_vcpu_has_interrupt().

Yes, I see.

> Also, an extra callback may be needed to check the pir.
> So you think your suggestion is better?

Because it respects the standard request processing mechanism, yes.

> > And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
> > standard way of event processing and also reduce reading the PIR).
> Since we check the ON bit before each read of the PIR, the cost can be
> ignored.

Note reading the ON bit is not OK, because if the injection path did not
see vcpu->mode == IN_GUEST_MODE, the ON bit will not be set.

> > So it is possible that a PIR IPI is delivered and handled before guest
> > entry.
> >
> > By clearing the PIR notification bit after local_irq_disable, but
> > before the last check for vcpu->requests, we guarantee that a PIR IPI
> > is never lost.
> >
> > Makes sense? (please check the logic, it might be flawed).
> I am not sure whether I understand you. But I don't think the IPI will
> be lost in the current patch:
>
> 	if (!pi_test_and_set_on(&vmx->pi_desc) &&
> 			(vcpu->mode == IN_GUEST_MODE)) {
> 		kvm_make_request(KVM_REQ_EVENT, vcpu);
> 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> 					POSTED_INTR_VECTOR);
> 		*delivered = true;
> 	}
>
> vcpu entry has:
> 	vcpu->mode = GUEST_MODE
> 	local irq disable
> 	check request
>
> We send the IPI after making the request; if the IPI is received after
> guest_mode is set but before irqs are disabled, it will still be handled
> by the following request check. Please correct me if I am wrong.

cpu0                            cpu1            vcpu0

test_and_set_bit(PIR-A)
set KVM_REQ_EVENT
                                                process REQ_EVENT
                                                PIR-A->IRR

                                                vcpu->mode=IN_GUEST

if (vcpu0->guest_mode)
	if (!t_a_s_bit(PIR notif))
		send IPI
                                                linux_pir_handler

                                t_a_s_b(PIR-B)=1
                                no PIR IPI sent

> >
> > Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
> > need for the spinlock there.
> In sync_pir_to_irr():
> 	tmp_pir = xchg(pir, 0)
> 	irr |= tmp_pir
>
> The combination is not atomic, so the lock is still needed.

IRR is only accessed by the local vcpu context; only the PIR is accessed
concurrently, therefore only PIR accesses must be atomic. The xchg
instruction is locked and atomic:

+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+	u32 i, pir_val;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	for (i = 0; i <= 7; i++) {
+		pir_val = xchg(&pir[i], 0);
+		if (pir_val)
+			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);

Right?
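For context, consuming that helper at request-processing time, as proposed earlier in the thread, would look roughly like this (a sketch; sync_pir_to_irr is the hook from the patch under review):

/* vcpu_enter_guest(), relevant fragment (sketch): sync PIR into IRR
 * exactly once, at event-processing time, then inject. */
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
	kvm_x86_ops->sync_pir_to_irr(vcpu);	/* e.g. kvm_apic_update_irr() */
	inject_pending_event(vcpu);
	/* ... */
}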
On Sat, Feb 23, 2013 at 05:31:44PM +0200, Gleb Natapov wrote:
> On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
> > > > > 1. orig_irr = read irr from vapic page
> > > > > 2. if (orig_irr != 0)
> > > > > 3. 	return false;
> > > > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > > > 5. bool injected = !test_and_set_bit(PIR)
> > > > > 6. if (vcpu->guest_mode && injected)
> > > > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > > > 8. 		send PIR IPI
> > > > > 9. return injected
> > > >
> > > > Consider the following case:
> > > > vcpu 0 | vcpu1
> > > > send intr to vcpu1
> > > > check irr
> > > > receive a posted intr
> > > > pir->irr (pir is cleared, irr is set)
> > > > injected = test_and_set_bit = true
> > > > pir = set
> > > >
> > > > Then both irr and pir have the interrupt pending, and they may
> > > > merge into one later, but injected is reported as true. Wrong.
> > > >
> > > I and Marcelo discussed the lockless logic that should be used here
> > > on the previous patch submission. All that is left for you is to
> > > implement it. We worked hard to make the irq injection path
> > > lockless; we are not going to add a lock now.
> >
> > He is right, the scheme is still flawed (because of concurrent
> > injectors along with HW in VMX non-root). As I said, let's add a
> > spinlock and think about a lockless scheme in the meantime.
> The logic proposed was (from that thread):
>
> 	apic_accept_interrupt() {
> 		if (PIR || IRR)
> 			return coalesced;
> 		else
> 			set PIR;
> 	}
>
> Which should map to something like:
>
> 	if (!test_and_set_bit(PIR))
> 		return coalesced;

HW transfers PIR to IRR here, say due to a PIR IPI sent for the setting
of a different vector.

> 	if (irr on vapic page is set)
> 		return coalesced;
>
> I do not see how the race above can happen this way. Others can though,
> if the vcpu is outside a guest. Maybe we should deliver interrupts
> differently depending on whether the vcpu is in guest mode or not.

Problem is with 3 contexts: two injectors and one vcpu in guest mode.
Earlier on that thread you mentioned

"The point is that we need to check PIR and IRR atomically and this is
impossible."

That would be one way to fix it.

> I'd rather think about a proper way to do lockless injection before
> committing anything. The case where we care about correct injection
> status is rare, but we always care about scalability, and since we
> violate the spec by reading the vapic page while the vcpu is in
> non-root operation anyway, the result may be incorrect with or without
> the lock. Our use case was not in the HW designers' minds when they
> designed this thing, obviously :(

Zhang, can you comment on whether reading the vapic page while the CPU
in VMX non-root mode is accessing it is safe?

Gleb, yes, a comment mentioning the race (instead of the spinlock) and
an explanation of why it is believed to be benign (given how the
injection return value is interpreted) could also work. It's ugly,
though... Murphy is around.

OTOH a spinlock is not the end of the world, we can figure out something
later (we've tried without success so far).
On Sat, Feb 23, 2013 at 02:05:13PM -0300, Marcelo Tosatti wrote:
> On Sat, Feb 23, 2013 at 05:31:44PM +0200, Gleb Natapov wrote:
> > On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
> > > > > > 1. orig_irr = read irr from vapic page
> > > > > > 2. if (orig_irr != 0)
> > > > > > 3. 	return false;
> > > > > > 4. kvm_make_request(KVM_REQ_EVENT)
> > > > > > 5. bool injected = !test_and_set_bit(PIR)
> > > > > > 6. if (vcpu->guest_mode && injected)
> > > > > > 7. 	if (test_and_set_bit(PIR notification bit))
> > > > > > 8. 		send PIR IPI
> > > > > > 9. return injected
> > > > >
> > > > > Consider the following case:
> > > > > vcpu 0 | vcpu1
> > > > > send intr to vcpu1
> > > > > check irr
> > > > > receive a posted intr
> > > > > pir->irr (pir is cleared, irr is set)
> > > > > injected = test_and_set_bit = true
> > > > > pir = set
> > > > >
> > > > > Then both irr and pir have the interrupt pending, and they may
> > > > > merge into one later, but injected is reported as true. Wrong.
> > > > >
> > > > I and Marcelo discussed the lockless logic that should be used here
> > > > on the previous patch submission. All that is left for you is to
> > > > implement it. We worked hard to make the irq injection path
> > > > lockless; we are not going to add a lock now.
> > >
> > > He is right, the scheme is still flawed (because of concurrent
> > > injectors along with HW in VMX non-root). As I said, let's add a
> > > spinlock and think about a lockless scheme in the meantime.
> > The logic proposed was (from that thread):
> >
> > 	apic_accept_interrupt() {
> > 		if (PIR || IRR)
> > 			return coalesced;
> > 		else
> > 			set PIR;
> > 	}
> >
> > Which should map to something like:
> >
> > 	if (!test_and_set_bit(PIR))
> > 		return coalesced;
>
> HW transfers PIR to IRR here, say due to a PIR IPI sent for the setting
> of a different vector.
>
Hmm, yes. Haven't thought about a different vector :(

> > 	if (irr on vapic page is set)
> > 		return coalesced;
> >
> > I do not see how the race above can happen this way. Others can though,
> > if the vcpu is outside a guest. Maybe we should deliver interrupts
> > differently depending on whether the vcpu is in guest mode or not.
>
> Problem is with 3 contexts: two injectors and one vcpu in guest
> mode. Earlier on that thread you mentioned
>
> "The point is that we need to check PIR and IRR atomically and this is
> impossible."
>
> That would be one way to fix it.
>
I do not think it fixes it. There is no guarantee that the IPI will be
processed by the remote cpu while the sending cpu is still in the locked
section, so the same race may happen regardless. As you say above there
are 3 contexts, but only two use locks.

> > I'd rather think about a proper way to do lockless injection before
> > committing anything. The case where we care about correct injection
> > status is rare, but we always care about scalability, and since we
> > violate the spec by reading the vapic page while the vcpu is in
> > non-root operation anyway, the result may be incorrect with or without
> > the lock. Our use case was not in the HW designers' minds when they
> > designed this thing, obviously :(
>
> Zhang, can you comment on whether reading the vapic page while the CPU
> in VMX non-root mode is accessing it is safe?
>
> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
> an explanation of why it is believed to be benign (given how the
> injection return value is interpreted) could also work. It's ugly,
> though... Murphy is around.
The race above is not benign. It will report an interrupt as coalesced
while in reality it is injected. This may cause too many interrupts to
be injected. If this happens rarely enough, ntp may be able to fix the
time drift resulting from this.

> OTOH a spinlock is not the end of the world, we can figure out something
> later (we've tried without success so far).
It serializes all injections into the vcpu. I do not believe now that
even with the lock we are safe, for the reason I mention above. We can
use the pir->on bit as a lock, but that only emphasises how ridiculous
serialization of injections becomes.

--
	Gleb.
On Sat, Feb 23, 2013 at 09:42:14PM +0200, Gleb Natapov wrote:
> > explanation of why it is believed to be benign (given how the injection
> > return value is interpreted) could also work. It's ugly, though...
> > Murphy is around.
> The race above is not benign. It will report an interrupt as coalesced
> while in reality it is injected. This may cause too many interrupts to
> be injected. If this happens rarely enough, ntp may be able to fix the
> time drift resulting from this.

OK.

> > OTOH a spinlock is not the end of the world, we can figure out
> > something later (we've tried without success so far).
> It serializes all injections into the vcpu. I do not believe now that
> even with the lock we are safe, for the reason I mention above. We can
> use the pir->on bit as a lock, but that only emphasises how ridiculous
> serialization of injections becomes.

Please review the 2nd iteration of the pseudocode in the patchset v4
thread, it should be good.
On Sat, Feb 23, 2013 at 04:52:59PM -0300, Marcelo Tosatti wrote:
> > > OTOH a spinlock is not the end of the world, we can figure out
> > > something later (we've tried without success so far).
> > It serializes all injections into the vcpu. I do not believe now that
> > even with the lock we are safe, for the reason I mention above. We can
> > use the pir->on bit as a lock, but that only emphasises how ridiculous
> > serialization of injections becomes.
>
> Please review the 2nd iteration of the pseudocode in the patchset v4
> thread, it should be good.

Can you point me to what exactly I should look at? All I see is a
discussion on how to not miss an IPI, but this is not the problem I am
talking about here.

--
	Gleb.
Marcelo Tosatti wrote on 2013-02-24:
> On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote:
>> Marcelo Tosatti wrote on 2013-02-23:
>>> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote:
>>>> Marcelo Tosatti wrote on 2013-02-23:
>>>>>
>>>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote:
>>>>>> From: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>>
>>>>>> Posted Interrupt allows APIC interrupts to be injected into the
>>>>>> guest directly without any vmexit.
>>>>>>
>>>>>> - When delivering an interrupt to the guest, if the target vcpu is
>>>>>>   running, update the Posted-interrupt requests bitmap and send a
>>>>>>   notification event to the vcpu. Then the vcpu will handle this
>>>>>>   interrupt automatically, without any software involvement.
>>>>>> - If the target vcpu is not running, or there is already a
>>>>>>   notification event pending in the vcpu, do nothing. The interrupt
>>>>>>   will be handled by the next vm entry.
>>>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
>>>>>> ---
>>>>>>
>>>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>>>>>> index e4595f1..64616cc 100644
>>>>>> --- a/arch/x86/kernel/irq.c
>>>>>> +++ b/arch/x86/kernel/irq.c
>>>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>>>>>>  	set_irq_regs(old_regs);
>>>>>>  }
>>>>>> +#ifdef CONFIG_HAVE_KVM
>>>>>> +/*
>>>>>> + * Handler for POSTED_INTERRUPT_VECTOR.
>>>>>> + */
>>>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
>>>>>> +{
>>>>>> +	struct pt_regs *old_regs = set_irq_regs(regs);
>>>>>> +
>>>>>> +	ack_APIC_irq();
>>>>>> +
>>>>>> +	irq_enter();
>>>>>> +
>>>>>> +	exit_idle();
>>>>>> +
>>>>>> +	irq_exit();
>>>>>> +
>>>>>> +	set_irq_regs(old_regs);
>>>>>> +}
>>>>>> +#endif
>>>>>> +
>>>>>
>>>>> Add per-cpu counters, similarly to other handlers in the same file.
>>>> Sure.
>>>>
>>>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
>>>>>>  	if (!apic->irr_pending) return -1;
>>>>>> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>>>>>>  	result = apic_search_irr(apic);
>>>>>>  	ASSERT(result == -1 || result >= 16);
>>>>>
>>>>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest,
>>>>> before inject_pending_event.
>>>>>
>>>>> 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
>>>>> 		-> inject_pending_event
>>>>> 		...
>>>>> 	}
>>>> Some other places search the irr to do something, like
>>>> kvm_cpu_has_interrupt(). So we must sync pir to irr before touching
>>>> the irr, not just before entering the guest.
>>>
>>> I see. The only call site that needs IRR/PIR information with posted
>>> interrupt enabled is kvm_arch_vcpu_runnable, correct?
>> Yes.
>>
>>> If so, can we contain reading PIR to only that location?
>> Yes, we can do it. Actually, the reason I do pir->irr here is to avoid
>> future wrong usage that checks for a pending interrupt just by calling
>> kvm_vcpu_has_interrupt().
>
> Yes, I see.
>
>> Also, an extra callback may be needed to check the pir.
>> So you think your suggestion is better?
>
> Because it respects the standard request processing mechanism, yes.
Sure. Will change it in the next patch.

>>> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the
>>> standard way of event processing and also reduce reading the PIR).
>> Since we check the ON bit before each read of the PIR, the cost can be
>> ignored.
>
> Note reading the ON bit is not OK, because if the injection path did not
> see vcpu->mode == IN_GUEST_MODE, the ON bit will not be set.
In my patch, it sets the ON bit before checking vcpu->mode. So it's ok.
Actually, I think the ON bit must be set unconditionally after changing
the PIR, regardless of vcpu->mode.

>>>
>>> So it is possible that a PIR IPI is delivered and handled before guest
>>> entry.
>>>
>>> By clearing the PIR notification bit after local_irq_disable, but
>>> before the last check for vcpu->requests, we guarantee that a PIR IPI
>>> is never lost.
>>>
>>> Makes sense? (please check the logic, it might be flawed).
>> I am not sure whether I understand you. But I don't think the IPI will
>> be lost in the current patch:
>>
>> 	if (!pi_test_and_set_on(&vmx->pi_desc) &&
>> 			(vcpu->mode == IN_GUEST_MODE)) {
>> 		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>> 					POSTED_INTR_VECTOR);
>> 		*delivered = true;
>> 	}
>>
>> vcpu entry has:
>> 	vcpu->mode = GUEST_MODE
>> 	local irq disable
>> 	check request
>>
>> We send the IPI after making the request; if the IPI is received after
>> guest_mode is set but before irqs are disabled, it will still be handled
>> by the following request check. Please correct me if I am wrong.
>
> cpu0                            cpu1            vcpu0
>
> test_and_set_bit(PIR-A)
> set KVM_REQ_EVENT
>                                                 process REQ_EVENT
>                                                 PIR-A->IRR
>
>                                                 vcpu->mode=IN_GUEST
>
> if (vcpu0->guest_mode)
> 	if (!t_a_s_bit(PIR notif))
> 		send IPI
>                                                 linux_pir_handler
>
>                                 t_a_s_b(PIR-B)=1
>                                 no PIR IPI sent

No, this race exists with your implementation, not mine. As I said, in
this patch the request is made after vcpu->mode == GUEST_MODE, and then
the ipi is sent. Even if the ipi is lost, the request will still be
tracked, because we have the following check:

	vcpu->mode = GUEST_MODE
	(the ipi may arrive here and be lost)
	local irq disable
	check request (this ensures the pir modification will be picked
	up by the vcpu before vmentry)

>>>
>>> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
>>> need for the spinlock there.
>> In sync_pir_to_irr():
>> 	tmp_pir = xchg(pir, 0)
>> 	irr |= tmp_pir
>>
>> The combination is not atomic, so the lock is still needed.
>
> IRR is only accessed by the local vcpu context; only the PIR is accessed
> concurrently, therefore only PIR accesses must be atomic. The xchg
> instruction is locked and atomic.
There is the same problem we discussed before. Consider this: before
delivering the posted ipi, the irr is 0. Then:

cpu0                                    vcpu0

deliver_posted_intr()
                                        sync_pir_to_irr():
                                        tmp_pir = xchg(pir, 0)  (pir is cleared)
check pir (pass)
check irr (pass)
                                        irr |= tmp_pir
injected = true
set pir

Same problem: both pir and irr are set, but it is reported as injected.
We still need the lock to protect it.

> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
> +{
> +	u32 i, pir_val;
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	for (i = 0; i <= 7; i++) {
> +		pir_val = xchg(&pir[i], 0);
> +		if (pir_val)
> +			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
>
> Right?

Best regards,
Yang
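Yang's objection, rendered as a sketch that combines the patch's pi_lock with the xchg-based helper above (illustrative only; each xchg is atomic on its own, and it is the pairing with an injector's PIR/IRR tests that the lock is meant to serialize):

static void vmx_sync_pir_to_irr_locked(struct vcpu_vmx *vmx)
{
	unsigned long flags;

	/* Without the lock, an injector testing PIR and IRR between the
	 * xchg (PIR already cleared) and the IRR update (IRR not yet set)
	 * sees both clear and wrongly reports a fresh injection. The lock
	 * excludes other software contexts; it cannot exclude the CPU
	 * running the vcpu in VMX non-root mode. */
	spin_lock_irqsave(&vmx->pi_lock, flags);
	if (pi_test_and_clear_on(&vmx->pi_desc))
		kvm_apic_update_irr(&vmx->vcpu, (u32 *)vmx->pi_desc.pir);
	spin_unlock_irqrestore(&vmx->pi_lock, flags);
}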
Gleb Natapov wrote on 2013-02-24:
> On Sat, Feb 23, 2013 at 02:05:13PM -0300, Marcelo Tosatti wrote:
>> On Sat, Feb 23, 2013 at 05:31:44PM +0200, Gleb Natapov wrote:
>>> On Sat, Feb 23, 2013 at 11:48:54AM -0300, Marcelo Tosatti wrote:
>>>>>>> 1. orig_irr = read irr from vapic page
>>>>>>> 2. if (orig_irr != 0)
>>>>>>> 3.	return false;
>>>>>>> 4. kvm_make_request(KVM_REQ_EVENT)
>>>>>>> 5. bool injected = !test_and_set_bit(PIR)
>>>>>>> 6. if (vcpu->guest_mode && injected)
>>>>>>> 7.	if (!test_and_set_bit(PIR notification bit))
>>>>>>> 8.		send PIR IPI
>>>>>>> 9. return injected
>>>>>>
>>>>>> Consider the following case:
>>>>>> vcpu0                            | vcpu1
>>>>>> send intr to vcpu1
>>>>>> check irr
>>>>>>                                  | receive a posted intr
>>>>>>                                  | pir->irr (pir is cleared, irr is set)
>>>>>> injected = test_and_set_bit = true; pir = set
>>>>>>
>>>>>> Then both irr and pir have the interrupt pending; they may merge into
>>>>>> one later, but injected is reported as true. Wrong.
>>>>>>
>>>>> I and Marcelo discussed the lockless logic that should be used here on
>>>>> the previous patch submission. All that is left for you is to implement it.
>>>>> We worked hard to make the irq injection path lockless; we are not going
>>>>> to add a lock now.
>>>>
>>> He is right, the scheme is still flawed (because of concurrent
>>> injectors along with HW in VMX non-root). I'd say let's add a
>>> spinlock and think about a lockless scheme in the meantime. The logic
>>> proposed was (from that thread):
>>>
>>> apic_accept_interrupt() {
>>>         if (PIR || IRR)
>>>                 return coalesced;
>>>         else
>>>                 set PIR;
>>> }
>>>
>>> Which should map to something like:
>>>
>>> if (test_and_set_bit(PIR))
>>>         return coalesced;
>>
>> HW transfers PIR to IRR, here. Say due to a PIR IPI sent
>> due to the setting of a different vector.
>>
> Hmm, yes. Haven't thought about a different vector :(
>
>>> if (irr on vapic page is set)
>>>         return coalesced;
>>
>>>
>>> I do not see how the race above can happen this way. Others can, though,
>>> if the vcpu is outside a guest. Maybe we should deliver the interrupt
>>> differently depending on whether the vcpu is in the guest or not.
>>
>> The problem is with 3 contexts: two injectors and one vcpu in guest
>> mode. Earlier on that thread you mentioned
>>
>> "The point is that we need to check PIR and IRR atomically and this is
>> impossible."
>>
>> That would be one way to fix it.
>>
> I do not think it fixes it. There is no guaranty that the IPI will be
> processed by the remote cpu while the sending cpu is still in the locked
> section, so the same race may happen regardless. As you say above there
> are 3 contexts, but only two use locks.

See the following logic; I think the problem you mentioned will not happen with it:

get lock
if test_pir (this will ensure there is no in-flight IPI for the same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for the same interrupt.)
        return coalesced
if test_irr
        return coalesced
set_pir
injected=true
if t_a_s(on) && in guest mode
        send ipi
unlock

>>>
>>> I'd rather think about a proper way to do lockless injection before
>>> committing anything. The case where we care about correct injection
>>> status is rare, but we always care about scalability, and since we
>>> violate the spec by reading the vapic page while the vcpu is in non-root
>>> operation anyway, the result may be incorrect with or without the lock.
>>> Our use case was not in the HW designers' minds when they designed this
>>> thing, obviously :(
>>
>> Zhang, can you comment on whether reading the vapic page with a CPU in
>> VMX non-root operation accessing it is safe?

See 24.11.4:

In addition to data in the VMCS region itself, VMX non-root operation can be controlled by data structures that are
referenced by pointers in a VMCS (for example, the I/O bitmaps). While the pointers to these data structures are
parts of the VMCS, the data structures themselves are not. They are not accessible using VMREAD and VMWRITE
but by ordinary memory writes.
Software should ensure that each such data structure is modified only when no logical processor with a current
VMCS that references it is in VMX non-root operation. Doing otherwise may lead to unpredictable behavior.

This means the initial design of KVM is wrong. It should not modify vIRR directly.

The good thing is that reading is ok.

>> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
>> an explanation of why it is believed to be benign (given how the injection
>> return value is interpreted) could also work. It's ugly, though... murphy
>> is around.
> The race above is not benign. It will report an interrupt as coalesced
> while in reality it is injected. This may cause too many interrupts to be
> injected. If this happens rarely enough, ntp may be able to fix the time
> drift resulting from it.

Please check the above logic. Does it have the same problem? If yes, please point it out.

>>
>> OTOH a spinlock is not the end of the world; we can figure out something
>> later (we've tried without success so far).
> It serializes all injections into the vcpu. I do not believe now that even
> with the lock we are safe, for the reason I mention above. We can use the
> pir->on bit as a lock, but that only emphasises how ridiculous
> serialization of injections becomes.
>
> --
>			Gleb.

Best regards,
Yang
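For reference, the locked scheme argued for above might look roughly like this in C. This is only a sketch: struct pi_desc, pi_lock and pi_test_and_set_on() appear in the patch under discussion, while pi_test_pir(), apic_test_irr() and kvm_send_posted_ipi() are assumed helper names, not existing kernel APIs.

static bool pi_try_deliver_locked(struct vcpu_vmx *vmx, int vector)
{
	bool injected = false;

	spin_lock(&vmx->pi_lock);
	/* A bit still pending in PIR or IRR means the new interrupt coalesces. */
	if (!pi_test_pir(vector, &vmx->pi_desc) &&
	    !apic_test_irr(vector, vmx->vcpu.arch.apic)) {
		/* Atomic set_bit: the PIR is concurrently written by hardware. */
		set_bit(vector, (unsigned long *)vmx->pi_desc.pir);
		injected = true;
		/* Notify only once per PIR batch, and only a running vcpu. */
		if (!pi_test_and_set_on(&vmx->pi_desc) &&
		    vmx->vcpu.mode == IN_GUEST_MODE)
			kvm_send_posted_ipi(vmx->vcpu.cpu);
	}
	spin_unlock(&vmx->pi_lock);
	return injected;
}

As the follow-ups show, even this lock cannot make the "check PIR and IRR, then set PIR" sequence atomic with respect to the hardware's PIR->IRR transfer, which is the crux of the rest of the thread.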
On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote: > > I do not think it fixes it. There is no guaranty that IPI will be > > processed by remote cpu while sending cpu is still in locked section, so > > the same race may happen regardless. As you say above there are 3 > > contexts, but only two use locks. > See following logic, I think the problem you mentioned will not happened with current logic. > > get lock > if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for same interrupt.) I do not see where those assumptions are coming from. Testing pir does not guaranty that the IPI is not processed by VCPU right now. iothread: vcpu: send_irq() lock(pir) check pir and irr set pir send IPI (*) unlock(pir) send_irq() lock(pir) receive IPI (*) atomic { pir_tmp = pir pir = 0 } check pir and irr irr &= pir_tmp set pir send IPI unlock(pir) At this point both pir and irr are set and interrupt may be coalesced, but it is reported as delivered. So what prevents the scenario above from happening? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote on 2013-02-24:
> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>>> I do not think it fixes it. There is no guaranty that IPI will be
>>> processed by remote cpu while sending cpu is still in locked section, so
>>> the same race may happen regardless. As you say above there are 3
>>> contexts, but only two use locks.
>> See the following logic; I think the problem you mentioned will not
>> happen with the current logic.
>>
>> get lock
>> if test_pir (this will ensure there is no in-flight IPI for the same interrupt. Since we
> are taking the lock now, no IPI will be sent before we release the lock and no
> pir->irr is performed by hardware for the same interrupt.)
> I do not see where those assumptions are coming from. Testing pir does
> not guarantee that the IPI is not being processed by the VCPU right now.
>
> iothread:                     vcpu:
> send_irq()
> lock(pir)
> check pir and irr
> set pir
> send IPI (*)
> unlock(pir)
>
> send_irq()
> lock(pir)
>                               receive IPI (*)
>                               atomic {
>                                       pir_tmp = pir
>                                       pir = 0
>                               }
> check pir and irr
>                               irr |= pir_tmp
> set pir
> send IPI
> unlock(pir)
>
> At this point both pir and irr are set and interrupt may be coalesced,
> but it is reported as delivered.
>
> So what prevents the scenario above from happening?

Yes, you are right. For this case, it can do nothing. So we cannot solve this problem in the current KVM, right?

Best regards,
Yang
On Sun, Feb 24, 2013 at 02:26:36PM +0000, Zhang, Yang Z wrote:
> > I do not see where those assumptions are coming from. Testing pir does
> > not guarantee that the IPI is not being processed by the VCPU right now.
> >
> > iothread:                     vcpu:
> > send_irq()
> > lock(pir)
> > check pir and irr
> > set pir
> > send IPI (*)
> > unlock(pir)
> >
> > send_irq()
> > lock(pir)
> >                               receive IPI (*)
> >                               atomic {
> >                                       pir_tmp = pir
> >                                       pir = 0
> >                               }
> > check pir and irr
> >                               irr |= pir_tmp
> > set pir
> > send IPI
> > unlock(pir)
> >
> > At this point both pir and irr are set and the interrupt may be coalesced,
> > but it is reported as delivered.
> >
> > So what prevents the scenario above from happening?
> Yes, you are right. For this case, it can do nothing. So we cannot solve this problem in the current KVM, right?
>
Yes, as far as I can see we cannot reliably implement KVM's
KVM_IRQ_LINE_STATUS interface with the given HW interface. Now we
should think about how to minimize the damage.

--
			Gleb.
On Sun, Feb 24, 2013 at 01:17:59PM +0000, Zhang, Yang Z wrote: > Marcelo Tosatti wrote on 2013-02-24: > > On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote: > >> Marcelo Tosatti wrote on 2013-02-23: > >>> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote: > >>>> Marcelo Tosatti wrote on 2013-02-23: > >>>>> > >>>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote: > >>>>>> From: Yang Zhang <yang.z.zhang@Intel.com> > >>>>>> > >>>>>> Posted Interrupt allows APIC interrupts to inject into guest directly > >>>>>> without any vmexit. > >>>>>> > >>>>>> - When delivering a interrupt to guest, if target vcpu is running, > >>>>>> update Posted-interrupt requests bitmap and send a notification > >>>>>> event to the vcpu. Then the vcpu will handle this interrupt > >>>>>> automatically, without any software involvemnt. - If target vcpu is > >>>>>> not running or there already a notification event pending in the > >>>>>> vcpu, do nothing. The interrupt will be handled by next vm entry > >>>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com> > >>>>>> --- > >>>>>> > >>>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c > >>>>>> index e4595f1..64616cc 100644 > >>>>>> --- a/arch/x86/kernel/irq.c > >>>>>> +++ b/arch/x86/kernel/irq.c > >>>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs) > >>>>>> set_irq_regs(old_regs); > >>>>>> } > >>>>>> +#ifdef CONFIG_HAVE_KVM > >>>>>> +/* > >>>>>> + * Handler for POSTED_INTERRUPT_VECTOR. > >>>>>> + */ > >>>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs) > >>>>>> +{ > >>>>>> + struct pt_regs *old_regs = set_irq_regs(regs); > >>>>>> + > >>>>>> + ack_APIC_irq(); > >>>>>> + > >>>>>> + irq_enter(); > >>>>>> + > >>>>>> + exit_idle(); > >>>>>> + > >>>>>> + irq_exit(); > >>>>>> + > >>>>>> + set_irq_regs(old_regs); > >>>>>> +} > >>>>>> +#endif > >>>>>> + > >>>>> > >>>>> Add per-cpu counters, similarly to other handlers in the same file. > >>>> Sure. > >>>> > >>>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct > > kvm_lapic > >>>>> *apic) > >>>>>> if (!apic->irr_pending) return -1; > >>>>>> + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = > >>>>>> apic_search_irr(apic); ASSERT(result == -1 || result >= 16); > >>>>> > >>>>> kvm_x86_ops->sync_pir_to_irr() should be called from vcpu_enter_guest, > >>>>> before inject_pending_event. > >>>>> > >>>>> if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) > > { > >>>>> -> > >>>>> inject_pending_event > >>>>> ... > >>>>> } > >>>> Some other places will search irr to do something, like > >>> kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch > >>> irr, not just before enter guest. > >>> > >>> I see. The only call site that needs IRR/PIR information with posted > >>> interrupt enabled is kvm_arch_vcpu_runnable, correct? > >> Yes. > >> > >>> If so, can we contain reading PIR to only that location? > >> Yes, we can do it. Actually, why I do pir->irr here is to avoid the > >> wrong usage in future of check pending interrupt just by call > >> kvm_vcpu_has_interrupt(). > > > > Yes, i see. > > > >> Also, there may need an extra callback to check pir. > >> So you think your suggestions is better? > > > > Because it respects standard request processing mechanism, yes. > Sure. Will change it in next patch. > > >>> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect the > >>> standard way of event processing and also reduce reading the PIR). 
> >> Since we will check the ON bit before each reading of PIR, the cost can be
> >> ignored.
> >
> > Note reading ON bit is not OK, because if injection path did not see
> > vcpu->mode == IN_GUEST_MODE, ON bit will not be set.
> In my patch, it will set the ON bit before checking vcpu->mode. So it's ok.
> Actually, I think the ON bit must be set unconditionally after changing PIR, regardless of vcpu->mode.
>
> >>>
> >>> So it is possible that a PIR IPI is delivered and handled before guest
> >>> entry.
> >>>
> >>> By clearing PIR notification bit after local_irq_disable, but before the
> >>> last check for vcpu->requests, we guarantee that a PIR IPI is never lost.
> >>>
> >>> Makes sense? (please check the logic, it might be flawed).
> >> I am not sure whether I understand you. But I don't think the IPI will
> >> be lost in the current patch:
> >>
> >> if (!pi_test_and_set_on(&vmx->pi_desc) && (vcpu->mode ==
> > IN_GUEST_MODE)) {
> >>         kvm_make_request(KVM_REQ_EVENT, vcpu);
> >>         apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> >>                             POSTED_INTR_VECTOR);
> >>         *delivered = true;
> >> }
> >>
> >> vcpu entry has:
> >> vcpu->mode = GUEST_MODE
> >> local irq disable
> >> check request
> >>
> >> We will send the IPI after making the request; if the IPI is received after setting
> > guest_mode but before disabling irqs, it will still be handled by the following
> > check of requests.
> >> Please correct me if I am wrong.

p1)

> > cpu0                        cpu1                 vcpu0
> > test_and_set_bit(PIR-A)
> > set KVM_REQ_EVENT
> >                                                  process REQ_EVENT
> >                                                  PIR-A->IRR
> >
> >                                                  vcpu->mode=IN_GUEST
> >
> > if (vcpu0->guest_mode)
> >         if (!t_a_s_bit(PIR notif))
> >                 send IPI
> >                                                  linux_pir_handler
> >
> >                             t_a_s_b(PIR-B)=1
> >                             no PIR IPI sent

p2)

> No, this exists with your implementation, not mine.
> As I said, in this patch, it will make the request after vcpu->mode == guest mode, then send the ipi. Even if the ipi is lost, the request will still be tracked, because we have the following check:
> vcpu->mode = GUEST_MODE
> (ipi may arrive here and be lost)
> local irq disable
> check request (this will ensure the pir modification is picked up by the vcpu before vmentry)

Please read the sequence above again, between p1) and p2). Yes, if the
IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
to be processed, but not the request for another cpu (cpu1).

If in fact the scenario is not possible, then please point out between
p1) and p2) where the error in the representation is.

Note there are 3 contexts: CPU0, CPU1, VCPU0 (virtual CPU on some CPU != 0
and != 1).

> >>> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
> >>> need for the spinlock there.
> >> In function sync_pir_to_irr():
> >> tmp_pir=xchg(pir,0)
> >> xchg(tmp_pir, irr)
> >>
> >> It is not atomic as a whole, so the lock is still needed.
> >
> > IRR is only accessed by local vcpu context, only PIR is accessed concurrently,
> > therefore only PIR accesses must be atomic. The xchg instruction is
> > locked and atomic.
> The same problem as we discussed before exists here. Consider this:
> before delivering the posted ipi, the irr is 0. Then:
>
> cpu0                        cpu1    vcpu0
> delivery_posted_intr()
>                                     sync_pir_to_irr():
>                                     tmp_pir=xchg(pir,0) (pir is cleared)
> check pir(pass)
> check irr(pass)
>                                     xchg(tmp_pir, irr)
> injected = true
> set pir
>
> Same problem: both pir and irr are set, but it is reported as injected. We still need the
> lock to protect it.

See:

cpu0

check pir(pass)
check irr(pass)
injected = !test_and_set_bit(pir)

versus

cpu1

xchg(pir)

No information can be lost because all accesses to shared data are
atomic.
On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
> > contexts, but only two use locks.
> See the following logic; I think the problem you mentioned will not happen with the current logic.
>
> get lock
> if test_pir (this will ensure there is no in-flight IPI for the same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for the same interrupt.)
>         return coalesced
> if test_irr
>         return coalesced
> set_pir
> injected=true
> if t_a_s(on) && in guest mode
>         send ipi
> unlock
>
>
> >>> I'd rather think about a proper way to do lockless injection before
> >>> committing anything. The case where we care about correct injection
> >>> status is rare, but we always care about scalability, and since we
> >>> violate the spec by reading the vapic page while the vcpu is in non-root
> >>> operation anyway, the result may be incorrect with or without the lock.
> >>> Our use case was not in the HW designers' minds when they designed this
> >>> thing, obviously :(
> >>
> >> Zhang, can you comment on whether reading the vapic page with a CPU in
> >> VMX non-root operation accessing it is safe?
> See 24.11.4
> In addition to data in the VMCS region itself, VMX non-root operation can be controlled by data structures that are
> referenced by pointers in a VMCS (for example, the I/O bitmaps). While the pointers to these data structures are
> parts of the VMCS, the data structures themselves are not. They are not accessible using VMREAD and VMWRITE
> but by ordinary memory writes.
> Software should ensure that each such data structure is modified only when no logical processor with a current
> VMCS that references it is in VMX non-root operation. Doing otherwise may lead to unpredictable behavior.
>
> This means the initial design of KVM is wrong. It should not modify vIRR directly.
>
> The good thing is that reading is ok.

OK.

> >> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
> >> an explanation of why it is believed to be benign (given how the injection
> >> return value is interpreted) could also work. It's ugly, though... murphy
> >> is around.
> > The race above is not benign. It will report an interrupt as coalesced
> > while in reality it is injected. This may cause too many interrupts to be
> > injected. If this happens rarely enough, ntp may be able to fix the time
> > drift resulting from it.
> Please check the above logic. Does it have the same problem? If yes, please point
> it out.

1) set_pir must be test_and_set_bit() (so the lock at the vcpu entry path can
be dropped).

2)      if t_a_s(on) && in guest mode
                send ipi

should be inverted:

        if guest mode && t_a_s(on)
                send ipi

So you avoid setting the ON bit if the guest is outside of guest mode.
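Folding Marcelo's two corrections into the pseudocode gives something like the fragment below. It is a sketch under the same assumed helper names as before, reading "t_a_s(on)" as "send only if ON was previously clear":

	/* 1) claim the PIR bit atomically; an already-set bit means coalesced */
	if (test_and_set_bit(vector, (unsigned long *)pi_desc->pir))
		return coalesced;

	/*
	 * 2) check guest mode first, so ON is never set for a vcpu
	 * that is outside guest mode
	 */
	if (vcpu->mode == IN_GUEST_MODE && !pi_test_and_set_on(pi_desc))
		kvm_send_posted_ipi(vcpu->cpu);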
On Sun, Feb 24, 2013 at 04:19:17PM +0200, Gleb Natapov wrote:
> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
> > > I do not think it fixes it. There is no guaranty that IPI will be
> > > processed by remote cpu while sending cpu is still in locked section, so
> > > the same race may happen regardless. As you say above there are 3
> > > contexts, but only two use locks.
> > See the following logic; I think the problem you mentioned will not happen with the current logic.
> >
> > get lock
> > if test_pir (this will ensure there is no in-flight IPI for the same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for the same interrupt.)

Does the PIR IPI interrupt return synchronously _after_ the PIR->IRR transfer
is made? Don't think so.

The PIR IPI interrupt returns after the remote CPU has acked interrupt
receival, not after the remote CPU has acked _and_ performed the PIR->IRR
transfer.

Right? (yes, right, see step 4. below).

Should try to make it easier to drop the lock later, so depend on it as
little as possible (also please document what it protects in detail).

Also note:

"
3. The processor clears the outstanding-notification bit in the
posted-interrupt descriptor. This is done atomically
so as to leave the remainder of the descriptor unmodified (e.g., with a
locked AND operation).
4. The processor writes zero to the EOI register in the local APIC; this
dismisses the interrupt with the posted-interrupt
notification vector from the local APIC.
5. The logical processor performs a logical-OR of PIR into VIRR and
clears PIR. No other agent can read or write a
PIR bit (or group of bits) between the time it is read (to determine
what to OR into VIRR) and when it is cleared.
"

So checking the ON bit does not mean the HW has finished performing the
PIR->VIRR transfer (which means the ON bit can only be used as an indication
of whether to send the interrupt, not of whether the PIR->VIRR transfer is
finished).

So it's fine to do:

-> atomic set pir
-> if (atomic_test_and_set(PIR ON bit))
->	send IPI

But HW can transfer two distinct bits, set by different serialized instances
of kvm_set_irq (protected with a lock), in a single PIR->IRR pass.

> I do not see where those assumptions are coming from. Testing pir does
> not guarantee that the IPI is not being processed by the VCPU right now.
>
> iothread:                     vcpu:
> send_irq()
> lock(pir)
> check pir and irr
> set pir
> send IPI (*)
> unlock(pir)
>
> send_irq()
> lock(pir)
>                               receive IPI (*)
>                               atomic {
>                                       pir_tmp = pir
>                                       pir = 0
>                               }
> check pir and irr
>                               irr |= pir_tmp
> set pir
> send IPI
> unlock(pir)
>
> At this point both pir and irr are set and interrupt may be coalesced,
> but it is reported as delivered.

s/"set pir"/"injected = !t_a_s(pir)"/

> So what prevents the scenario above from happening?
>
> --
>			Gleb.
I didn't really follow, but is the root cause the need to keep track
of interrupt coalescing? If so we can recommend that users use
KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
with irq coalescing support to vcpu context.

It's not pleasant to cause a performance regression, so it should be a
last resort (or maybe do both if it helps).

On Sun, Feb 24, 2013 at 8:08 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Sun, Feb 24, 2013 at 04:19:17PM +0200, Gleb Natapov wrote:
>> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>> > > I do not think it fixes it. There is no guaranty that IPI will be
>> > > processed by remote cpu while sending cpu is still in locked section, so
>> > > the same race may happen regardless. As you say above there are 3
>> > > contexts, but only two use locks.
>> > See following logic, I think the problem you mentioned will not happened with current logic.
>> >
>> > get lock
>> > if test_pir (this will ensure there is no in flight IPI for same interrupt. Since we are taking the lock now, no IPI will be sent before we release the lock and no pir->irr is performed by hardware for same interrupt.)
>
> Does the PIR IPI interrupt returns synchronously _after_ PIR->IRR transfer
> is made? Don't think so.
>
> PIR IPI interrupt returns after remote CPU has acked interrupt receival,
> not after remote CPU has acked _and_ performed PIR->IRR transfer.
>
> Right? (yes, right, see step 4. below).
>
> Should try to make it easier to drop the lock later, so depend on it as
> little as possible (also please document what it protects in detail).
>
> Also note:
>
> "
> 3. The processor clears the outstanding-notification bit in the
> posted-interrupt descriptor. This is done atomically
> so as to leave the remainder of the descriptor unmodified (e.g., with a
> locked AND operation).
> 4. The processor writes zero to the EOI register in the local APIC; this
> dismisses the interrupt with the posted-interrupt
> notification vector from the local APIC.
> 5. The logical processor performs a logical-OR of PIR into VIRR and
> clears PIR. No other agent can read or write a
> PIR bit (or group of bits) between the time it is read (to determine
> what to OR into VIRR) and when it is cleared.
> "
>
> So checking the ON bit does not mean the HW has finished performing
> PIR->VIRR transfer (which means ON bit can only be used as an indication
> of whether to send interrupt, not whether PIR->VIRR transfer is
> finished).
>
> So its fine to do
>
> -> atomic set pir
> -> if (atomic_test_and_set(PIR ON bit))
> ->	send IPI
>
> But HW can transfer two distinct bits, set by different serialized instances
> of kvm_set_irq (protected with a lock), in a single PIR->IRR pass.
>
>> I do not see where those assumptions are coming from. Testing pir does
>> not guaranty that the IPI is not processed by VCPU right now.
>>
>> iothread:                     vcpu:
>> send_irq()
>> lock(pir)
>> check pir and irr
>> set pir
>> send IPI (*)
>> unlock(pir)
>>
>> send_irq()
>> lock(pir)
>>                               receive IPI (*)
>>                               atomic {
>>                                       pir_tmp = pir
>>                                       pir = 0
>>                               }
>> check pir and irr
>>                               irr |= pir_tmp
>> set pir
>> send IPI
>> unlock(pir)
>>
>> At this point both pir and irr are set and interrupt may be coalesced,
>> but it is reported as delivered.
>
> s/"set pir"/"injected = !t_a_s(pir)"/
>
>> So what prevents the scenario above from happening?
>>
>> --
>>			Gleb.
Marcelo Tosatti wrote on 2013-02-25: > On Sun, Feb 24, 2013 at 01:17:59PM +0000, Zhang, Yang Z wrote: >> Marcelo Tosatti wrote on 2013-02-24: >>> On Sat, Feb 23, 2013 at 03:16:11PM +0000, Zhang, Yang Z wrote: >>>> Marcelo Tosatti wrote on 2013-02-23: >>>>> On Sat, Feb 23, 2013 at 02:05:28PM +0000, Zhang, Yang Z wrote: >>>>>> Marcelo Tosatti wrote on 2013-02-23: >>>>>>> >>>>>>> On Fri, Feb 22, 2013 at 09:42:32PM +0800, Yang Zhang wrote: >>>>>>>> From: Yang Zhang <yang.z.zhang@Intel.com> >>>>>>>> >>>>>>>> Posted Interrupt allows APIC interrupts to inject into guest directly >>>>>>>> without any vmexit. >>>>>>>> >>>>>>>> - When delivering a interrupt to guest, if target vcpu is running, >>>>>>>> update Posted-interrupt requests bitmap and send a notification >>>>>>>> event to the vcpu. Then the vcpu will handle this interrupt >>>>>>>> automatically, without any software involvemnt. - If target vcpu is >>>>>>>> not running or there already a notification event pending in the >>>>>>>> vcpu, do nothing. The interrupt will be handled by next vm entry >>>>>>>> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com> >>>>>>>> --- >>>>>>>> >>>>>>>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c >>>>>>>> index e4595f1..64616cc 100644 >>>>>>>> --- a/arch/x86/kernel/irq.c >>>>>>>> +++ b/arch/x86/kernel/irq.c >>>>>>>> @@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs > *regs) >>>>>>>> set_irq_regs(old_regs); >>>>>>>> } >>>>>>>> +#ifdef CONFIG_HAVE_KVM >>>>>>>> +/* >>>>>>>> + * Handler for POSTED_INTERRUPT_VECTOR. >>>>>>>> + */ >>>>>>>> +void smp_kvm_posted_intr_ipi(struct pt_regs *regs) >>>>>>>> +{ >>>>>>>> + struct pt_regs *old_regs = set_irq_regs(regs); >>>>>>>> + >>>>>>>> + ack_APIC_irq(); >>>>>>>> + >>>>>>>> + irq_enter(); >>>>>>>> + >>>>>>>> + exit_idle(); >>>>>>>> + >>>>>>>> + irq_exit(); >>>>>>>> + >>>>>>>> + set_irq_regs(old_regs); >>>>>>>> +} >>>>>>>> +#endif >>>>>>>> + >>>>>>> >>>>>>> Add per-cpu counters, similarly to other handlers in the same file. >>>>>> Sure. >>>>>> >>>>>>>> @@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct >>> kvm_lapic >>>>>>> *apic) >>>>>>>> if (!apic->irr_pending) return -1; >>>>>>>> + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = >>>>>>>> apic_search_irr(apic); ASSERT(result == -1 || result >= 16); >>>>>>> >>>>>>> kvm_x86_ops->sync_pir_to_irr() should be called from >>>>>>> vcpu_enter_guest, before inject_pending_event. >>>>>>> >>>>>>> if (kvm_check_request(KVM_REQ_EVENT, vcpu) || > req_int_win) >>> { >>>>>>> -> >>>>>>> inject_pending_event >>>>>>> ... >>>>>>> } >>>>>> Some other places will search irr to do something, like >>>>> kvm_cpu_has_interrupt(). So we must to sync pir to irr before touch >>>>> irr, not just before enter guest. >>>>> >>>>> I see. The only call site that needs IRR/PIR information with posted >>>>> interrupt enabled is kvm_arch_vcpu_runnable, correct? >>>> Yes. >>>> >>>>> If so, can we contain reading PIR to only that location? >>>> Yes, we can do it. Actually, why I do pir->irr here is to avoid the >>>> wrong usage in future of check pending interrupt just by call >>>> kvm_vcpu_has_interrupt(). >>> >>> Yes, i see. >>> >>>> Also, there may need an extra callback to check pir. >>>> So you think your suggestions is better? >>> >>> Because it respects standard request processing mechanism, yes. >> Sure. Will change it in next patch. >> >>>>> And have PIR->IRR sync at KVM_REQ_EVENT processing only (to respect >>>>> the standard way of event processing and also reduce reading the >>>>> PIR). 
>>>> Since we will check the ON bit before each reading of PIR, the cost can be
>>>> ignored.
>>>
>>> Note reading ON bit is not OK, because if injection path did not see
>>> vcpu->mode == IN_GUEST_MODE, ON bit will not be set.
>> In my patch, it will set the ON bit before checking vcpu->mode. So it's ok.
>> Actually, I think the ON bit must be set unconditionally after changing
>> PIR, regardless of vcpu->mode.
>>
>>>>>
>>>>> So it is possible that a PIR IPI is delivered and handled before guest
>>>>> entry.
>>>>>
>>>>> By clearing PIR notification bit after local_irq_disable, but before the
>>>>> last check for vcpu->requests, we guarantee that a PIR IPI is never lost.
>>>>>
>>>>> Makes sense? (please check the logic, it might be flawed).
>>>> I am not sure whether I understand you. But I don't think the IPI will
>>>> be lost in the current patch:
>>>>
>>>> if (!pi_test_and_set_on(&vmx->pi_desc) && (vcpu->mode ==
>>> IN_GUEST_MODE)) {
>>>>         kvm_make_request(KVM_REQ_EVENT, vcpu);
>>>>         apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>>>>                             POSTED_INTR_VECTOR);
>>>>         *delivered = true;
>>>> }
>>>>
>>>> vcpu entry has:
>>>> vcpu->mode = GUEST_MODE
>>>> local irq disable
>>>> check request
>>>>
>>>> We will send the IPI after making the request; if the IPI is received after setting
>>> guest_mode but before disabling irqs, it will still be handled by the
>>> following check of requests.
>>>> Please correct me if I am wrong.
>
> p1)
>
>>> cpu0                        cpu1                 vcpu0
>>> test_and_set_bit(PIR-A)
>>> set KVM_REQ_EVENT
>>>                                                  process REQ_EVENT
>>>                                                  PIR-A->IRR
>>>
>>>                                                  vcpu->mode=IN_GUEST
>>>
>>> if (vcpu0->guest_mode)
>>>         if (!t_a_s_bit(PIR notif))
>>>                 send IPI
>>>                                                  linux_pir_handler
>>>
>>>                             t_a_s_b(PIR-B)=1
>>>                             no PIR IPI sent
>
> p2)
>
>> No, this exists with your implementation, not mine.
>> As I said, in this patch, it will make the request after vcpu->mode == guest mode, then
> send the ipi. Even if the ipi is lost, the request will still be tracked,
> because we have the following check:
>> vcpu->mode = GUEST_MODE
>> (ipi may arrive here and be lost)
>> local irq disable
>> check request (this will ensure the pir modification is picked up by the vcpu
> before vmentry)
>
> Please read the sequence above again, between p1) and p2). Yes, if the
> IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
> to be processed, but not the request for another cpu (cpu1).

If no PIR IPI was sent by cpu1, then delivered is false, and it rolls back to the old logic in __apic_accept_irq():

	if (!delivered) {
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_vcpu_kick(vcpu);
	}

This can solve the problem you mentioned above. Right?

> If in fact the scenario is not possible, then please point out between
> p1) and p2) where the error in the representation is.
>
> Note there are 3 contexts: CPU0, CPU1, VCPU0 (virtual CPU on some CPU != 0
> and != 1).
>>>>> Sync PIR->IRR uses xchg, which is locked and atomic, so can't see the
>>>>> need for the spinlock there.
>>>> In function sync_pir_to_irr():
>>>> tmp_pir=xchg(pir,0)
>>>> xchg(tmp_pir, irr)
>>>>
>>>> It is not atomic as a whole, so the lock is still needed.
>>>
>>> IRR is only accessed by local vcpu context, only PIR is accessed concurrently,
>>> therefore only PIR accesses must be atomic. The xchg instruction is
>>> locked and atomic.
>> The same problem as we discussed before exists here. Consider this:
>> before delivering the posted ipi, the irr is 0. Then:
>>
>> cpu0                        cpu1    vcpu0
>> delivery_posted_intr()
>>                                     sync_pir_to_irr():
>>                                     tmp_pir=xchg(pir,0) (pir is cleared)
>> check pir(pass)
>> check irr(pass)
>>                                     xchg(tmp_pir, irr)
>> injected = true
>> set pir
>>
>> Same problem: both pir and irr are set, but it is reported as injected. We still need the
> lock to protect it.
>
> See:
>
> cpu0
>
> check pir(pass)
> check irr(pass)
> injected = !test_and_set_bit(pir)
>
> versus
>
> cpu1
> xchg(pir)
>
> No information can be lost because all accesses to shared data are
> atomic.

Sorry, I cannot get your point. Why can 'xchg(pir)' solve the problem I mentioned above? Can you elaborate?

The spinlock I used is trying to protect the whole procedure of syncing pir to irr, not just access to pir.

Best regards,
Yang
Marcelo Tosatti wrote on 2013-02-25:
> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>>> contexts, but only two use locks.
>> See the following logic; I think the problem you mentioned will not
>> happen with the current logic.
>>
>> get lock
>> if test_pir (this will ensure there is no in-flight IPI for the same interrupt. Since we
> are taking the lock now, no IPI will be sent before we release the lock and no
> pir->irr is performed by hardware for the same interrupt.)
>>         return coalesced
>> if test_irr
>>         return coalesced
>> set_pir
>> injected=true
>> if t_a_s(on) && in guest mode
>>         send ipi
>> unlock
>>
>>
>>>>> I'd rather think about a proper way to do lockless injection before
>>>>> committing anything. The case where we care about correct injection
>>>>> status is rare, but we always care about scalability, and since we
>>>>> violate the spec by reading the vapic page while the vcpu is in non-root
>>>>> operation anyway, the result may be incorrect with or without the lock.
>>>>> Our use case was not in the HW designers' minds when they designed this
>>>>> thing, obviously :(
>>>>
>>>> Zhang, can you comment on whether reading the vapic page with a CPU in
>>>> VMX non-root operation accessing it is safe?
>> See 24.11.4: In addition to data in the VMCS region itself, VMX non-root
>> operation can be controlled by data structures that are referenced by
>> pointers in a VMCS (for example, the I/O bitmaps). While the pointers
>> to these data structures are parts of the VMCS, the data structures
>> themselves are not. They are not accessible using VMREAD and VMWRITE
>> but by ordinary memory writes. Software should ensure that each such
>> data structure is modified only when no logical processor with a
>> current VMCS that references it is in VMX non-root operation. Doing
>> otherwise may lead to unpredictable behavior.
>>
>> This means the initial design of KVM is wrong. It should not modify
>> vIRR directly.
>>
>> The good thing is that reading is ok.
>
> OK.
>
>>>> Gleb, yes, a comment mentioning the race (instead of the spinlock) and
>>>> an explanation of why it is believed to be benign (given how the injection
>>>> return value is interpreted) could also work. It's ugly, though... murphy
>>>> is around.
>>> The race above is not benign. It will report an interrupt as coalesced
>>> while in reality it is injected. This may cause too many interrupts to be
>>> injected. If this happens rarely enough, ntp may be able to fix the time
>>> drift resulting from it.
>> Please check the above logic. Does it have the same problem? If yes, please point
> it out.
>
> 1) set_pir must be test_and_set_bit() (so the lock at the vcpu entry path can
> be dropped).
>
> 2)      if t_a_s(on) && in guest mode
>                 send ipi
>
> should be inverted:
>
>         if guest mode && t_a_s(on)
>                 send ipi
>
> So you avoid setting the ON bit if the guest is outside of guest mode.

We need the ON bit to track whether there are pending bits in pir; otherwise we must traverse the whole pir even when it is empty when calling sync_pir_to_irr.

Best regards,
Yang
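Yang's point about the ON bit can be made concrete with a sketch of sync_pir_to_irr() that uses ON as a "PIR may be non-empty" hint. pi_test_and_clear_on() is an assumed helper here; kvm_apic_update_irr() is the function posted earlier in the thread:

static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If ON is clear, nothing was posted since the last sync and
	 * the 256-bit PIR scan can be skipped entirely.
	 */
	if (!pi_test_and_clear_on(&vmx->pi_desc))
		return;

	kvm_apic_update_irr(vcpu, (u32 *)vmx->pi_desc.pir);
}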
Avi Kivity wrote on 2013-02-25:
> I didn't really follow, but is the root cause the need to keep track
> of interrupt coalescing? If so we can recommend that users use
> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
> with irq coalescing support to vcpu context.

So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted interrupt is enabled, to force users not to use KVM_IRQ_LINE_STATUS. Is this acceptable?

The only case in KVM that needs to know the interrupt injection status is the vlapic timer. But since the vlapic timer and the vcpu are always on the same pcpu, there is no problem.

> It's not pleasant to cause a performance regression, so it should be a
> last resort (or maybe do both if it helps).
>
> On Sun, Feb 24, 2013 at 8:08 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Sun, Feb 24, 2013 at 04:19:17PM +0200, Gleb Natapov wrote:
>>> On Sun, Feb 24, 2013 at 01:55:07PM +0000, Zhang, Yang Z wrote:
>>>>> I do not think it fixes it. There is no guaranty that IPI will be
>>>>> processed by remote cpu while sending cpu is still in locked section, so
>>>>> the same race may happen regardless. As you say above there are 3
>>>>> contexts, but only two use locks.
>>>> See following logic, I think the problem you mentioned will not
>>>> happened with current logic.
>>>>
>>>> get lock
>>>> if test_pir (this will ensure there is no in flight IPI for same interrupt. Since
> we are taking the lock now, no IPI will be sent before we release the lock and no
> pir->irr is performed by hardware for same interrupt.)
>>
>> Does the PIR IPI interrupt returns synchronously _after_ PIR->IRR transfer
>> is made? Don't think so.
>>
>> PIR IPI interrupt returns after remote CPU has acked interrupt receival,
>> not after remote CPU has acked _and_ performed PIR->IRR transfer.
>>
>> Right? (yes, right, see step 4. below).
>>
>> Should try to make it easier to drop the lock later, so depend on it as
>> little as possible (also please document what it protects in detail).
>>
>> Also note:
>>
>> "
>> 3. The processor clears the outstanding-notification bit in the
>> posted-interrupt descriptor. This is done atomically
>> so as to leave the remainder of the descriptor unmodified (e.g., with a
>> locked AND operation).
>> 4. The processor writes zero to the EOI register in the local APIC; this
>> dismisses the interrupt with the posted-interrupt
>> notification vector from the local APIC.
>> 5. The logical processor performs a logical-OR of PIR into VIRR and
>> clears PIR. No other agent can read or write a
>> PIR bit (or group of bits) between the time it is read (to determine
>> what to OR into VIRR) and when it is cleared.
>> "
>>
>> So checking the ON bit does not mean the HW has finished performing
>> PIR->VIRR transfer (which means ON bit can only be used as an indication
>> of whether to send interrupt, not whether PIR->VIRR transfer is
>> finished).
>>
>> So its fine to do
>>
>> -> atomic set pir
>> -> if (atomic_test_and_set(PIR ON bit))
>> ->	send IPI
>>
>> But HW can transfer two distinct bits, set by different serialized instances
>> of kvm_set_irq (protected with a lock), in a single PIR->IRR pass.
>>
>>> I do not see where those assumptions are coming from. Testing pir does
>>> not guaranty that the IPI is not processed by VCPU right now.
>>>
>>> iothread:                     vcpu:
>>> send_irq()
>>> lock(pir)
>>> check pir and irr
>>> set pir
>>> send IPI (*)
>>> unlock(pir)
>>>
>>> send_irq()
>>> lock(pir)
>>>                               receive IPI (*)
>>>                               atomic {
>>>                                       pir_tmp = pir
>>>                                       pir = 0
>>>                               }
>>> check pir and irr
>>>                               irr |= pir_tmp
>>> set pir
>>> send IPI
>>> unlock(pir)
>>>
>>> At this point both pir and irr are set and interrupt may be coalesced,
>>> but it is reported as delivered.
>>
>> s/"set pir"/"injected = !t_a_s(pir)"/
>>
>>> So what prevents the scenario above from happening?
>>>
>>> --
>>>			Gleb.

Best regards,
Yang
On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> Avi Kivity wrote on 2013-02-25:
> > I didn't really follow, but is the root cause the need to keep track
> > of interrupt coalescing? If so we can recommend that users use
> > KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
> > with irq coalescing support to vcpu context.
> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted interrupt is enabled, to force users not to use KVM_IRQ_LINE_STATUS. Is this acceptable?
>
> The only case in KVM that needs to know the interrupt injection status is the vlapic timer. But since the vlapic timer and the vcpu are always on the same pcpu, there is no problem.
>
Not really. The primary user of this interface is RTC interrupt
re-injection for Windows guests.

--
			Gleb.
Gleb Natapov wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
>> Avi Kivity wrote on 2013-02-25:
>>> I didn't really follow, but is the root cause the need to keep track
>>> of interrupt coalescing? If so we can recommend that users use
>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
>>> with irq coalescing support to vcpu context.
>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> interrupt is enabled, to force users not to use KVM_IRQ_LINE_STATUS. Is
> this acceptable?
>>
>> The only case in KVM that needs to know the interrupt injection status is the vlapic
> timer. But since the vlapic timer and the vcpu are always on the same pcpu, there is
> no problem.
>>
> Not really. The primary user of this interface is RTC interrupt
> re-injection for Windows guests.

So without the KVM_IRQ_LINE_STATUS capability, RTC cannot work well?

Best regards,
Yang
On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-02-25:
> > On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> >> Avi Kivity wrote on 2013-02-25:
> >>> I didn't really follow, but is the root cause the need to keep track
> >>> of interrupt coalescing? If so we can recommend that users use
> >>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt injection
> >>> with irq coalescing support to vcpu context.
> >> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> > interrupt is enabled, to force users not to use KVM_IRQ_LINE_STATUS. Is
> > this acceptable?
> >>
> >> The only case in KVM that needs to know the interrupt injection status is the vlapic
> > timer. But since the vlapic timer and the vcpu are always on the same pcpu, there is
> > no problem.
> >>
> > Not really. The primary user of this interface is RTC interrupt
> > re-injection for Windows guests.
> So without the KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
>
Windows guests may experience time drift under CPU overcommit scenarios.

--
			Gleb.
Gleb Natapov wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-02-25:
>>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
>>>> Avi Kivity wrote on 2013-02-25:
>>>>> I didn't really follow, but is the root cause the need to keep track
>>>>> of interrupt coalescing? If so we can recommend that users use
>>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
>>>>> injection with irq coalescing support to vcpu context.
>>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
>>> interrupt is enabled, to force users not to use
>>> KVM_IRQ_LINE_STATUS. Is this acceptable?
>>>>
>>>> The only case in KVM that needs to know the interrupt injection status is the
> vlapic
>>> timer. But since the vlapic timer and the vcpu are always on the same pcpu,
>>> there is no problem.
>>>>
>>> Not really. The primary user of this interface is RTC interrupt
>>> re-injection for Windows guests.
>> So without the KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
>>
> Windows guests may experience time drift under CPU overcommit scenarios.

Ok, I see. Seems we are stuck. :(
Do you have any suggestions to solve or work around the current problem?

Best regards,
Yang
On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-02-25:
> > On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-02-25:
> >>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> >>>> Avi Kivity wrote on 2013-02-25:
> >>>>> I didn't really follow, but is the root cause the need to keep track
> >>>>> of interrupt coalescing? If so we can recommend that users use
> >>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
> >>>>> injection with irq coalescing support to vcpu context.
> >>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> >>> interrupt is enabled, to force users not to use
> >>> KVM_IRQ_LINE_STATUS. Is this acceptable?
> >>>>
> >>>> The only case in KVM that needs to know the interrupt injection status is
> > the vlapic
> >>> timer. But since the vlapic timer and the vcpu are always on the same pcpu,
> >>> there is no problem.
> >>>>
> >>> Not really. The primary user of this interface is RTC interrupt
> >>> re-injection for Windows guests.
> >> So without the KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
> >>
> > Windows guests may experience time drift under CPU overcommit scenarios.
> Ok, I see. Seems we are stuck. :(
> Do you have any suggestions to solve or work around the current problem?

Depend on knowledge about the atomicity (item 5, IIRC) of the sequence
in the manual.
Marcelo Tosatti wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-02-25:
>>> On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-02-25:
>>>>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
>>>>>> Avi Kivity wrote on 2013-02-25:
>>>>>>> I didn't really follow, but is the root cause the need to keep track
>>>>>>> of interrupt coalescing? If so we can recommend that users use
>>>>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
>>>>>>> injection with irq coalescing support to vcpu context.
>>>>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when
> posted
>>>>> interrupt is enabled, to force users not to use
>>>>> KVM_IRQ_LINE_STATUS. Is this acceptable?
>>>>>>
>>>>>> The only case in KVM that needs to know the interrupt injection status is
>>> the vlapic
>>>>> timer. But since the vlapic timer and the vcpu are always on the same pcpu,
>>>>> there is no problem.
>>>>>>
>>>>> Not really. The primary user of this interface is RTC interrupt
>>>>> re-injection for Windows guests.
>>>> So without the KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
>>>>
>>> Windows guests may experience time drift under CPU overcommit scenarios.
>> Ok, I see. Seems we are stuck. :(
>> Do you have any suggestions to solve or work around the current problem?
>
> Depend on knowledge about the atomicity (item 5, IIRC) of the sequence
> in the manual.

I am sure it is not atomic:

	tmp_pir = xchg(pir, 0)
	irr |= tmp_pir

Best regards,
Yang
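Spelled out per 32-bit PIR chunk, the window sits between the two operations; each is atomic on its own, but the pair is not:

	pir_val = xchg(&pir[i], 0);	/* (1) PIR chunk claimed and cleared */
	/*
	 * Window: another cpu checking "is PIR set? is IRR set?" here
	 * sees both clear and concludes the vector can be injected anew.
	 */
	irr_word |= pir_val;		/* (2) only now does IRR reflect it */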
On Mon, Feb 25, 2013 at 06:55:58AM +0000, Zhang, Yang Z wrote:
> >
> > p1)
> >
> >>> cpu0                        cpu1                 vcpu0
> >>> test_and_set_bit(PIR-A)
> >>> set KVM_REQ_EVENT
> >>>                                                  process REQ_EVENT
> >>>                                                  PIR-A->IRR
> >>>
> >>>                                                  vcpu->mode=IN_GUEST
> >>>
> >>> if (vcpu0->guest_mode)
> >>>         if (!t_a_s_bit(PIR notif))
> >>>                 send IPI
> >>>                                                  linux_pir_handler
> >>>
> >>>                             t_a_s_b(PIR-B)=1
> >>>                             no PIR IPI sent
> >
> > p2)
> >
> >> No, this exists with your implementation, not mine.
> >> As I said, in this patch, it will make the request after vcpu->mode == guest mode, then send the
> > ipi. Even if the ipi is lost, the request will still be tracked, because we have the
> > following check:
> >> vcpu->mode = GUEST_MODE
> >> (ipi may arrive here and be lost)
> >> local irq disable
> >> check request (this will ensure the pir modification is picked up by the vcpu
> > before vmentry)
> >
> > Please read the sequence above again, between p1) and p2). Yes, if the
> > IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
> > to be processed, but not the request for another cpu (cpu1).
> If no PIR IPI was sent by cpu1, then delivered is false, and it rolls back to the old logic:
> __apic_accept_irq():
>	if (!delivered) {
>		kvm_make_request(KVM_REQ_EVENT, vcpu);
>		kvm_vcpu_kick(vcpu);
>	}
> This can solve the problem you mentioned above. Right?

Should not be doing this kick in the first place. One result of it is
that you'll always take a VM-exit if a second injection happens while a VCPU
has not handled the first one.

What is the drawback of the suggestion to unconditionally clear the PIR
notification bit on VM-entry?

We can avoid it, but let's get it correct first (BTW, can stick a comment
saying FIXME: optimize) on that PIR clearing.

> > cpu0
> >
> > check pir(pass)
> > check irr(pass)
> > injected = !test_and_set_bit(pir)
> >
> > versus
> >
> > cpu1
> > xchg(pir)
> >
> > No information can be lost because all accesses to shared data are
> > atomic.
> Sorry, I cannot get your point. Why can 'xchg(pir)' solve the problem I mentioned above? Can you elaborate?
> The spinlock I used is trying to protect the whole procedure of syncing pir to irr, not just access to pir.

You're right, it's the same problem as with the hardware update.
On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
> Gleb Natapov wrote on 2013-02-25:
> > On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> >> Gleb Natapov wrote on 2013-02-25:
> >>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> >>>> Avi Kivity wrote on 2013-02-25:
> >>>>> I didn't really follow, but is the root cause the need to keep track
> >>>>> of interrupt coalescing? If so we can recommend that users use
> >>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
> >>>>> injection with irq coalescing support to vcpu context.
> >>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> >>> interrupt is enabled, to force users not to use
> >>> KVM_IRQ_LINE_STATUS. Is this acceptable?
> >>>>
> >>>> The only case in KVM that needs to know the interrupt injection status is
> > the vlapic
> >>> timer. But since the vlapic timer and the vcpu are always on the same pcpu,
> >>> there is no problem.
> >>>>
> >>> Not really. The primary user of this interface is RTC interrupt
> >>> re-injection for Windows guests.
> >> So without the KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
> >>
> > Windows guests may experience time drift under CPU overcommit scenarios.
> Ok, I see. Seems we are stuck. :(
> Do you have any suggestions to solve or work around the current problem?
>
I see a couple of possible solutions:

1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
   current QEMU uses KVM_IRQ_LINE_STATUS always, and it means that it
   will be slow on newer kernels.

2. Make KVM_IRQ_LINE_STATUS report coalescing only when the vcpu is not
   running during injection. This assumes that if the vcpu is running and
   does not process the interrupt it is the guest's fault, and the same can
   happen on real HW too. Coalescing when the vcpu is not running, though,
   is the result of CPU overcommit and should be reported. Cons: the
   interface definition is kind of murky.

3. Do not report the KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
   notifiers for interrupt reinjection. This requires us to add an interface
   for reporting EOI to userspace. This is not in the scope of this
   patchset. Cons: need to introduce a new interface (and one that will
   not work on AMD, BTW).

Other ideas?

--
			Gleb.
On Mon, Feb 25, 2013 at 03:34:19PM +0200, Gleb Natapov wrote:
> On Mon, Feb 25, 2013 at 11:13:25AM +0000, Zhang, Yang Z wrote:
> > Gleb Natapov wrote on 2013-02-25:
> > > On Mon, Feb 25, 2013 at 11:04:25AM +0000, Zhang, Yang Z wrote:
> > >> Gleb Natapov wrote on 2013-02-25:
> > >>> On Mon, Feb 25, 2013 at 08:42:52AM +0000, Zhang, Yang Z wrote:
> > >>>> Avi Kivity wrote on 2013-02-25:
> > >>>>> I didn't really follow, but is the root cause the need to keep track
> > >>>>> of interrupt coalescing? If so we can recommend that users use
> > >>>>> KVM_IRQ_LINE when coalescing is unneeded, and move interrupt
> > >>>>> injection with irq coalescing support to vcpu context.
> > >>>> So we can hide the capability KVM_CAP_IRQ_INJECT_STATUS when posted
> > >>> interrupt is enabled, to force users not to use
> > >>> KVM_IRQ_LINE_STATUS. Is this acceptable?
> > >>>>
> > >>>> The only case in KVM that needs to know the interrupt injection status is
> > > the vlapic
> > >>> timer. But since the vlapic timer and the vcpu are always on the same pcpu,
> > >>> there is no problem.
> > >>>>
> > >>> Not really. The primary user of this interface is RTC interrupt
> > >>> re-injection for Windows guests.
> > >> So without the KVM_IRQ_LINE_STATUS capability, RTC cannot work well?
> > >>
> > > Windows guests may experience time drift under CPU overcommit scenarios.
> > Ok, I see. Seems we are stuck. :(
> > Do you have any suggestions to solve or work around the current problem?
> >
> I see a couple of possible solutions:
> 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
>    current QEMU uses KVM_IRQ_LINE_STATUS always, and it means that it
>    will be slow on newer kernels.

Can add a capability to QEMU and enable APICv selectively only
in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on the target vcpu
only when necessary (and KVM_IRQ_LINE otherwise).

Even a lock serializing injection is not safe, because the ON bit is cleared
before XCHG(PIR, 0). Must do something heavier (such as running in the
target vcpu context).

> 2. Make KVM_IRQ_LINE_STATUS report coalescing only when the vcpu is not
>    running during injection. This assumes that if the vcpu is running and
>    does not process the interrupt it is the guest's fault, and the same can
>    happen on real HW too. Coalescing when the vcpu is not running, though,
>    is the result of CPU overcommit and should be reported. Cons: the
>    interface definition is kind of murky.
> 3. Do not report the KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
>    notifiers for interrupt reinjection. This requires us to add an interface
>    for reporting EOI to userspace. This is not in the scope of this
>    patchset. Cons: need to introduce a new interface (and one that will
>    not work on AMD, BTW).

Breaks older userspace?

> Other ideas?

Can HW write a 'finished' bit after 6 in the reserved area? I suppose it's
not a KVM-specific problem?
On Mon, Feb 25, 2013 at 11:00:21AM -0300, Marcelo Tosatti wrote:
> > I see a couple of possible solutions:
> > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> >    current QEMU uses KVM_IRQ_LINE_STATUS always, and it means that it
> >    will be slow on newer kernels.
>
> Can add a capability to QEMU and enable APICv selectively only
> in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on the target vcpu
> only when necessary (and KVM_IRQ_LINE otherwise).

Bad idea. What happens with mixed scenarios.

> Even a lock serializing injection is not safe, because the ON bit is cleared
> before XCHG(PIR, 0). Must do something heavier (such as running in the
> target vcpu context).

Note always running in the target vcpu context is likely to be slower than
no-APICv.

So we need to do something heavier in the kernel under serialization, if the
firmware cannot be changed (injection from simultaneous CPUs should be
rare, so if the data used to serialize __accept_apic_irq is cache-line
aligned it should reduce the performance impact).

> > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when the vcpu is not
> >    running during injection. This assumes that if the vcpu is running and
> >    does not process the interrupt it is the guest's fault, and the same can
> >    happen on real HW too. Coalescing when the vcpu is not running, though,
> >    is the result of CPU overcommit and should be reported. Cons: the
> >    interface definition is kind of murky.
> > 3. Do not report the KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> >    notifiers for interrupt reinjection. This requires us to add an interface
> >    for reporting EOI to userspace. This is not in the scope of this
> >    patchset. Cons: need to introduce a new interface (and one that will
> >    not work on AMD, BTW).

Is there no int-ack notification at the RTC HW level?

> Breaks older userspace?
>
> > Other ideas?
>
> Can HW write a 'finished' bit after 6 in the reserved area? I suppose it's
> not a KVM-specific problem?
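If serialization has to stay, one way to soften the cost Marcelo mentions is to keep the serialization data on its own cache line, so contending injectors do not bounce unrelated vcpu state between CPUs. A sketch; the field names come from the patch, the placement is hypothetical:

struct vcpu_vmx {
	/* ... other fields ... */

	/*
	 * Posted-interrupt descriptor and its serialization lock on a
	 * dedicated cache line, touched only by the injection path.
	 */
	struct pi_desc pi_desc ____cacheline_aligned;
	spinlock_t pi_lock;
};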
Marcelo Tosatti wrote on 2013-02-25:
> On Mon, Feb 25, 2013 at 06:55:58AM +0000, Zhang, Yang Z wrote:
>>>
>>> p1)
>>>
>>>>> cpu0                        cpu1                 vcpu0
>>>>> test_and_set_bit(PIR-A)
>>>>> set KVM_REQ_EVENT
>>>>>                                                  process REQ_EVENT
>>>>>                                                  PIR-A->IRR
>>>>>
>>>>>                                                  vcpu->mode=IN_GUEST
>>>>>
>>>>> if (vcpu0->guest_mode)
>>>>>         if (!t_a_s_bit(PIR notif))
>>>>>                 send IPI
>>>>>                                                  linux_pir_handler
>>>>>
>>>>>                             t_a_s_b(PIR-B)=1
>>>>>                             no PIR IPI sent
>>>
>>> p2)
>>>
>>>> No, this exists with your implementation, not mine.
>>>> As I said, in this patch, it will make the request after vcpu->mode == guest mode, then
> send the ipi. Even if the ipi is lost, the request will still be tracked,
> because we have the following check:
>>>> vcpu->mode = GUEST_MODE
>>>> (ipi may arrive here and be lost)
>>>> local irq disable
>>>> check request (this will ensure the pir modification is picked up by the vcpu
>>> before vmentry)
>>>
>>> Please read the sequence above again, between p1) and p2). Yes, if the
>>> IPI is lost, the request _for the CPU which sent PIR IPI_ is guaranteed
>>> to be processed, but not the request for another cpu (cpu1).
>> If no PIR IPI was sent by cpu1, then delivered is false, and it rolls back to the old logic:
>> __apic_accept_irq():
>>	if (!delivered) {
>>		kvm_make_request(KVM_REQ_EVENT, vcpu);
>>		kvm_vcpu_kick(vcpu);
>>	}
>> This can solve the problem you mentioned above. Right?
>
> Should not be doing this kick in the first place. One result of it is
> that you'll always take a VM-exit if a second injection happens while a VCPU
> has not handled the first one.

You are right.

> What is the drawback of the suggestion to unconditionally clear the PIR
> notification bit on VM-entry?

The only thing is that we need to traverse the whole pir when calling sync_pir_to_irr, even when the pir is empty.

> We can avoid it, but let's get it correct first (BTW, can stick a comment
> saying FIXME: optimize) on that PIR clearing.

Ok, I will adopt your suggestion. BTW, where should the comment be added? On sync_pir_to_irr()?

>>> cpu0
>>>
>>> check pir(pass)
>>> check irr(pass)
>>> injected = !test_and_set_bit(pir)
>>>
>>> versus
>>>
>>> cpu1
>>> xchg(pir)
>>>
>>> No information can be lost because all accesses to shared data are
>>> atomic.
>> Sorry, I cannot get your point. Why can 'xchg(pir)' solve the problem I
>> mentioned above? Can you elaborate? The spinlock I used is trying to
>> protect the whole procedure of syncing pir to irr,
> not just access to pir.
>
> You're right, it's the same problem as with the hardware update.

Best regards,
Yang
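The entry-path variant being agreed on here might look like the following sketch inside vcpu_enter_guest(); pi_clear_on() is an assumed helper, and the request check stands in for the real back-out logic:

	vcpu->mode = IN_GUEST_MODE;
	local_irq_disable();

	/*
	 * FIXME: optimize. Unconditionally clear ON and pull PIR into
	 * IRR, so a notification IPI that raced with entry is never
	 * lost; the cost is a PIR scan even when PIR is empty.
	 */
	pi_clear_on(&vmx->pi_desc);
	kvm_apic_update_irr(vcpu, (u32 *)vmx->pi_desc.pir);

	if (vcpu->requests || need_resched() || signal_pending(current)) {
		/* back out: reset vcpu->mode, enable irqs and retry */
	}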
> I see a couple of possible solutions:
> 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> will be slow on newer kernels

You could backport the qemu change, verify that it builds, and push it
to stable branches. It's hardly ideal, but if nothing else comes up then
it's a solution.

> 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> running during injection. This assumes that if vcpu is running and does
> not process interrupt it is guest fault and the same can happen on real
> HW too. Coalescing when vcpu is not running though is the result of CPU
> overcommit and should be reported. Cons: interface definition is kind
> of murky.

You still have a window where the vcpu is scheduled out with guest
interrupts disabled, then scheduled back in, and before it manages to
handle the interrupt, the second one hits.

It's worth testing though.

> 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> notifiers for interrupt reinjection. This requires us to add interface
> for reporting EOI to userspace. This is not in the scope of this
> patchset. Cons: need to introduce new interface (and the one that will
> not work on AMD BTW)
>
> Other ideas?

1. inject RTC interrupt
2. not see EOI
3. inject RTC interrupt
4. due to 2, report coalesced

AMD can still use IRR test-and-set.
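[The four steps above amount to a small state machine. A sketch with
hypothetical names -- this is not an existing KVM interface, just an
illustration of the proposed logic.]

/* Sketch: EOI-based coalescing detection for the RTC interrupt. */
struct rtc_coalesce_sketch {
	bool eoi_pending;	/* injected, EOI not yet seen */
};

/* Steps 1 and 3: called on each RTC injection attempt.
 * Returns 1 if delivered, 0 if coalesced. */
static int rtc_inject_sketch(struct rtc_coalesce_sketch *s)
{
	if (s->eoi_pending)
		return 0;	/* step 4: no EOI since last inject */
	s->eoi_pending = true;
	return 1;
}

/* Step 2's observation point: called from the EOI/ack path. */
static void rtc_eoi_sketch(struct rtc_coalesce_sketch *s)
{
	s->eoi_pending = false;
}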
On Mon, Feb 25, 2013 at 11:17:39AM -0300, Marcelo Tosatti wrote:
> On Mon, Feb 25, 2013 at 11:00:21AM -0300, Marcelo Tosatti wrote:
> > > I see a couple of possible solutions:
> > > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> > > current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> > > will be slow on newer kernels
> > 
> > Can add a capability to QEMU and enable APICv selectively only
> > in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on target vcpu
> > only when necessary (and KVM_IRQ_LINE otherwise).
> 
> Bad idea. What happens with mixed scenarios?
> 
Exactly.

> > Even a lock serializing injection is not safe because ON bit is cleared
> > before XCHG(PIR, 0). Must do something heavier (such as running on
> > target vcpu context).
> 
> Note always running on the target vcpu is likely to be slower than
> no-APICv.
> 
Yes, but we would do it only for the RTC interrupt. Still, the
performance hit should be very visible when the RTC is in 1000Hz mode.

> So we need to do something heavier in the kernel, under serialization,
> if firmware cannot be changed (injection from simultaneous CPUs should
> be rare, so if the data used to serialize __apic_accept_irq is
> cache-line aligned it should reduce the performance impact).
> 
> > > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> > > running during injection. This assumes that if vcpu is running and does
> > > not process interrupt it is guest fault and the same can happen on real
> > > HW too. Coalescing when vcpu is not running though is the result of CPU
> > > overcommit and should be reported. Cons: interface definition is kind
> > > of murky.
> > > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > > notifiers for interrupt reinjection. This requires us to add interface
> > > for reporting EOI to userspace. This is not in the scope of this
> > > patchset. Cons: need to introduce new interface (and the one that will
> > > not work on AMD BTW)
> 
> Is there no int-ack notification at the RTC HW level?
There is, but Windows calls it twice for each injected interrupt. I
tried to use it in the past to detect interrupt coalescing, but this
double ack prevented me from doing so. Maybe revisit this approach with
willingness to be more hacky about it. Also, it is possible to disable
the RTC ack for HyperV guests. We do not do it, and if we start using
the ack we will not be able to.

> Breaks older userspace?
Older userspace will have time drift with Windows, yes. Note that with
some versions of Windows it has it now too.

> > > Other ideas?
> 
> Can HW write a 'finished' bit after 6 in the reserved area? Suppose it's
> not a KVM-specific problem?
> 
I still think adding locking just to obtain correct injection status is
a bad trade-off even if HW would allow us to get away with it.

--
			Gleb.
On Mon, Feb 25, 2013 at 06:50:40PM +0200, Avi Kivity wrote:
> > I see a couple of possible solutions:
> > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> > current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> > will be slow on newer kernels
> 
> You could backport the qemu change, verify that it builds, and push it
> to stable branches. It's hardly ideal, but if nothing else comes up
> then it's a solution.
> 
> > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> > running during injection. This assumes that if vcpu is running and does
> > not process interrupt it is guest fault and the same can happen on real
> > HW too. Coalescing when vcpu is not running though is the result of CPU
> > overcommit and should be reported. Cons: interface definition is kind
> > of murky.
> 
> You still have a window where the vcpu is scheduled out with guest
> interrupts disabled, then scheduled back in, and before it manages to
> handle the interrupt, the second one hits.
> 
Yes, definitely not an ideal solution.

> It's worth testing though.
> 
Yes again. Windows almost never disables interrupts, and the RTC
interrupt is of the highest priority.

> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > notifiers for interrupt reinjection. This requires us to add interface
> > for reporting EOI to userspace. This is not in the scope of this
> > patchset. Cons: need to introduce new interface (and the one that will
> > not work on AMD BTW)
> >
> > Other ideas?
> 
> 1. inject RTC interrupt
> 2. not see EOI
> 3. inject RTC interrupt
> 4. due to 2, report coalesced
> 
That's the 3 in my list, no?

> AMD can still use IRR test-and-set.

--
			Gleb.
On Mon, Feb 25, 2013 at 7:43 PM, Gleb Natapov <gleb@redhat.com> wrote:
> >> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> >> > notifiers for interrupt reinjection. This requires us to add interface
> >> > for reporting EOI to userspace. This is not in the scope of this
> >> > patchset. Cons: need to introduce new interface (and the one that will
> >> > not work on AMD BTW)
> >> >
> >> > Other ideas?
> >>
> >> 1. inject RTC interrupt
> >> 2. not see EOI
> >> 3. inject RTC interrupt
> >> 4. due to 2, report coalesced
> >>
> That's the 3 in my list, no?

No, this idea doesn't involve user interface changes. We still report
through KVM_IRQ_LINE_STATUS or whatever it's called.
On Mon, Feb 25, 2013 at 08:56:07PM +0200, Avi Kivity wrote:
> On Mon, Feb 25, 2013 at 7:43 PM, Gleb Natapov <gleb@redhat.com> wrote:
> > >> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > >> > notifiers for interrupt reinjection. This requires us to add interface
> > >> > for reporting EOI to userspace. This is not in the scope of this
> > >> > patchset. Cons: need to introduce new interface (and the one that will
> > >> > not work on AMD BTW)
> > >> >
> > >> > Other ideas?
> > >>
> > >> 1. inject RTC interrupt
> > >> 2. not see EOI
> > >> 3. inject RTC interrupt
> > >> 4. due to 2, report coalesced
> > >>
> > > That's the 3 in my list, no?
> 
> No, this idea doesn't involve user interface changes. We still report
> through KVM_IRQ_LINE_STATUS or whatever it's called.

Interesting idea!

--
			Gleb.
On Mon, Feb 25, 2013 at 07:40:07PM +0200, Gleb Natapov wrote:
> On Mon, Feb 25, 2013 at 11:17:39AM -0300, Marcelo Tosatti wrote:
> > On Mon, Feb 25, 2013 at 11:00:21AM -0300, Marcelo Tosatti wrote:
> > > > I see a couple of possible solutions:
> > > > 1. Do what Avi said. Make KVM_IRQ_LINE_STATUS be synchronous. Cons:
> > > > current QEMU uses KVM_IRQ_LINE_STATUS always and it means that it
> > > > will be slow on newer kernels
> > > 
> > > Can add a capability to QEMU and enable APICv selectively only
> > > in newer QEMU, which can issue KVM_IRQ_LINE_STATUS on target vcpu
> > > only when necessary (and KVM_IRQ_LINE otherwise).
> > 
> > Bad idea. What happens with mixed scenarios?
> > 
> Exactly.
> 
> > > Even a lock serializing injection is not safe because ON bit is cleared
> > > before XCHG(PIR, 0). Must do something heavier (such as running on
> > > target vcpu context).
> > 
> > Note always running on the target vcpu is likely to be slower than
> > no-APICv.
> > 
> Yes, but we would do it only for the RTC interrupt. Still, the
> performance hit should be very visible when the RTC is in 1000Hz mode.

So older qemu-kvm on APICv-enabled hardware becomes slow? If that is
acceptable, fine.

> > So we need to do something heavier in the kernel, under serialization,
> > if firmware cannot be changed (injection from simultaneous CPUs should
> > be rare, so if the data used to serialize __apic_accept_irq is
> > cache-line aligned it should reduce the performance impact).
> > 
> > > > 2. Make KVM_IRQ_LINE_STATUS report coalescing only when vcpu is not
> > > > running during injection. This assumes that if vcpu is running and does
> > > > not process interrupt it is guest fault and the same can happen on real
> > > > HW too. Coalescing when vcpu is not running though is the result of CPU
> > > > overcommit and should be reported. Cons: interface definition is kind
> > > > of murky.
> > > > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > > > notifiers for interrupt reinjection. This requires us to add interface
> > > > for reporting EOI to userspace. This is not in the scope of this
> > > > patchset. Cons: need to introduce new interface (and the one that will
> > > > not work on AMD BTW)
> > 
> > Is there no int-ack notification at the RTC HW level?
> There is, but Windows calls it twice for each injected interrupt. I
> tried to use it in the past to detect interrupt coalescing, but this
> double ack prevented me from doing so. Maybe revisit this approach with
> willingness to be more hacky about it. Also, it is possible to disable
> the RTC ack for HyperV guests. We do not do it, and if we start using
> the ack we will not be able to.

Is it OK to kill the ability to have coalesced information?

> > Breaks older userspace?
> Older userspace will have time drift with Windows, yes. Note that with
> some versions of Windows it has it now too.
> 
> > > > Other ideas?
> > 
> > Can HW write a 'finished' bit after 6 in the reserved area? Suppose it's
> > not a KVM-specific problem?
> > 
> I still think adding locking just to obtain correct injection status is
> a bad trade-off even if HW would allow us to get away with it.

Perhaps with a notification at the end of the copy it could be lockless.
With the current scheme, it is not possible to get it right because the
notification bit disappears temporarily from sight. And it is not
possible to distinguish between 'interrupt injected' and 'interrupt
merged', as discussed here.

So we would have to serialize completely, along the lines of: lock, and
only inject if we can identify that the interrupt handler is not
running.

If it's possible to drop KVM_IRQ_LINE_STATUS, then demanding that APICv
HW use recent software is fine (I did not grasp Avi's second idea).
On Mon, Feb 25, 2013 at 09:01:02PM +0200, Gleb Natapov wrote:
> On Mon, Feb 25, 2013 at 08:56:07PM +0200, Avi Kivity wrote:
> > On Mon, Feb 25, 2013 at 7:43 PM, Gleb Natapov <gleb@redhat.com> wrote:
> > > >> > 3. Do not report KVM_IRQ_LINE_STATUS capability and move RTC to use EOI
> > > >> > notifiers for interrupt reinjection. This requires us to add interface
> > > >> > for reporting EOI to userspace. This is not in the scope of this
> > > >> > patchset. Cons: need to introduce new interface (and the one that will
> > > >> > not work on AMD BTW)
> > > >> >
> > > >> > Other ideas?
> > > >>
> > > >> 1. inject RTC interrupt
> > > >> 2. not see EOI
> > > >> 3. inject RTC interrupt
> > > >> 4. due to 2, report coalesced
> > > >>
> > > That's the 3 in my list, no?
> > 
> > No, this idea doesn't involve user interface changes. We still report
> > through KVM_IRQ_LINE_STATUS or whatever it's called.
> Interesting idea!
> 
But I do not see how to implement it efficiently without an interface
change. The idea is basically to register an ACK notifier for the RTC
interrupt but terminate it in the kernel instead of reporting to
userspace. The kernel should somehow know what GSI to register the ACK
notifier for. There are a couple of solutions:

1. New interface to tell what GSI to track.
   - Cons: KVM_IRQ_LINE_STATUS will stop working for old QEMUs that do
     not use it. New special purpose API.
2. Register ACK notifier only for RTC.
   - Cons: KVM_IRQ_LINE_STATUS will work only for RTC. This is the only
     thing that uses it currently, but such a change will make it a
     one-trick pony officially.
3. Register ACK notifiers for all edge interrupts in IOAPIC.
   - Cons: with APICv an edge interrupt does not require EOI to do a
     vmexit. Registering ACK notifiers for all of them will make them
     all do vmexits.
4. A combination of 1 and 3. Register ACK notifiers for all edge
   interrupts in IOAPIC and provide an API to drop unneeded notifiers.
   - Cons: New special purpose API.

Thoughts?

--
			Gleb.
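[For concreteness, option 2 could look roughly like the sketch below.
The kvm_irq_ack_notifier structure and kvm_register_irq_ack_notifier()
are existing KVM infrastructure of this era; the RTC_GSI value and the
*_sketch names are assumptions of this illustration.]

/* Sketch: terminate the RTC ACK in the kernel (option 2). */
#define RTC_GSI 8	/* assumed: RTC on GSI 8 of the IOAPIC */

struct rtc_status_sketch {
	struct kvm_irq_ack_notifier kian;
	atomic_t pending;	/* injected but not yet ACKed */
};

static void rtc_irq_acked_sketch(struct kvm_irq_ack_notifier *kian)
{
	struct rtc_status_sketch *s =
		container_of(kian, struct rtc_status_sketch, kian);

	/* Guest ACKed the RTC interrupt; next injection is fresh. */
	atomic_set(&s->pending, 0);
}

static void rtc_status_init_sketch(struct kvm *kvm,
				   struct rtc_status_sketch *s)
{
	s->kian.gsi = RTC_GSI;
	s->kian.irq_acked = rtc_irq_acked_sketch;
	kvm_register_irq_ack_notifier(kvm, &s->kian);
}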
On Tue, Feb 26, 2013 at 10:12 AM, Gleb Natapov <gleb@redhat.com> wrote:
> But I do not see how to implement it efficiently without an interface
> change. The idea is basically to register an ACK notifier for the RTC
> interrupt but terminate it in the kernel instead of reporting to
> userspace. The kernel should somehow know what GSI to register the ACK
> notifier for.

Right, my idea does not help at all.

> There are a couple of solutions:
> 1. New interface to tell what GSI to track.
>    - Cons: KVM_IRQ_LINE_STATUS will stop working for old QEMUs that do
>      not use it. New special purpose API.
> 2. Register ACK notifier only for RTC.
>    - Cons: KVM_IRQ_LINE_STATUS will work only for RTC. This is the only
>      thing that uses it currently, but such a change will make it a
>      one-trick pony officially.
> 3. Register ACK notifiers for all edge interrupts in IOAPIC.
>    - Cons: with APICv an edge interrupt does not require EOI to do a
>      vmexit. Registering ACK notifiers for all of them will make them
>      all do vmexits.
> 4. A combination of 1 and 3. Register ACK notifiers for all edge
>    interrupts in IOAPIC and provide an API to drop unneeded notifiers.
>    - Cons: New special purpose API.

5. Make KVM_IRQ_LINE_STATUS register an ack notifier, document it as
more expensive than KVM_IRQ_LINE, and change qemu to use KVM_IRQ_LINE
when it is sufficient.

Pros: fully backwards compatible
Cons: to realize full performance, need to patch qemu
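[Option 5 keeps the ioctl itself as the opt-in signal. Roughly, as a
sketch: gsi_has_ack_notifier() and register_status_notifier() are
hypothetical helpers of this illustration, not existing KVM functions;
kvm_set_irq() and KVM_USERSPACE_IRQ_SOURCE_ID are the real ones.]

/* Sketch: first KVM_IRQ_LINE_STATUS on a GSI lazily opts it in. */
static int kvm_irq_line_status_sketch(struct kvm *kvm, u32 gsi, int level)
{
	if (!gsi_has_ack_notifier(kvm, gsi))		/* hypothetical */
		register_status_notifier(kvm, gsi);	/* hypothetical */

	/* Coalescing status is now derivable from the notifier state;
	 * plain KVM_IRQ_LINE never pays this cost. */
	return kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, gsi, level);
}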
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 40afa00..9bd4eca 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -19,6 +19,10 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 
+#ifdef CONFIG_HAVE_KVM
+BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
+#endif
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eb92a6e..cebef02 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,6 +28,7 @@
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
+extern void kvm_posted_intr_ipi(void);
 extern void error_interrupt(void);
 extern void irq_work_interrupt(void);
 
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 1508e51..774dc9f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -102,6 +102,11 @@
  */
 #define X86_PLATFORM_IPI_VECTOR		0xf7
 
+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR		0xf2
+#endif
+
 /*
  * IRQ work vector:
  */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b8388e9..79da55e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -704,6 +704,9 @@ struct kvm_x86_ops {
 	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+	bool (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector,
+			int *result, bool *delivered);
+	void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5c9dbad..ce8ac80 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -158,6 +158,7 @@
 #define PIN_BASED_EXT_INTR_MASK			0x00000001
 #define PIN_BASED_NMI_EXITING			0x00000008
 #define PIN_BASED_VIRTUAL_NMIS			0x00000020
+#define PIN_BASED_POSTED_INTR			0x00000080
 
 #define VM_EXIT_SAVE_DEBUG_CONTROLS		0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE		0x00000200
@@ -180,6 +181,7 @@
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID            = 0x00000000,
+	POSTED_INTR_NV                  = 0x00000002,
 	GUEST_ES_SELECTOR               = 0x00000800,
 	GUEST_CS_SELECTOR               = 0x00000802,
 	GUEST_SS_SELECTOR               = 0x00000804,
@@ -214,6 +216,8 @@ enum vmcs_field {
 	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
 	APIC_ACCESS_ADDR		= 0x00002014,
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+	POSTED_INTR_DESC_ADDR           = 0x00002016,
+	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
 	EOI_EXIT_BITMAP0                = 0x0000201c,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 70641af..781bc06 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1177,6 +1177,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \
 apicinterrupt X86_PLATFORM_IPI_VECTOR \
 	x86_platform_ipi smp_x86_platform_ipi
 
+#ifdef CONFIG_HAVE_KVM
+apicinterrupt POSTED_INTR_VECTOR \
+	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+#endif
+
 apicinterrupt THRESHOLD_APIC_VECTOR \
 	threshold_interrupt smp_threshold_interrupt
 apicinterrupt THERMAL_APIC_VECTOR \
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e4595f1..64616cc 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -228,6 +228,26 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+#ifdef CONFIG_HAVE_KVM
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	irq_enter();
+
+	exit_idle();
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+#endif
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 6e03b0d..2329a54 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -205,6 +205,10 @@ static void __init apic_intr_init(void)
 	/* IPI for X86 platform specific use */
 	alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
 
+#ifdef CONFIG_HAVE_KVM
+	/* IPI for KVM to deliver posted interrupt */
+	alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+#endif
 
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 02b51dd..9e67a9d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -357,6 +357,25 @@ static u8 count_vectors(void *bitmap)
 	return count;
 }
 
+int kvm_apic_test_irr(int vec, struct kvm_lapic *apic)
+{
+	return apic_test_vector(vec, apic->regs + APIC_IRR);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_test_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+	u32 i, pir_val;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	for (i = 0; i <= 7; i++) {
+		pir_val = xchg(&pir[i], 0);
+		if (pir_val)
+			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
 {
 	apic->irr_pending = true;
@@ -379,6 +398,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 	if (!apic->irr_pending)
 		return -1;
 
+	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
 	result = apic_search_irr(apic);
 	ASSERT(result == -1 || result >= 16);
 
@@ -685,6 +705,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 {
 	int result = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
+	bool delivered = false;
 
 	switch (delivery_mode) {
 	case APIC_DM_LOWEST:
@@ -700,7 +721,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		result = !apic_test_and_set_irr(vector, apic);
+		if (!kvm_x86_ops->deliver_posted_interrupt(vcpu, vector,
+						&result, &delivered))
+			result = !apic_test_and_set_irr(vector, apic);
+
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
 					  trig_mode, vector, !result);
 		if (!result) {
@@ -710,8 +734,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			break;
 		}
 
-		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		kvm_vcpu_kick(vcpu);
+		if (!delivered) {
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
+			kvm_vcpu_kick(vcpu);
+		}
 		break;
 
 	case APIC_DM_REMRD:
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34..53b98ac 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -157,5 +157,7 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
 void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 				struct kvm_lapic_irq *irq,
 				u64 *eoi_bitmap);
+int kvm_apic_test_irr(int vec, struct kvm_lapic *apic);
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a7d60d7..9e705e3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3591,6 +3591,17 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
 	return;
 }
 
+static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+	return;
+}
+
+static bool svm_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+		int vector, int *result, bool *delivered)
+{
+	return false;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4319,6 +4330,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.vm_has_apicv = svm_vm_has_apicv,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.hwapic_isr_update = svm_hwapic_isr_update,
+	.sync_pir_to_irr = svm_sync_pir_to_irr,
+	.deliver_posted_interrupt = svm_deliver_posted_interrupt,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 12a91b0..a06364b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,7 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv_reg_vid;
+static bool __read_mostly enable_apicv = 1;
+module_param(enable_apicv, bool, S_IRUGO);
 
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
@@ -365,6 +366,41 @@ struct nested_vmx {
 	struct page *apic_access_page;
 };
 
+#define POSTED_INTR_ON  0
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+	u32 pir[8];     /* Posted interrupt requested */
+	union {
+		struct {
+			u8 on:1,
+			   rsvd:7;
+		} control;
+		u32 rsvd[8];
+	} u;
+} __aligned(64);
+
+static bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+	return test_and_set_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->u.control);
+}
+
+static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+	return test_and_clear_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->u.control);
+}
+
+static int pi_test_pir(int vector, struct pi_desc *pi_desc)
+{
+	return test_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static void pi_set_pir(int vector, struct pi_desc *pi_desc)
+{
+	__set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
 struct vcpu_vmx {
 	struct kvm_vcpu vcpu;
 	unsigned long host_rsp;
@@ -429,6 +465,10 @@ struct vcpu_vmx {
 
 	bool rdtscp_enabled;
 
+	/* Posted interrupt descriptor */
+	struct pi_desc pi_desc;
+	spinlock_t pi_lock;
+
 	/* Support for a guest hypervisor (nested VMX) */
 	struct nested_vmx nested;
 };
@@ -783,6 +823,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+	return cpu_has_vmx_apic_register_virt() &&
+		cpu_has_vmx_virtual_intr_delivery() &&
+		cpu_has_vmx_posted_intr();
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -2530,12 +2582,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	u32 _vmexit_control = 0;
 	u32 _vmentry_control = 0;
 
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-				&_pin_based_exec_control) < 0)
-		return -EIO;
-
 	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
@@ -2612,6 +2658,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_vmexit_control) < 0)
 		return -EIO;
 
+	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+				&_pin_based_exec_control) < 0)
+		return -EIO;
+
+	if (!(_cpu_based_2nd_exec_control &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
+		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
 	min = 0;
 	opt = VM_ENTRY_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
@@ -2790,11 +2847,10 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_ple())
 		ple_gap = 0;
 
-	if (!cpu_has_vmx_apic_register_virt() ||
-				!cpu_has_vmx_virtual_intr_delivery())
-		enable_apicv_reg_vid = 0;
+	if (!cpu_has_vmx_apicv())
+		enable_apicv = 0;
 
-	if (enable_apicv_reg_vid)
+	if (enable_apicv)
 		kvm_x86_ops->update_cr8_intercept = NULL;
 	else
 		kvm_x86_ops->hwapic_irr_update = NULL;
@@ -3871,6 +3927,59 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
 			msr, MSR_TYPE_W);
 }
 
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+	return enable_apicv && irqchip_in_kernel(kvm);
+}
+
+static bool vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+		int vector, int *result, bool *delivered)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long flags;
+
+	if (!vmx_vm_has_apicv(vcpu->kvm))
+		return false;
+
+	spin_lock_irqsave(&vmx->pi_lock, flags);
+	if (pi_test_pir(vector, &vmx->pi_desc) ||
+		kvm_apic_test_irr(vector, vcpu->arch.apic)) {
+		spin_unlock_irqrestore(&vmx->pi_lock, flags);
+		return true;
+	}
+
+	pi_set_pir(vector, &vmx->pi_desc);
+	*result = 1;
+	if (!pi_test_and_set_on(&vmx->pi_desc) &&
+			(vcpu->mode == IN_GUEST_MODE)) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+				POSTED_INTR_VECTOR);
+		*delivered = true;
+	}
+	spin_unlock_irqrestore(&vmx->pi_lock, flags);
+
+	return true;
+}
+
+static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long flags;
+
+	if (!vmx_vm_has_apicv(vcpu->kvm))
+		return;
+
+	spin_lock_irqsave(&vmx->pi_lock, flags);
+	if (!pi_test_and_clear_on(&vmx->pi_desc)) {
+		spin_unlock_irqrestore(&vmx->pi_lock, flags);
+		return;
+	}
+	kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+
+	spin_unlock_irqrestore(&vmx->pi_lock, flags);
+}
+
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
@@ -3931,6 +4040,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }
 
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+	return pin_based_exec_ctrl;
+}
+
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -3948,11 +4066,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
-static int vmx_vm_has_apicv(struct kvm *kvm)
-{
-	return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
-}
-
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -4008,8 +4121,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
 	/* Control */
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-		vmcs_config.pin_based_exec_ctrl);
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -4018,13 +4130,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				vmx_secondary_exec_control(vmx));
 	}
 
-	if (enable_apicv_reg_vid) {
+	if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
 		vmcs_write64(EOI_EXIT_BITMAP2, 0);
 		vmcs_write64(EOI_EXIT_BITMAP3, 0);
 
 		vmcs_write16(GUEST_INTR_STATUS, 0);
+
+		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+		spin_lock_init(&vmx->pi_lock);
 	}
 
 	if (ple_gap) {
@@ -4174,6 +4290,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write64(APIC_ACCESS_ADDR,
 			     page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
 
+	if (vmx_vm_has_apicv(vcpu->kvm))
+		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
+
 	if (vmx->vpid != 0)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
@@ -7639,6 +7758,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
 	.hwapic_irr_update = vmx_hwapic_irr_update,
 	.hwapic_isr_update = vmx_hwapic_isr_update,
+	.sync_pir_to_irr = vmx_sync_pir_to_irr,
+	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
@@ -7742,7 +7863,7 @@ static int __init vmx_init(void)
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
 
-	if (enable_apicv_reg_vid) {
+	if (enable_apicv) {
 		for (msr = 0x800; msr <= 0x8ff; msr++)
 			vmx_disable_intercept_msr_read_x2apic(msr);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1fa37e..62f8c94 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2679,6 +2679,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
+	kvm_x86_ops->sync_pir_to_irr(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
 
 	return 0;
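[For reference, a sketch of the lockless delivery path the review above
converges on: no pi_lock, every shared access a locked RMW, matching
the "all accesses to shared data are atomic" argument in the thread.
The atomic test_and_set_bit() on the PIR stands in for an atomic
counterpart of the patch's __set_bit()-based pi_set_pir(); this is an
illustration, not the posted patch.]

/* Sketch only: lockless posted-interrupt delivery. */
static bool vmx_deliver_posted_interrupt_sketch(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Locked RMW: a bit already set means the vector is still
	 * pending and this injection coalesces with it. */
	if (test_and_set_bit(vector, (unsigned long *)vmx->pi_desc.pir))
		return false;

	/* Only the CPU that flips ON from 0 to 1 notifies the target.
	 * In guest mode the notification is the posted-interrupt IPI
	 * (no VM-exit); otherwise fall back to a request plus kick. */
	if (!pi_test_and_set_on(&vmx->pi_desc) &&
	    vcpu->mode == IN_GUEST_MODE) {
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				    POSTED_INTR_VECTOR);
	} else {
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_vcpu_kick(vcpu);
	}
	return true;
}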