Message ID: 513F1529.3040309@siemens.com (mailing list archive)
State: New, archived
Il 12/03/2013 12:44, Jan Kiszka ha scritto: > A VCPU sending INIT or SIPI to some other VCPU races for setting the > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > was overwritten by kvm_emulate_halt and, thus, got lost. > > This introduces APIC events for those two signals, keeping them in > kvm_apic until kvm_apic_accept_events is run over the target vcpu > context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there > are pending events, thus if vcpu blocking should end. > > The patch comes with the side effect of effectively obsoleting > KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but > immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. > The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED > state. That also means we no longer exit to user space after receiving a > SIPI event. > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > --- > > This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving > this honor to Paolo. > > I didn't try porting any INIT handling for nested on top yet but I > think it should be feasible - once we know their semantics for sure, at > least on Intel... > > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/lapic.c | 41 +++++++++++++++++++++++++++++++------- > arch/x86/kvm/lapic.h | 7 ++++++ > arch/x86/kvm/x86.c | 39 ++++++++++++++++++++---------------- > 4 files changed, 63 insertions(+), 25 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 348d859..2d28e76 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); > int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); > int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); > int kvm_cpu_get_interrupt(struct kvm_vcpu *v); > +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > > void kvm_define_shared_msr(unsigned index, u32 msr); > void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 02b51dd..4a21a6b 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_INIT: > if (!trig_mode || level) { > result = 1; > - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + set_bit(KVM_APIC_INIT, &apic->pending_events); I think this should clear pending SIPIs, unless KVM_APIC_INIT was already set in which case it should be a no-op. Something like: e = apic->pending_events; while (!(e & KVM_APIC_INIT)) e = cmpxchg(&apic->pending_events, e, (e | KVM_APIC_INIT) & ~KVM_APIC_SIPI); If you do this, better make pending_events an atomic_t. 
> kvm_make_request(KVM_REQ_EVENT, vcpu); > kvm_vcpu_kick(vcpu); > } else { > @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_STARTUP: > apic_debug("SIPI to vcpu %d vector 0x%02x\n", > vcpu->vcpu_id, vector); > - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > - result = 1; > - vcpu->arch.sipi_vector = vector; > - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > - kvm_make_request(KVM_REQ_EVENT, vcpu); > - kvm_vcpu_kick(vcpu); > - } > + result = 1; > + apic->sipi_vector = vector; > + /* make sure sipi_vector is visible for the receiver */ > + smp_wmb(); > + set_bit(KVM_APIC_SIPI, &apic->pending_events); > + kvm_make_request(KVM_REQ_EVENT, vcpu); > + kvm_vcpu_kick(vcpu); > break; > > case APIC_DM_EXTINT: > @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) > addr); > } > > +bool kvm_apic_has_events(struct kvm_vcpu *vcpu) > +{ > + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; > +} > + > +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + > + if (!kvm_vcpu_has_lapic(vcpu)) > + return; > + > + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && > + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + vcpu->arch.sipi_vector = apic->sipi_vector; > + pr_debug("vcpu %d received sipi with vector # %x\n", > + vcpu->vcpu_id, vcpu->arch.sipi_vector); > + kvm_lapic_reset(vcpu); > + kvm_vcpu_reset(vcpu); > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > + } > +} > + > void kvm_lapic_init(void) > { > /* do not patch jump label more than once per second */ > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 1676d34..ef3f4ef 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -5,6 +5,9 @@ > > #include <linux/kvm_host.h> > > +#define KVM_APIC_INIT 0 > +#define KVM_APIC_SIPI 1 > + > struct kvm_timer { > struct hrtimer timer; > s64 period; /* unit: ns */ > @@ -32,13 +35,17 @@ struct kvm_lapic { > void *regs; > gpa_t vapic_addr; > struct page *vapic_page; > + unsigned long pending_events; > + unsigned int sipi_vector; > }; > int kvm_create_lapic(struct kvm_vcpu *vcpu); > void kvm_free_lapic(struct kvm_vcpu *vcpu); > > int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); > +bool kvm_apic_has_events(struct kvm_vcpu *vcpu); > int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); > int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); > +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); > void kvm_lapic_reset(struct kvm_vcpu *vcpu); > u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); > void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index b891ac3..a0b8041 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; > > static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); > > -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > - > static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) > { > int i; > @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > bool req_immediate_exit = 0; > > if (vcpu->requests) { > + kvm_apic_accept_events(vcpu); > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + r = 1; > + goto out; > + } > if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) > kvm_mmu_unload(vcpu); > 
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) > @@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > int r; > struct kvm *kvm = vcpu->kvm; > > - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { > - pr_debug("vcpu %d received sipi with vector # %x\n", > - vcpu->vcpu_id, vcpu->arch.sipi_vector); > - kvm_lapic_reset(vcpu); > - kvm_vcpu_reset(vcpu); > - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > - } > - > vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); > r = vapic_enter(vcpu); > if (r) { > @@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); > kvm_vcpu_block(vcpu); > vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); > - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) > - { > + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { > + kvm_apic_accept_events(vcpu); > switch(vcpu->arch.mp_state) { > case KVM_MP_STATE_HALTED: > vcpu->arch.mp_state = > @@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > case KVM_MP_STATE_RUNNABLE: > vcpu->arch.apf.halted = false; > break; > - case KVM_MP_STATE_SIPI_RECEIVED: > + case KVM_MP_STATE_INIT_RECEIVED: > + break; > default: > r = -EINTR; > break; > @@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > ++vcpu->stat.request_irq_exits; > } > > - kvm_check_async_pf_completion(vcpu); > + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) > + kvm_check_async_pf_completion(vcpu); Can you instead do another change that affects the conditions of kvm_check_async_pf_completion? For example, should kvm_arch_interrupt_allowed return zero if the VCPU is in the INIT_RECEIVED state? > > if (signal_pending(current)) { > r = -EINTR; > @@ -6015,6 +6012,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > > if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { > kvm_vcpu_block(vcpu); > + kvm_apic_accept_events(vcpu); > clear_bit(KVM_REQ_UNHALT, &vcpu->requests); > r = -EAGAIN; > goto out; > @@ -6171,6 +6169,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, > int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > struct kvm_mp_state *mp_state) > { > + kvm_apic_accept_events(vcpu); Is this racy against __vcpu_run? > mp_state->mp_state = vcpu->arch.mp_state; > return 0; > } > @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, > struct kvm_mp_state *mp_state) > { > - vcpu->arch.mp_state = mp_state->mp_state; > + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { > + if (!kvm_vcpu_has_lapic(vcpu)) > + return -EINVAL; > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); > + } else > + vcpu->arch.mp_state = mp_state->mp_state; Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? 
> kvm_make_request(KVM_REQ_EVENT, vcpu); > return 0; > } > @@ -6515,7 +6520,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) > kvm_x86_ops->vcpu_free(vcpu); > } > > -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu) > +void kvm_vcpu_reset(struct kvm_vcpu *vcpu) > { > atomic_set(&vcpu->arch.nmi_queued, 0); > vcpu->arch.nmi_pending = 0; > @@ -6988,7 +6993,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) > return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && > !vcpu->arch.apf.halted) > || !list_empty_careful(&vcpu->async_pf.done) > - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED > + || kvm_apic_has_events(vcpu) > || atomic_read(&vcpu->arch.nmi_queued) || > (kvm_arch_interrupt_allowed(vcpu) && > kvm_cpu_has_interrupt(vcpu));
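To make the cmpxchg loop Paolo sketches above (in the APIC_DM_INIT comment) concrete: KVM_APIC_INIT and KVM_APIC_SIPI are defined as bit numbers in the patch (they are used with set_bit/test_and_clear_bit), so the masks in the pseudo-code have to become shifts. A minimal sketch of such a helper, keeping pending_events an unsigned long so the existing bit operations elsewhere keep working (the helper name is invented here):

	static void kvm_apic_latch_init(struct kvm_lapic *apic)
	{
		unsigned long e, old;

		/*
		 * Atomically latch INIT and discard any stale SIPI; if INIT
		 * is already pending, leave pending_events untouched (no-op).
		 */
		e = apic->pending_events;
		while (!(e & (1UL << KVM_APIC_INIT))) {
			old = e;
			e = cmpxchg(&apic->pending_events, old,
				    (old | (1UL << KVM_APIC_INIT)) &
				    ~(1UL << KVM_APIC_SIPI));
			if (e == old)
				break;
		}
	}

The APIC_DM_INIT branch would then call this instead of the plain set_bit(), so a SIPI that was queued before the INIT cannot survive it.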
On Tue, Mar 12, 2013 at 01:06:59PM +0100, Paolo Bonzini wrote: > > @@ -6171,6 +6169,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, > > int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > > struct kvm_mp_state *mp_state) > > { > > + kvm_apic_accept_events(vcpu); > > Is this racy against __vcpu_run? > FWIW both of them are vcpu ioctl and thus run under vcpu mutex. -- Gleb.
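For reference, the serialization Gleb points to comes from vcpu_load(), which every vcpu ioctl (KVM_RUN, KVM_GET_MP_STATE, ...) passes through in kvm_vcpu_ioctl(); abridged from the virt/kvm/kvm_main.c of that time (paraphrased from memory, check the actual tree):

	int vcpu_load(struct kvm_vcpu *vcpu)
	{
		int cpu;

		/* one vcpu ioctl at a time; KVM_RUN holds this throughout */
		if (mutex_lock_killable(&vcpu->mutex))
			return -EINTR;
		cpu = get_cpu();
		preempt_notifier_register(&vcpu->preempt_notifier);
		kvm_arch_vcpu_load(vcpu, cpu);
		put_cpu();
		return 0;
	}

So kvm_apic_accept_events() only ever runs with the vcpu mutex held, in the target vcpu's own ioctl context.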
On 12/03/2013 13:06, Paolo Bonzini wrote: > > @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > > int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, > > struct kvm_mp_state *mp_state) > > { > > - vcpu->arch.mp_state = mp_state->mp_state; > > + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { > > + if (!kvm_vcpu_has_lapic(vcpu)) > > + return -EINVAL; > > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > > + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); > > + } else > > + vcpu->arch.mp_state = mp_state->mp_state; > > Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? And since migration was brought up yesterday, do we need an interface to retrieve and set this? And should KVM_GET/SET_VCPU_EVENTS use the new sipi_vector in the APIC rather than the old one? Paolo
On 2013-03-12 13:29, Paolo Bonzini wrote: > Il 12/03/2013 13:06, Paolo Bonzini ha scritto: >>> @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, >>> int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, >>> struct kvm_mp_state *mp_state) >>> { >>> - vcpu->arch.mp_state = mp_state->mp_state; >>> + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { >>> + if (!kvm_vcpu_has_lapic(vcpu)) >>> + return -EINVAL; >>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>> + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); >>> + } else >>> + vcpu->arch.mp_state = mp_state->mp_state; >> >> Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? > > And since migration was brought up yesterday, do we need an interface to > retrieve and set this? > > And should KVM_GET/SET_VCPU_EVENTS use the new sipi_vector in the APIC > rather than the old one? I hope not. The idea is that the APIC events are processed before the migration completes. Translating events on get_mpstate should ensure this. Jan
On Tue, Mar 12, 2013 at 01:46:53PM +0100, Jan Kiszka wrote: > On 2013-03-12 13:29, Paolo Bonzini wrote: > > On 12/03/2013 13:06, Paolo Bonzini wrote: > >>> @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > >>> int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, > >>> struct kvm_mp_state *mp_state) > >>> { > >>> - vcpu->arch.mp_state = mp_state->mp_state; > >>> + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { > >>> + if (!kvm_vcpu_has_lapic(vcpu)) > >>> + return -EINVAL; > >>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >>> + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); > >>> + } else > >>> + vcpu->arch.mp_state = mp_state->mp_state; > >> > >> Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? > > > > And since migration was brought up yesterday, do we need an interface to > > retrieve and set this? > > > > And should KVM_GET/SET_VCPU_EVENTS use the new sipi_vector in the APIC > > rather than the old one? > > I hope not. The idea is that the APIC events are processed before the > migration completes. Translating events on get_mpstate should ensure this. > But when you will add nested support it will not be that simple, so as part of migration with nested guests we will have to transfer pending_events too instead of processing them on (set|get)_mpstate. -- Gleb.
On 2013-03-12 13:49, Gleb Natapov wrote: > On Tue, Mar 12, 2013 at 01:46:53PM +0100, Jan Kiszka wrote: >> On 2013-03-12 13:29, Paolo Bonzini wrote: >>> Il 12/03/2013 13:06, Paolo Bonzini ha scritto: >>>>> @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, >>>>> int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, >>>>> struct kvm_mp_state *mp_state) >>>>> { >>>>> - vcpu->arch.mp_state = mp_state->mp_state; >>>>> + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { >>>>> + if (!kvm_vcpu_has_lapic(vcpu)) >>>>> + return -EINVAL; >>>>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>>>> + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); >>>>> + } else >>>>> + vcpu->arch.mp_state = mp_state->mp_state; >>>> >>>> Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? >>> >>> And since migration was brought up yesterday, do we need an interface to >>> retrieve and set this? >>> >>> And should KVM_GET/SET_VCPU_EVENTS use the new sipi_vector in the APIC >>> rather than the old one? >> >> I hope not. The idea is that the APIC events are processed before the >> migration completes. Translating events on get_mpstate should ensure this. >> > But when you will add nested support it will not be that simple, so as > part of migration with nested guests we will have to transfer > pending_events too instead of processing then on (set|get)_mpstate. Right, but that can then easily become part of the to-be-defined nested vcpu state (which is likely more than vmcs12). Jan
On Tue, Mar 12, 2013 at 01:52:00PM +0100, Jan Kiszka wrote: > On 2013-03-12 13:49, Gleb Natapov wrote: > > On Tue, Mar 12, 2013 at 01:46:53PM +0100, Jan Kiszka wrote: > >> On 2013-03-12 13:29, Paolo Bonzini wrote: > >>> On 12/03/2013 13:06, Paolo Bonzini wrote: > >>>>> @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > >>>>> int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, > >>>>> struct kvm_mp_state *mp_state) > >>>>> { > >>>>> - vcpu->arch.mp_state = mp_state->mp_state; > >>>>> + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { > >>>>> + if (!kvm_vcpu_has_lapic(vcpu)) > >>>>> + return -EINVAL; > >>>>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >>>>> + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); > >>>>> + } else > >>>>> + vcpu->arch.mp_state = mp_state->mp_state; > >>>> > >>>> Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? > >>> > >>> And since migration was brought up yesterday, do we need an interface to > >>> retrieve and set this? > >>> > >>> And should KVM_GET/SET_VCPU_EVENTS use the new sipi_vector in the APIC > >>> rather than the old one? > >> > >> I hope not. The idea is that the APIC events are processed before the > >> migration completes. Translating events on get_mpstate should ensure this. > >> > > But when you will add nested support it will not be that simple, so as > > part of migration with nested guests we will have to transfer > > pending_events too instead of processing them on (set|get)_mpstate. > > Right, but that can then easily become part of the to-be-defined nested > vcpu state (which is likely more than vmcs12). > Yes, I am not saying we have to implement it right away. -- Gleb.
On 2013-03-12 13:06, Paolo Bonzini wrote: > Il 12/03/2013 12:44, Jan Kiszka ha scritto: >> A VCPU sending INIT or SIPI to some other VCPU races for setting the >> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED >> was overwritten by kvm_emulate_halt and, thus, got lost. >> >> This introduces APIC events for those two signals, keeping them in >> kvm_apic until kvm_apic_accept_events is run over the target vcpu >> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there >> are pending events, thus if vcpu blocking should end. >> >> The patch comes with the side effect of effectively obsoleting >> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but >> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. >> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED >> state. That also means we no longer exit to user space after receiving a >> SIPI event. >> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >> --- >> >> This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving >> this honor to Paolo. >> >> I didn't try porting any INIT handling for nested on top yet but I >> think it should be feasible - once we know their semantics for sure, at >> least on Intel... >> >> arch/x86/include/asm/kvm_host.h | 1 + >> arch/x86/kvm/lapic.c | 41 +++++++++++++++++++++++++++++++------- >> arch/x86/kvm/lapic.h | 7 ++++++ >> arch/x86/kvm/x86.c | 39 ++++++++++++++++++++---------------- >> 4 files changed, 63 insertions(+), 25 deletions(-) >> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >> index 348d859..2d28e76 100644 >> --- a/arch/x86/include/asm/kvm_host.h >> +++ b/arch/x86/include/asm/kvm_host.h >> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); >> int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); >> int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); >> int kvm_cpu_get_interrupt(struct kvm_vcpu *v); >> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); >> >> void kvm_define_shared_msr(unsigned index, u32 msr); >> void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >> index 02b51dd..4a21a6b 100644 >> --- a/arch/x86/kvm/lapic.c >> +++ b/arch/x86/kvm/lapic.c >> @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >> case APIC_DM_INIT: >> if (!trig_mode || level) { >> result = 1; >> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >> + set_bit(KVM_APIC_INIT, &apic->pending_events); > > I think this should clear pending SIPIs, unless KVM_APIC_INIT was > already set in which case it should be a no-op. Something like: > > e = apic->pending_events; > while (!(e & KVM_APIC_INIT)) > e = cmpxchg(&apic->pending_events, e, > (e | KVM_APIC_INIT) & ~KVM_APIC_SIPI); > > If you do this, better make pending_events an atomic_t. OK (looks ugly but necessary). 
> >> kvm_make_request(KVM_REQ_EVENT, vcpu); >> kvm_vcpu_kick(vcpu); >> } else { >> @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >> case APIC_DM_STARTUP: >> apic_debug("SIPI to vcpu %d vector 0x%02x\n", >> vcpu->vcpu_id, vector); >> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >> - result = 1; >> - vcpu->arch.sipi_vector = vector; >> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; >> - kvm_make_request(KVM_REQ_EVENT, vcpu); >> - kvm_vcpu_kick(vcpu); >> - } >> + result = 1; >> + apic->sipi_vector = vector; >> + /* make sure sipi_vector is visible for the receiver */ >> + smp_wmb(); >> + set_bit(KVM_APIC_SIPI, &apic->pending_events); >> + kvm_make_request(KVM_REQ_EVENT, vcpu); >> + kvm_vcpu_kick(vcpu); >> break; >> >> case APIC_DM_EXTINT: >> @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) >> addr); >> } >> >> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu) >> +{ >> + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; >> +} >> + >> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) >> +{ >> + struct kvm_lapic *apic = vcpu->arch.apic; >> + >> + if (!kvm_vcpu_has_lapic(vcpu)) >> + return; >> + >> + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) >> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >> + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && >> + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >> + vcpu->arch.sipi_vector = apic->sipi_vector; >> + pr_debug("vcpu %d received sipi with vector # %x\n", >> + vcpu->vcpu_id, vcpu->arch.sipi_vector); >> + kvm_lapic_reset(vcpu); >> + kvm_vcpu_reset(vcpu); >> + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; >> + } >> +} >> + >> void kvm_lapic_init(void) >> { >> /* do not patch jump label more than once per second */ >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h >> index 1676d34..ef3f4ef 100644 >> --- a/arch/x86/kvm/lapic.h >> +++ b/arch/x86/kvm/lapic.h >> @@ -5,6 +5,9 @@ >> >> #include <linux/kvm_host.h> >> >> +#define KVM_APIC_INIT 0 >> +#define KVM_APIC_SIPI 1 >> + >> struct kvm_timer { >> struct hrtimer timer; >> s64 period; /* unit: ns */ >> @@ -32,13 +35,17 @@ struct kvm_lapic { >> void *regs; >> gpa_t vapic_addr; >> struct page *vapic_page; >> + unsigned long pending_events; >> + unsigned int sipi_vector; >> }; >> int kvm_create_lapic(struct kvm_vcpu *vcpu); >> void kvm_free_lapic(struct kvm_vcpu *vcpu); >> >> int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); >> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu); >> int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); >> int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); >> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); >> void kvm_lapic_reset(struct kvm_vcpu *vcpu); >> u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); >> void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index b891ac3..a0b8041 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; >> >> static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); >> >> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); >> - >> static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) >> { >> int i; >> @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) >> bool req_immediate_exit = 0; >> >> if (vcpu->requests) { >> + kvm_apic_accept_events(vcpu); >> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 
{ >> + r = 1; >> + goto out; >> + } >> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) >> kvm_mmu_unload(vcpu); >> if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) >> @@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> int r; >> struct kvm *kvm = vcpu->kvm; >> >> - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { >> - pr_debug("vcpu %d received sipi with vector # %x\n", >> - vcpu->vcpu_id, vcpu->arch.sipi_vector); >> - kvm_lapic_reset(vcpu); >> - kvm_vcpu_reset(vcpu); >> - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; >> - } >> - >> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); >> r = vapic_enter(vcpu); >> if (r) { >> @@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); >> kvm_vcpu_block(vcpu); >> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); >> - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) >> - { >> + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { >> + kvm_apic_accept_events(vcpu); >> switch(vcpu->arch.mp_state) { >> case KVM_MP_STATE_HALTED: >> vcpu->arch.mp_state = >> @@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> case KVM_MP_STATE_RUNNABLE: >> vcpu->arch.apf.halted = false; >> break; >> - case KVM_MP_STATE_SIPI_RECEIVED: >> + case KVM_MP_STATE_INIT_RECEIVED: >> + break; >> default: >> r = -EINTR; >> break; >> @@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> ++vcpu->stat.request_irq_exits; >> } >> >> - kvm_check_async_pf_completion(vcpu); >> + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) >> + kvm_check_async_pf_completion(vcpu); > > Can you instead do another change that affects the conditions of > kvm_check_async_pf_completion? > > For example, should kvm_arch_interrupt_allowed return zero if the VCPU > is in the INIT_RECEIVED state? Yeah, that probably makes sense beyond async_pf. > >> >> if (signal_pending(current)) { >> r = -EINTR; >> @@ -6015,6 +6012,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) >> >> if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { >> kvm_vcpu_block(vcpu); >> + kvm_apic_accept_events(vcpu); >> clear_bit(KVM_REQ_UNHALT, &vcpu->requests); >> r = -EAGAIN; >> goto out; >> @@ -6171,6 +6169,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, >> int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, >> struct kvm_mp_state *mp_state) >> { >> + kvm_apic_accept_events(vcpu); > > Is this racy against __vcpu_run? No, as Gleb already explained. > >> mp_state->mp_state = vcpu->arch.mp_state; >> return 0; >> } >> @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, >> int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, >> struct kvm_mp_state *mp_state) >> { >> - vcpu->arch.mp_state = mp_state->mp_state; >> + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { >> + if (!kvm_vcpu_has_lapic(vcpu)) >> + return -EINVAL; >> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >> + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); >> + } else >> + vcpu->arch.mp_state = mp_state->mp_state; > > Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? Good question. But, then, how do we reset those CPUs (BSPs only)? Jan
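A sketch of the kvm_arch_interrupt_allowed change Paolo floats above (not part of the posted patch; in the tree of that time the function simply forwards to the vendor hook):

	int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
	{
		/* a CPU in wait-for-SIPI state must not take interrupts */
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
			return 0;
		return kvm_x86_ops->interrupt_allowed(vcpu);
	}

The async-PF completion path could then be gated on kvm_arch_interrupt_allowed() instead of the raw vendor hook, making the explicit mp_state test in __vcpu_run unnecessary.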
On 2013-03-12 13:58, Jan Kiszka wrote: > On 2013-03-12 13:06, Paolo Bonzini wrote: >> Il 12/03/2013 12:44, Jan Kiszka ha scritto: >>> A VCPU sending INIT or SIPI to some other VCPU races for setting the >>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED >>> was overwritten by kvm_emulate_halt and, thus, got lost. >>> >>> This introduces APIC events for those two signals, keeping them in >>> kvm_apic until kvm_apic_accept_events is run over the target vcpu >>> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there >>> are pending events, thus if vcpu blocking should end. >>> >>> The patch comes with the side effect of effectively obsoleting >>> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but >>> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. >>> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED >>> state. That also means we no longer exit to user space after receiving a >>> SIPI event. >>> >>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>> --- >>> >>> This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving >>> this honor to Paolo. >>> >>> I didn't try porting any INIT handling for nested on top yet but I >>> think it should be feasible - once we know their semantics for sure, at >>> least on Intel... >>> >>> arch/x86/include/asm/kvm_host.h | 1 + >>> arch/x86/kvm/lapic.c | 41 +++++++++++++++++++++++++++++++------- >>> arch/x86/kvm/lapic.h | 7 ++++++ >>> arch/x86/kvm/x86.c | 39 ++++++++++++++++++++---------------- >>> 4 files changed, 63 insertions(+), 25 deletions(-) >>> >>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>> index 348d859..2d28e76 100644 >>> --- a/arch/x86/include/asm/kvm_host.h >>> +++ b/arch/x86/include/asm/kvm_host.h >>> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); >>> int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); >>> int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); >>> int kvm_cpu_get_interrupt(struct kvm_vcpu *v); >>> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); >>> >>> void kvm_define_shared_msr(unsigned index, u32 msr); >>> void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); >>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >>> index 02b51dd..4a21a6b 100644 >>> --- a/arch/x86/kvm/lapic.c >>> +++ b/arch/x86/kvm/lapic.c >>> @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >>> case APIC_DM_INIT: >>> if (!trig_mode || level) { >>> result = 1; >>> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>> + set_bit(KVM_APIC_INIT, &apic->pending_events); >> >> I think this should clear pending SIPIs, unless KVM_APIC_INIT was >> already set in which case it should be a no-op. Something like: >> >> e = apic->pending_events; >> while (!(e & KVM_APIC_INIT)) >> e = cmpxchg(&apic->pending_events, e, >> (e | KVM_APIC_INIT) & ~KVM_APIC_SIPI); >> >> If you do this, better make pending_events an atomic_t. > > OK (looks ugly but necessary). 
> >> >>> kvm_make_request(KVM_REQ_EVENT, vcpu); >>> kvm_vcpu_kick(vcpu); >>> } else { >>> @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >>> case APIC_DM_STARTUP: >>> apic_debug("SIPI to vcpu %d vector 0x%02x\n", >>> vcpu->vcpu_id, vector); >>> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >>> - result = 1; >>> - vcpu->arch.sipi_vector = vector; >>> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; >>> - kvm_make_request(KVM_REQ_EVENT, vcpu); >>> - kvm_vcpu_kick(vcpu); >>> - } >>> + result = 1; >>> + apic->sipi_vector = vector; >>> + /* make sure sipi_vector is visible for the receiver */ >>> + smp_wmb(); >>> + set_bit(KVM_APIC_SIPI, &apic->pending_events); >>> + kvm_make_request(KVM_REQ_EVENT, vcpu); >>> + kvm_vcpu_kick(vcpu); >>> break; >>> >>> case APIC_DM_EXTINT: >>> @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) >>> addr); >>> } >>> >>> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu) >>> +{ >>> + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; >>> +} >>> + >>> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) >>> +{ >>> + struct kvm_lapic *apic = vcpu->arch.apic; >>> + >>> + if (!kvm_vcpu_has_lapic(vcpu)) >>> + return; >>> + >>> + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) >>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>> + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && >>> + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >>> + vcpu->arch.sipi_vector = apic->sipi_vector; >>> + pr_debug("vcpu %d received sipi with vector # %x\n", >>> + vcpu->vcpu_id, vcpu->arch.sipi_vector); >>> + kvm_lapic_reset(vcpu); >>> + kvm_vcpu_reset(vcpu); >>> + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; >>> + } >>> +} >>> + >>> void kvm_lapic_init(void) >>> { >>> /* do not patch jump label more than once per second */ >>> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h >>> index 1676d34..ef3f4ef 100644 >>> --- a/arch/x86/kvm/lapic.h >>> +++ b/arch/x86/kvm/lapic.h >>> @@ -5,6 +5,9 @@ >>> >>> #include <linux/kvm_host.h> >>> >>> +#define KVM_APIC_INIT 0 >>> +#define KVM_APIC_SIPI 1 >>> + >>> struct kvm_timer { >>> struct hrtimer timer; >>> s64 period; /* unit: ns */ >>> @@ -32,13 +35,17 @@ struct kvm_lapic { >>> void *regs; >>> gpa_t vapic_addr; >>> struct page *vapic_page; >>> + unsigned long pending_events; >>> + unsigned int sipi_vector; >>> }; >>> int kvm_create_lapic(struct kvm_vcpu *vcpu); >>> void kvm_free_lapic(struct kvm_vcpu *vcpu); >>> >>> int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); >>> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu); >>> int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); >>> int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); >>> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); >>> void kvm_lapic_reset(struct kvm_vcpu *vcpu); >>> u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); >>> void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >>> index b891ac3..a0b8041 100644 >>> --- a/arch/x86/kvm/x86.c >>> +++ b/arch/x86/kvm/x86.c >>> @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; >>> >>> static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); >>> >>> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); >>> - >>> static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) >>> { >>> int i; >>> @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) >>> bool req_immediate_exit = 0; >>> >>> if 
(vcpu->requests) { >>> + kvm_apic_accept_events(vcpu); >>> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >>> + r = 1; >>> + goto out; >>> + } >>> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) >>> kvm_mmu_unload(vcpu); >>> if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) >>> @@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >>> int r; >>> struct kvm *kvm = vcpu->kvm; >>> >>> - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { >>> - pr_debug("vcpu %d received sipi with vector # %x\n", >>> - vcpu->vcpu_id, vcpu->arch.sipi_vector); >>> - kvm_lapic_reset(vcpu); >>> - kvm_vcpu_reset(vcpu); >>> - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; >>> - } >>> - >>> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); >>> r = vapic_enter(vcpu); >>> if (r) { >>> @@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >>> srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); >>> kvm_vcpu_block(vcpu); >>> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); >>> - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) >>> - { >>> + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { >>> + kvm_apic_accept_events(vcpu); >>> switch(vcpu->arch.mp_state) { >>> case KVM_MP_STATE_HALTED: >>> vcpu->arch.mp_state = >>> @@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >>> case KVM_MP_STATE_RUNNABLE: >>> vcpu->arch.apf.halted = false; >>> break; >>> - case KVM_MP_STATE_SIPI_RECEIVED: >>> + case KVM_MP_STATE_INIT_RECEIVED: >>> + break; >>> default: >>> r = -EINTR; >>> break; >>> @@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >>> ++vcpu->stat.request_irq_exits; >>> } >>> >>> - kvm_check_async_pf_completion(vcpu); >>> + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) >>> + kvm_check_async_pf_completion(vcpu); >> >> Can you instead do another change that affects the conditions of >> kvm_check_async_pf_completion? >> >> For example, should kvm_arch_interrupt_allowed return zero if the VCPU >> is in the INIT_RECEIVED state? > > Yeah, that probably makes sense beyond async_pf. Wait: If you perform a proper reset on INIT already, we would clear IF thus prevent also async_pf injections. On the other hand, kvm_arch_can_inject_async_page_present returns true if apf.msr_val & KVM_ASYNC_PF_ENABLED is not set - shouldn't that be cleared on reset as well? Hmm... Jan
On 12/03/2013 13:46, Jan Kiszka wrote: >>>> >>> @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, >>>> >>> int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, >>>> >>> struct kvm_mp_state *mp_state) >>>> >>> { >>>> >>> - vcpu->arch.mp_state = mp_state->mp_state; >>>> >>> + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { >>>> >>> + if (!kvm_vcpu_has_lapic(vcpu)) >>>> >>> + return -EINVAL; >>>> >>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>>> >>> + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); >>>> >>> + } else >>>> >>> + vcpu->arch.mp_state = mp_state->mp_state; >>> >> >>> >> Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? >> > >> > And since migration was brought up yesterday, do we need an interface to >> > retrieve and set this? >> > >> > And should KVM_GET/SET_VCPU_EVENTS use the new sipi_vector in the APIC >> > rather than the old one? > I hope not. The idea is that the APIC events are processed before the > migration completes. Translating events on get_mpstate should ensure this. What about persistent state such as "an INIT has been received and caused a vmexit, but is being latched until vmxoff"? Perhaps we could use the top 8 bits of vcpu->arch.sipi_vector for this, they are always shifted out when sipi_vector is used, and should be zero in all cases. It would then be possible to reuse KVM_GET/SET_VCPU_EVENTS. Migration support for nested VMX is still far far away, so perhaps we do not care, but we still need a way to inject INIT from userspace. Paolo
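Spelled out, the packing idea might look like this (purely hypothetical names; nothing of the sort exists in the tree):

	/* hypothetical layout of vcpu->arch.sipi_vector */
	#define KVM_SIPI_VECTOR_MASK	0xffu		/* architectural 8-bit vector */
	#define KVM_VCPU_INIT_LATCHED	(1u << 24)	/* one of the top 8 bits */

	/* remember an INIT that is latched until vmxoff */
	vcpu->arch.sipi_vector |= KVM_VCPU_INIT_LATCHED;

	/* consumers keep masking the low byte as they do today */
	u8 vector = vcpu->arch.sipi_vector & KVM_SIPI_VECTOR_MASK;

Since KVM_GET/SET_VCPU_EVENTS already transports sipi_vector as a 32-bit field, the latch bit would migrate for free.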
On Tue, Mar 12, 2013 at 12:44:41PM +0100, Jan Kiszka wrote: > A VCPU sending INIT or SIPI to some other VCPU races for setting the > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > was overwritten by kvm_emulate_halt and, thus, got lost. > > This introduces APIC events for those two signals, keeping them in > kvm_apic until kvm_apic_accept_events is run over the target vcpu > context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there > are pending events, thus if vcpu blocking should end. > > The patch comes with the side effect of effectively obsoleting > KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but > immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. > The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED > state. That also means we no longer exit to user space after receiving a > SIPI event. > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > --- > > This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving > this honor to Paolo. > > I didn't try porting any INIT handling for nested on top yet but I > think it should be feasible - once we know their semantics for sure, at > least on Intel... > > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/lapic.c | 41 +++++++++++++++++++++++++++++++------- > arch/x86/kvm/lapic.h | 7 ++++++ > arch/x86/kvm/x86.c | 39 ++++++++++++++++++++---------------- > 4 files changed, 63 insertions(+), 25 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 348d859..2d28e76 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); > int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); > int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); > int kvm_cpu_get_interrupt(struct kvm_vcpu *v); > +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > > void kvm_define_shared_msr(unsigned index, u32 msr); > void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 02b51dd..4a21a6b 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_INIT: > if (!trig_mode || level) { > result = 1; > - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + set_bit(KVM_APIC_INIT, &apic->pending_events); > kvm_make_request(KVM_REQ_EVENT, vcpu); > kvm_vcpu_kick(vcpu); > } else { > @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_STARTUP: > apic_debug("SIPI to vcpu %d vector 0x%02x\n", > vcpu->vcpu_id, vector); > - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > - result = 1; > - vcpu->arch.sipi_vector = vector; > - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > - kvm_make_request(KVM_REQ_EVENT, vcpu); > - kvm_vcpu_kick(vcpu); > - } > + result = 1; > + apic->sipi_vector = vector; > + /* make sure sipi_vector is visible for the receiver */ > + smp_wmb(); > + set_bit(KVM_APIC_SIPI, &apic->pending_events); > + kvm_make_request(KVM_REQ_EVENT, vcpu); > + kvm_vcpu_kick(vcpu); > break; > > case APIC_DM_EXTINT: > @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) > addr); > } > > +bool kvm_apic_has_events(struct kvm_vcpu *vcpu) > +{ > + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; The function is called only from 
kvm_arch_vcpu_runnable() and kvm_arch_vcpu_runnable() is called only if irqchip is in kernel, so I think kvm_vcpu_has_lapic() can be dropped. > +} > + > +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + > + if (!kvm_vcpu_has_lapic(vcpu)) > + return; > + > + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && > + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + vcpu->arch.sipi_vector = apic->sipi_vector; > + pr_debug("vcpu %d received sipi with vector # %x\n", > + vcpu->vcpu_id, vcpu->arch.sipi_vector); > + kvm_lapic_reset(vcpu); > + kvm_vcpu_reset(vcpu); Shouldn't we reset on KVM_APIC_INIT? > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > + } > +} > + > void kvm_lapic_init(void) > { > /* do not patch jump label more than once per second */ > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 1676d34..ef3f4ef 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -5,6 +5,9 @@ > > #include <linux/kvm_host.h> > > +#define KVM_APIC_INIT 0 > +#define KVM_APIC_SIPI 1 > + > struct kvm_timer { > struct hrtimer timer; > s64 period; /* unit: ns */ > @@ -32,13 +35,17 @@ struct kvm_lapic { > void *regs; > gpa_t vapic_addr; > struct page *vapic_page; > + unsigned long pending_events; > + unsigned int sipi_vector; > }; > int kvm_create_lapic(struct kvm_vcpu *vcpu); > void kvm_free_lapic(struct kvm_vcpu *vcpu); > > int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); > +bool kvm_apic_has_events(struct kvm_vcpu *vcpu); > int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); > int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); > +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); > void kvm_lapic_reset(struct kvm_vcpu *vcpu); > u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); > void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index b891ac3..a0b8041 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; > > static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); > > -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > - > static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) > { > int i; > @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > bool req_immediate_exit = 0; > > if (vcpu->requests) { > + kvm_apic_accept_events(vcpu); > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + r = 1; > + goto out; > + } Why not call it under kvm_check_request(KVM_REQ_EVENT, vcpu) below?
> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) > kvm_mmu_unload(vcpu); > if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) > @@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > int r; > struct kvm *kvm = vcpu->kvm; > > - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { > - pr_debug("vcpu %d received sipi with vector # %x\n", > - vcpu->vcpu_id, vcpu->arch.sipi_vector); > - kvm_lapic_reset(vcpu); > - kvm_vcpu_reset(vcpu); > - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > - } > - > vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); > r = vapic_enter(vcpu); > if (r) { > @@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); > kvm_vcpu_block(vcpu); > vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); > - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) > - { > + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { > + kvm_apic_accept_events(vcpu); > switch(vcpu->arch.mp_state) { > case KVM_MP_STATE_HALTED: > vcpu->arch.mp_state = > @@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > case KVM_MP_STATE_RUNNABLE: > vcpu->arch.apf.halted = false; > break; > - case KVM_MP_STATE_SIPI_RECEIVED: > + case KVM_MP_STATE_INIT_RECEIVED: > + break; > default: > r = -EINTR; > break; > @@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > ++vcpu->stat.request_irq_exits; > } > > - kvm_check_async_pf_completion(vcpu); > + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) > + kvm_check_async_pf_completion(vcpu); Do not understand why is this 'if' needed. > > if (signal_pending(current)) { > r = -EINTR; > @@ -6015,6 +6012,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > > if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { > kvm_vcpu_block(vcpu); > + kvm_apic_accept_events(vcpu); > clear_bit(KVM_REQ_UNHALT, &vcpu->requests); > r = -EAGAIN; > goto out; > @@ -6171,6 +6169,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, > int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > struct kvm_mp_state *mp_state) > { > + kvm_apic_accept_events(vcpu); > mp_state->mp_state = vcpu->arch.mp_state; > return 0; > } > @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, > struct kvm_mp_state *mp_state) > { > - vcpu->arch.mp_state = mp_state->mp_state; > + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { > + if (!kvm_vcpu_has_lapic(vcpu)) > + return -EINVAL; > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); > + } else > + vcpu->arch.mp_state = mp_state->mp_state; > kvm_make_request(KVM_REQ_EVENT, vcpu); > return 0; > } > @@ -6515,7 +6520,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) > kvm_x86_ops->vcpu_free(vcpu); > } > > -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu) > +void kvm_vcpu_reset(struct kvm_vcpu *vcpu) > { > atomic_set(&vcpu->arch.nmi_queued, 0); > vcpu->arch.nmi_pending = 0; > @@ -6988,7 +6993,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) > return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && > !vcpu->arch.apf.halted) > || !list_empty_careful(&vcpu->async_pf.done) > - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED > + || kvm_apic_has_events(vcpu) > || atomic_read(&vcpu->arch.nmi_queued) || > (kvm_arch_interrupt_allowed(vcpu) && > kvm_cpu_has_interrupt(vcpu)); > -- > 1.7.3.4 -- Gleb. 
On 2013-03-12 14:09, Paolo Bonzini wrote: > Il 12/03/2013 13:46, Jan Kiszka ha scritto: >>>>>>>> @@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, >>>>>>>> int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, >>>>>>>> struct kvm_mp_state *mp_state) >>>>>>>> { >>>>>>>> - vcpu->arch.mp_state = mp_state->mp_state; >>>>>>>> + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { >>>>>>>> + if (!kvm_vcpu_has_lapic(vcpu)) >>>>>>>> + return -EINVAL; >>>>>>>> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >>>>>>>> + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); >>>>>>>> + } else >>>>>>>> + vcpu->arch.mp_state = mp_state->mp_state; >>>>>> >>>>>> Should INIT_RECEIVED also be invalid without an in-kernel LAPIC? >>>> >>>> And since migration was brought up yesterday, do we need an interface to >>>> retrieve and set this? >>>> >>>> And should KVM_GET/SET_VCPU_EVENTS use the new sipi_vector in the APIC >>>> rather than the old one? >> I hope not. The idea is that the APIC events are processed before the >> migration completes. Translating events on get_mpstate should ensure this. > > What about persistent state such as "an INIT has been received and > caused a vmexit, but is being latched until vmxoff"? Perhaps we could > use the top 8 bits of vcpu->arch.sipi_vector for this, they are always > shifted out when sipi_vector is used, and should be zero in all cases. > It would then be possible to reuse KVM_GET/SET_VCPU_EVENTS. See my other mail. > > Migration support for nested VMX is still far far away, so perhaps we do > not care, but we still need a way to inject INIT from userspace. Ack for INIT injection. In theory, you could use the VCPU event IOCTL, it is extensible. If that is elegant is another question. Jan
On 12/03/2013 14:01, Jan Kiszka wrote: >>> >> For example, should kvm_arch_interrupt_allowed return zero if the VCPU >>> >> is in the INIT_RECEIVED state? >> > >> > Yeah, that probably makes sense beyond async_pf. > Wait: If you perform a proper reset on INIT already, we would clear IF > thus prevent also async_pf injections. On the other hand, > kvm_arch_can_inject_async_page_present returns true if apf.msr_val & > KVM_ASYNC_PF_ENABLED is not set - shouldn't that be cleared on reset as > well? Hmm... So if you split the reset (which is done on INIT) from setting up CS on SIPI, you kill two birds with a stone, or at least one and a half: you clear IF as you wrote above, and INIT on BSP almost works (the only missing bit should be setting the mp_state to KVM_MP_STATE_RUNNABLE). Paolo
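A sketch of kvm_apic_accept_events with that split, i.e. the reset moves to INIT acceptance and SIPI only supplies the start vector (assumption: the BSP mp_state question raised earlier is left open):

	void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
	{
		struct kvm_lapic *apic = vcpu->arch.apic;

		if (!kvm_vcpu_has_lapic(vcpu))
			return;

		if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
			/* full reset already on INIT, clearing IF among other state */
			kvm_lapic_reset(vcpu);
			kvm_vcpu_reset(vcpu);
			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
		}
		if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
		    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			/* SIPI is now reduced to setting up the start vector */
			vcpu->arch.sipi_vector = apic->sipi_vector;
			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
		}
	}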
On 2013-03-12 14:13, Paolo Bonzini wrote: > Il 12/03/2013 14:01, Jan Kiszka ha scritto: >>>>>> For example, should kvm_arch_interrupt_allowed return zero if the VCPU >>>>>> is in the INIT_RECEIVED state? >>>> >>>> Yeah, that probably makes sense beyond async_pf. >> Wait: If you perform a proper reset on INIT already, we would clear IF >> thus prevent also async_pf injections. On the other hand, >> kvm_arch_can_inject_async_page_present returns true if apf.msr_val & >> KVM_ASYNC_PF_ENABLED is not set - shouldn't that be cleared on reset as >> well? Hmm... > > So if you split the reset (which is done on INIT) from setting up CS on > SIPI, you kill two birds with a stone, or at least one and a half: you > clear IF as you wrote above, and INIT on BSP almost works (the only > missing bit should be setting the mp_state to KVM_MP_STATE_RUNNABLE). ...unless the async_pf MSR is cleared on reset as well. Jan
On 2013-03-12 14:12, Gleb Natapov wrote: > On Tue, Mar 12, 2013 at 12:44:41PM +0100, Jan Kiszka wrote: >> A VCPU sending INIT or SIPI to some other VCPU races for setting the >> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED >> was overwritten by kvm_emulate_halt and, thus, got lost. >> >> This introduces APIC events for those two signals, keeping them in >> kvm_apic until kvm_apic_accept_events is run over the target vcpu >> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there >> are pending events, thus if vcpu blocking should end. >> >> The patch comes with the side effect of effectively obsoleting >> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but >> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. >> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED >> state. That also means we no longer exit to user space after receiving a >> SIPI event. >> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >> --- >> >> This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving >> this honor to Paolo. >> >> I didn't try porting any INIT handling for nested on top yet but I >> think it should be feasible - once we know their semantics for sure, at >> least on Intel... >> >> arch/x86/include/asm/kvm_host.h | 1 + >> arch/x86/kvm/lapic.c | 41 +++++++++++++++++++++++++++++++------- >> arch/x86/kvm/lapic.h | 7 ++++++ >> arch/x86/kvm/x86.c | 39 ++++++++++++++++++++---------------- >> 4 files changed, 63 insertions(+), 25 deletions(-) >> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >> index 348d859..2d28e76 100644 >> --- a/arch/x86/include/asm/kvm_host.h >> +++ b/arch/x86/include/asm/kvm_host.h >> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); >> int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); >> int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); >> int kvm_cpu_get_interrupt(struct kvm_vcpu *v); >> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); >> >> void kvm_define_shared_msr(unsigned index, u32 msr); >> void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >> index 02b51dd..4a21a6b 100644 >> --- a/arch/x86/kvm/lapic.c >> +++ b/arch/x86/kvm/lapic.c >> @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >> case APIC_DM_INIT: >> if (!trig_mode || level) { >> result = 1; >> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >> + set_bit(KVM_APIC_INIT, &apic->pending_events); >> kvm_make_request(KVM_REQ_EVENT, vcpu); >> kvm_vcpu_kick(vcpu); >> } else { >> @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >> case APIC_DM_STARTUP: >> apic_debug("SIPI to vcpu %d vector 0x%02x\n", >> vcpu->vcpu_id, vector); >> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >> - result = 1; >> - vcpu->arch.sipi_vector = vector; >> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; >> - kvm_make_request(KVM_REQ_EVENT, vcpu); >> - kvm_vcpu_kick(vcpu); >> - } >> + result = 1; >> + apic->sipi_vector = vector; >> + /* make sure sipi_vector is visible for the receiver */ >> + smp_wmb(); >> + set_bit(KVM_APIC_SIPI, &apic->pending_events); >> + kvm_make_request(KVM_REQ_EVENT, vcpu); >> + kvm_vcpu_kick(vcpu); >> break; >> >> case APIC_DM_EXTINT: >> @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) >> addr); >> } >> >> +bool kvm_apic_has_events(struct kvm_vcpu 
*vcpu) >> +{ >> + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; > The function is called only from kvm_arch_vcpu_runnable() and > kvm_arch_vcpu_runnable() is called only if irqchip is in kernel, so I > think kvm_vcpu_has_lapic() can be dropped. Likely, will check again. > >> +} >> + >> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) >> +{ >> + struct kvm_lapic *apic = vcpu->arch.apic; >> + >> + if (!kvm_vcpu_has_lapic(vcpu)) >> + return; >> + >> + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) >> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >> + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && >> + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >> + vcpu->arch.sipi_vector = apic->sipi_vector; >> + pr_debug("vcpu %d received sipi with vector # %x\n", >> + vcpu->vcpu_id, vcpu->arch.sipi_vector); >> + kvm_lapic_reset(vcpu); >> + kvm_vcpu_reset(vcpu); > Shouldn't we reset on KVM_APIC_INIT? It is reset a few lines above. Yes, there could be another INIT pending by now, but that is racing with SIPI anyway and could also arrive a few instruction later. So I don't think we need to worry. > >> + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; >> + } >> +} >> + >> void kvm_lapic_init(void) >> { >> /* do not patch jump label more than once per second */ >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h >> index 1676d34..ef3f4ef 100644 >> --- a/arch/x86/kvm/lapic.h >> +++ b/arch/x86/kvm/lapic.h >> @@ -5,6 +5,9 @@ >> >> #include <linux/kvm_host.h> >> >> +#define KVM_APIC_INIT 0 >> +#define KVM_APIC_SIPI 1 >> + >> struct kvm_timer { >> struct hrtimer timer; >> s64 period; /* unit: ns */ >> @@ -32,13 +35,17 @@ struct kvm_lapic { >> void *regs; >> gpa_t vapic_addr; >> struct page *vapic_page; >> + unsigned long pending_events; >> + unsigned int sipi_vector; >> }; >> int kvm_create_lapic(struct kvm_vcpu *vcpu); >> void kvm_free_lapic(struct kvm_vcpu *vcpu); >> >> int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); >> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu); >> int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); >> int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); >> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); >> void kvm_lapic_reset(struct kvm_vcpu *vcpu); >> u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); >> void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index b891ac3..a0b8041 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; >> >> static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); >> >> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); >> - >> static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) >> { >> int i; >> @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) >> bool req_immediate_exit = 0; >> >> if (vcpu->requests) { >> + kvm_apic_accept_events(vcpu); >> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { >> + r = 1; >> + goto out; >> + } > Why not call it under kvm_check_request(KVM_REQ_EVENT, vcpu) bellow? I probably had the desire to handle this event and potential exit reason early. Not sure if it makes any difference. 
> >> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) >> kvm_mmu_unload(vcpu); >> if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) >> @@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> int r; >> struct kvm *kvm = vcpu->kvm; >> >> - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { >> - pr_debug("vcpu %d received sipi with vector # %x\n", >> - vcpu->vcpu_id, vcpu->arch.sipi_vector); >> - kvm_lapic_reset(vcpu); >> - kvm_vcpu_reset(vcpu); >> - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; >> - } >> - >> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); >> r = vapic_enter(vcpu); >> if (r) { >> @@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); >> kvm_vcpu_block(vcpu); >> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); >> - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) >> - { >> + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { >> + kvm_apic_accept_events(vcpu); >> switch(vcpu->arch.mp_state) { >> case KVM_MP_STATE_HALTED: >> vcpu->arch.mp_state = >> @@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> case KVM_MP_STATE_RUNNABLE: >> vcpu->arch.apf.halted = false; >> break; >> - case KVM_MP_STATE_SIPI_RECEIVED: >> + case KVM_MP_STATE_INIT_RECEIVED: >> + break; >> default: >> r = -EINTR; >> break; >> @@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) >> ++vcpu->stat.request_irq_exits; >> } >> >> - kvm_check_async_pf_completion(vcpu); >> + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) >> + kvm_check_async_pf_completion(vcpu); > Do not understand why is this 'if' needed. To prevent spurious async-pf event injection and mp-state corruption. See the other mail thread with Paolo on how to avoid this, potentially. Jan
On Tue, Mar 12, 2013 at 02:16:29PM +0100, Jan Kiszka wrote: > On 2013-03-12 14:13, Paolo Bonzini wrote: > > Il 12/03/2013 14:01, Jan Kiszka ha scritto: > >>>>>> For example, should kvm_arch_interrupt_allowed return zero if the VCPU > >>>>>> is in the INIT_RECEIVED state? > >>>> > >>>> Yeah, that probably makes sense beyond async_pf. > >> Wait: If you perform a proper reset on INIT already, we would clear IF > >> thus prevent also async_pf injections. On the other hand, > >> kvm_arch_can_inject_async_page_present returns true if apf.msr_val & > >> KVM_ASYNC_PF_ENABLED is not set - shouldn't that be cleared on reset as > >> well? Hmm... > > > > So if you split the reset (which is done on INIT) from setting up CS on > > SIPI, you kill two birds with a stone, or at least one and a half: you > > clear IF as you wrote above, and INIT on BSP almost works (the only > > missing bit should be setting the mp_state to KVM_MP_STATE_RUNNABLE). > > ...unless the async_pf MSR is cleared on reset as well. > It is: vcpu->arch.apf.msr_val = 0; -- Gleb.
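For readers following the kvm_arch_interrupt_allowed idea quoted above: the change being floated would amount to roughly the sketch below. The mp_state check is the hypothetical addition; the fall-through to kvm_x86_ops->interrupt_allowed() is how the helper was already implemented in x86.c at the time.

int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	/* hypothetical addition: no event injection while the vcpu
	 * is parked in the wait-for-SIPI state */
	if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
		return 0;

	return kvm_x86_ops->interrupt_allowed(vcpu);
}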
On 2013-03-12 14:25, Gleb Natapov wrote: > On Tue, Mar 12, 2013 at 02:16:29PM +0100, Jan Kiszka wrote: >> On 2013-03-12 14:13, Paolo Bonzini wrote: >>> Il 12/03/2013 14:01, Jan Kiszka ha scritto: >>>>>>>> For example, should kvm_arch_interrupt_allowed return zero if the VCPU >>>>>>>> is in the INIT_RECEIVED state? >>>>>> >>>>>> Yeah, that probably makes sense beyond async_pf. >>>> Wait: If you perform a proper reset on INIT already, we would clear IF >>>> thus prevent also async_pf injections. On the other hand, >>>> kvm_arch_can_inject_async_page_present returns true if apf.msr_val & >>>> KVM_ASYNC_PF_ENABLED is not set - shouldn't that be cleared on reset as >>>> well? Hmm... >>> >>> So if you split the reset (which is done on INIT) from setting up CS on >>> SIPI, you kill two birds with a stone, or at least one and a half: you >>> clear IF as you wrote above, and INIT on BSP almost works (the only >>> missing bit should be setting the mp_state to KVM_MP_STATE_RUNNABLE). >> >> ...unless the async_pf MSR is cleared on reset as well. >> > It is: > vcpu->arch.apf.msr_val = 0; Can you explain the "if !(apf.msr_val & KVM_ASYNC_PF_ENABLED) return true;" in kvm_arch_can_inject_async_page_present? Jan
On Tue, Mar 12, 2013 at 02:27:11PM +0100, Jan Kiszka wrote: > On 2013-03-12 14:25, Gleb Natapov wrote: > > On Tue, Mar 12, 2013 at 02:16:29PM +0100, Jan Kiszka wrote: > >> On 2013-03-12 14:13, Paolo Bonzini wrote: > >>> Il 12/03/2013 14:01, Jan Kiszka ha scritto: > >>>>>>>> For example, should kvm_arch_interrupt_allowed return zero if the VCPU > >>>>>>>> is in the INIT_RECEIVED state? > >>>>>> > >>>>>> Yeah, that probably makes sense beyond async_pf. > >>>> Wait: If you perform a proper reset on INIT already, we would clear IF > >>>> thus prevent also async_pf injections. On the other hand, > >>>> kvm_arch_can_inject_async_page_present returns true if apf.msr_val & > >>>> KVM_ASYNC_PF_ENABLED is not set - shouldn't that be cleared on reset as > >>>> well? Hmm... > >>> > >>> So if you split the reset (which is done on INIT) from setting up CS on > >>> SIPI, you kill two birds with a stone, or at least one and a half: you > >>> clear IF as you wrote above, and INIT on BSP almost works (the only > >>> missing bit should be setting the mp_state to KVM_MP_STATE_RUNNABLE). > >> > >> ...unless the async_pf MSR is cleared on reset as well. > >> > > It is: > > vcpu->arch.apf.msr_val = 0; > > Can you explain the "if !(apf.msr_val & KVM_ASYNC_PF_ENABLED) return > true;" in kvm_arch_can_inject_async_page_present? > APF works even with non-PV guests. If a non-PV guest accesses a swapped-out page, the page is swapped in asynchronously and the vcpu enters the apf.halted state. In this state it can still receive interrupts. Completion of the apf causes the vcpu to exit the apf.halted state. On vcpu reset we call kvm_clear_async_pf_completion_queue(), so vcpu->async_pf.done will be empty and kvm_check_async_pf_completion() will be a nop if the vcpu is properly reset on INIT. -- Gleb.
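For reference, the check Jan quotes reads roughly as follows; this is a sketch reconstructed from the x86.c of that era, not part of the posted patch:

bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
{
	/* non-PV guest: a "page present" completion injects nothing,
	 * it merely lifts apf.halted, so it is always allowed */
	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
		return true;

	/* PV guest: the completion is delivered as an event, so an
	 * injection slot must actually be available right now */
	return !kvm_event_needs_reinjection(vcpu) &&
		kvm_x86_ops->interrupt_allowed(vcpu);
}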
On Tue, Mar 12, 2013 at 02:21:17PM +0100, Jan Kiszka wrote: > On 2013-03-12 14:12, Gleb Natapov wrote: > > On Tue, Mar 12, 2013 at 12:44:41PM +0100, Jan Kiszka wrote: > >> A VCPU sending INIT or SIPI to some other VCPU races for setting the > >> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > >> was overwritten by kvm_emulate_halt and, thus, got lost. > >> > >> This introduces APIC events for those two signals, keeping them in > >> kvm_apic until kvm_apic_accept_events is run over the target vcpu > >> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there > >> are pending events, thus if vcpu blocking should end. > >> > >> The patch comes with the side effect of effectively obsoleting > >> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but > >> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. > >> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED > >> state. That also means we no longer exit to user space after receiving a > >> SIPI event. > >> > >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > >> --- > >> > >> This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving > >> this honor to Paolo. > >> > >> I didn't try porting any INIT handling for nested on top yet but I > >> think it should be feasible - once we know their semantics for sure, at > >> least on Intel... > >> > >> arch/x86/include/asm/kvm_host.h | 1 + > >> arch/x86/kvm/lapic.c | 41 +++++++++++++++++++++++++++++++------- > >> arch/x86/kvm/lapic.h | 7 ++++++ > >> arch/x86/kvm/x86.c | 39 ++++++++++++++++++++---------------- > >> 4 files changed, 63 insertions(+), 25 deletions(-) > >> > >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > >> index 348d859..2d28e76 100644 > >> --- a/arch/x86/include/asm/kvm_host.h > >> +++ b/arch/x86/include/asm/kvm_host.h > >> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); > >> int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); > >> int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); > >> int kvm_cpu_get_interrupt(struct kvm_vcpu *v); > >> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > >> > >> void kvm_define_shared_msr(unsigned index, u32 msr); > >> void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); > >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > >> index 02b51dd..4a21a6b 100644 > >> --- a/arch/x86/kvm/lapic.c > >> +++ b/arch/x86/kvm/lapic.c > >> @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > >> case APIC_DM_INIT: > >> if (!trig_mode || level) { > >> result = 1; > >> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >> + set_bit(KVM_APIC_INIT, &apic->pending_events); > >> kvm_make_request(KVM_REQ_EVENT, vcpu); > >> kvm_vcpu_kick(vcpu); > >> } else { > >> @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > >> case APIC_DM_STARTUP: > >> apic_debug("SIPI to vcpu %d vector 0x%02x\n", > >> vcpu->vcpu_id, vector); > >> - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > >> - result = 1; > >> - vcpu->arch.sipi_vector = vector; > >> - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > >> - kvm_make_request(KVM_REQ_EVENT, vcpu); > >> - kvm_vcpu_kick(vcpu); > >> - } > >> + result = 1; > >> + apic->sipi_vector = vector; > >> + /* make sure sipi_vector is visible for the receiver */ > >> + smp_wmb(); > >> + set_bit(KVM_APIC_SIPI, &apic->pending_events); > >> + kvm_make_request(KVM_REQ_EVENT, vcpu); > 
>> + kvm_vcpu_kick(vcpu); > >> break; > >> > >> case APIC_DM_EXTINT: > >> @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) > >> addr); > >> } > >> > >> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu) > >> +{ > >> + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; > > The function is called only from kvm_arch_vcpu_runnable() and > > kvm_arch_vcpu_runnable() is called only if irqchip is in kernel, so I > > think kvm_vcpu_has_lapic() can be dropped. > > Likely, will check again. > > > > >> +} > >> + > >> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) > >> +{ > >> + struct kvm_lapic *apic = vcpu->arch.apic; > >> + > >> + if (!kvm_vcpu_has_lapic(vcpu)) > >> + return; > >> + > >> + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) > >> + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > >> + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && > >> + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > >> + vcpu->arch.sipi_vector = apic->sipi_vector; > >> + pr_debug("vcpu %d received sipi with vector # %x\n", > >> + vcpu->vcpu_id, vcpu->arch.sipi_vector); > >> + kvm_lapic_reset(vcpu); > >> + kvm_vcpu_reset(vcpu); > > Shouldn't we reset on KVM_APIC_INIT? > > It is reset a few lines above. Yes, there could be another INIT pending > by now, but that is racing with SIPI anyway and could also arrive a few > instruction later. So I don't think we need to worry. > Not sure I understand. I am saying the code should be: if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_lapic_reset(vcpu); kvm_vcpu_reset(vcpu); } if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = apic->sipi_vector; } > > > >> + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > >> + } > >> +} > >> + > >> void kvm_lapic_init(void) > >> { > >> /* do not patch jump label more than once per second */ > >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > >> index 1676d34..ef3f4ef 100644 > >> --- a/arch/x86/kvm/lapic.h > >> +++ b/arch/x86/kvm/lapic.h > >> @@ -5,6 +5,9 @@ > >> > >> #include <linux/kvm_host.h> > >> > >> +#define KVM_APIC_INIT 0 > >> +#define KVM_APIC_SIPI 1 > >> + > >> struct kvm_timer { > >> struct hrtimer timer; > >> s64 period; /* unit: ns */ > >> @@ -32,13 +35,17 @@ struct kvm_lapic { > >> void *regs; > >> gpa_t vapic_addr; > >> struct page *vapic_page; > >> + unsigned long pending_events; > >> + unsigned int sipi_vector; > >> }; > >> int kvm_create_lapic(struct kvm_vcpu *vcpu); > >> void kvm_free_lapic(struct kvm_vcpu *vcpu); > >> > >> int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); > >> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu); > >> int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); > >> int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); > >> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); > >> void kvm_lapic_reset(struct kvm_vcpu *vcpu); > >> u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); > >> void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); > >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >> index b891ac3..a0b8041 100644 > >> --- a/arch/x86/kvm/x86.c > >> +++ b/arch/x86/kvm/x86.c > >> @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; > >> > >> static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); > >> > >> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > >> - > >> static inline void kvm_async_pf_hash_reset(struct 
kvm_vcpu *vcpu) > >> { > >> int i; > >> @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > >> bool req_immediate_exit = 0; > >> > >> if (vcpu->requests) { > >> + kvm_apic_accept_events(vcpu); > >> + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > >> + r = 1; > >> + goto out; > >> + } > > Why not call it under kvm_check_request(KVM_REQ_EVENT, vcpu) below? > > I probably had the desire to handle this event and potential exit reason early. Not sure if it makes any difference. > It shouldn't, so better to handle it with all the other apic events. -- Gleb.
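Concretely, Gleb's suggestion would move the call down into the existing KVM_REQ_EVENT block of vcpu_enter_guest(), along these lines (a placement sketch only; req_int_win and inject_pending_event() are taken from the surrounding code of that era):

	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
		/* INIT/SIPI processed together with the other APIC events */
		kvm_apic_accept_events(vcpu);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			/* wait-for-SIPI: bail out to the outer loop */
			r = 1;
			goto out;
		}

		inject_pending_event(vcpu);
		/* ... remaining event handling ... */
	}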
Il 12/03/2013 14:41, Gleb Natapov ha scritto: > Not sure I understand. I am saying the code should be: > > if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { > vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > kvm_lapic_reset(vcpu); > kvm_vcpu_reset(vcpu); > } > if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && > vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > vcpu->arch.sipi_vector = apic->sipi_vector; > } > Yes, this is also discussed in the async_pf thread. But kvm_vcpu_reset is what sets CS based on the sipi_vector, so some more changes are needed (or you can just reset the VCPU twice, but that's ugly...). Paolo
On Tue, Mar 12, 2013 at 02:43:38PM +0100, Paolo Bonzini wrote: > Il 12/03/2013 14:41, Gleb Natapov ha scritto: > > Not sure I understand. I am saying the code should be: > > > > if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { > > vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > > kvm_lapic_reset(vcpu); > > kvm_vcpu_reset(vcpu); > > } > > if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && > > vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > > vcpu->arch.sipi_vector = apic->sipi_vector; > > } > > > > Yes, this is also discussed in the async_pf thread. But kvm_vcpu_reset > is what sets CS based on the sipi_vector, so some more changes are > needed (or you can just reset the VCPU twice, but that's ugly...). > Ah correct. We can start from resetting twice and documenting why we are doing it. Then we should move the sregs register initialization to common code and factor out the CS handling into a separate function which will be called on SIPI. Or just call kvm_set_segment() on SIPI. -- Gleb.
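The "separate function" Gleb has in mind could look roughly like the helper below. The name and placement are illustrative, not from the posted patch; kvm_get_segment(), kvm_set_segment() and kvm_rip_write() are the existing x86.c accessors:

/* SIPI part of the wakeup only: point CS:IP at the startup trampoline,
 * leaving the full register reset to the INIT handling */
static void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu,
					 unsigned int vector)
{
	struct kvm_segment cs;

	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
	cs.selector = vector << 8;	/* real-mode selector = vector * 0x100 */
	cs.base = vector << 12;		/* base = selector * 16 */
	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
	kvm_rip_write(vcpu, 0);
}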
On 2013-03-12 13:06, Paolo Bonzini wrote: >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >> index 02b51dd..4a21a6b 100644 >> --- a/arch/x86/kvm/lapic.c >> +++ b/arch/x86/kvm/lapic.c >> @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >> case APIC_DM_INIT: >> if (!trig_mode || level) { >> result = 1; >> - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; >> + set_bit(KVM_APIC_INIT, &apic->pending_events); > > I think this should clear pending SIPIs, unless KVM_APIC_INIT was > already set in which case it should be a no-op. Something like: > > e = apic->pending_events; > while (!(e & KVM_APIC_INIT)) > e = cmpxchg(&apic->pending_events, e, > (e | KVM_APIC_INIT) & ~KVM_APIC_SIPI); > > If you do this, better make pending_events an atomic_t. Quick question: Why atomic_t? It becomes a bit ugly to poke into the atomic counter for bitops, and cmpxchg is already mapped to an interlocked version anyway. Jan
Il 13/03/2013 08:39, Jan Kiszka ha scritto: >> I think this should clear pending SIPIs, unless KVM_APIC_INIT >> was already set in which case it should be a no-op. Something >> like: >> >> e = apic->pending_events; while (!(e & KVM_APIC_INIT)) e = >> cmpxchg(&apic->pending_events, e, (e | KVM_APIC_INIT) & >> ~KVM_APIC_SIPI); >> >> If you do this, better make pending_events an atomic_t. > > Quick question: Why atomic_t? It becomes a bit ugly to poke into > the atomic counter for bitops, and cmpxchg is already mapped to an > interlocked version anyway. It's a bit safer because it forces you to use atomic_read, and generally prompts reviewers to think about memory barriers. But indeed the lack of atomic_set_bit and atomic_clear_bit makes the code uglier. And as long as there are no other bits in apic->pending_events, the above loop is actually the same as just apic->pending_events = 1 << KVM_APIC_INIT; Paolo
On 2013-03-13 10:03, Paolo Bonzini wrote: > Il 13/03/2013 08:39, Jan Kiszka ha scritto: >>> I think this should clear pending SIPIs, unless KVM_APIC_INIT >>> was already set in which case it should be a no-op. Something >>> like: >>> >>> e = apic->pending_events; while (!(e & KVM_APIC_INIT)) e = >>> cmpxchg(&apic->pending_events, e, (e | KVM_APIC_INIT) & >>> ~KVM_APIC_SIPI); >>> >>> If you do this, better make pending_events an atomic_t. > >> Quick question: Why atomic_t? It becomes a bit ugly to poke into >> the atomic counter for bitops, and cmpxchg is already mapped to an >> interlocked version anyway. > > It's a bit safer because it forces you to use atomic_read, and > generally prompts reviewers to think about memory barriers. But > indeed the lack of atomic_set_bit and atomic_clear_bit makes the code > uglier. And as long as there are no other bits in > apic->pending_events, the above loop is actually the same as just > > apic->pending_events = 1 << KVM_APIC_INIT; Yeah, I can simplify this in the next version of the just-posted patch. Jan
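Spelled out, that simplification would turn the INIT branch of __apic_accept_irq() into something like the following sketch (valid only as long as KVM_APIC_INIT and KVM_APIC_SIPI remain the sole bits in pending_events):

	case APIC_DM_INIT:
		if (!trig_mode || level) {
			result = 1;
			/* overwriting pending_events implicitly discards
			 * a not-yet-processed SIPI, which is exactly the
			 * INIT semantics discussed above */
			apic->pending_events = (1UL << KVM_APIC_INIT);
			kvm_make_request(KVM_REQ_EVENT, vcpu);
			kvm_vcpu_kick(vcpu);
		} else {
			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
				   vcpu->vcpu_id);
		}
		break;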
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 348d859..2d28e76 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 02b51dd..4a21a6b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_INIT:
 		if (!trig_mode || level) {
 			result = 1;
-			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+			set_bit(KVM_APIC_INIT, &apic->pending_events);
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		} else {
@@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_STARTUP:
 		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
 			   vcpu->vcpu_id, vector);
-		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
-			result = 1;
-			vcpu->arch.sipi_vector = vector;
-			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
-			kvm_vcpu_kick(vcpu);
-		}
+		result = 1;
+		apic->sipi_vector = vector;
+		/* make sure sipi_vector is visible for the receiver */
+		smp_wmb();
+		set_bit(KVM_APIC_SIPI, &apic->pending_events);
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_EXTINT:
@@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
 					 addr);
 }
 
+bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
+{
+	return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events))
+		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
+	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+		vcpu->arch.sipi_vector = apic->sipi_vector;
+		pr_debug("vcpu %d received sipi with vector # %x\n",
+			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
+		kvm_lapic_reset(vcpu);
+		kvm_vcpu_reset(vcpu);
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	}
+}
+
 void kvm_lapic_init(void)
 {
 	/* do not patch jump label more than once per second */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34..ef3f4ef 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -5,6 +5,9 @@
 
 #include <linux/kvm_host.h>
 
+#define KVM_APIC_INIT		0
+#define KVM_APIC_SIPI		1
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period;	/* unit: ns */
@@ -32,13 +35,17 @@ struct kvm_lapic {
 	void *regs;
 	gpa_t vapic_addr;
 	struct page *vapic_page;
+	unsigned long pending_events;
+	unsigned int sipi_vector;
 };
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+bool kvm_apic_has_events(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b891ac3..a0b8041 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
-static void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
-
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_immediate_exit = 0;
 
 	if (vcpu->requests) {
+		kvm_apic_accept_events(vcpu);
+		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+			r = 1;
+			goto out;
+		}
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	int r;
 	struct kvm *kvm = vcpu->kvm;
 
-	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		pr_debug("vcpu %d received sipi with vector # %x\n",
-			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
-		kvm_lapic_reset(vcpu);
-		kvm_vcpu_reset(vcpu);
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-	}
-
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 	r = vapic_enter(vcpu);
 	if (r) {
@@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			kvm_vcpu_block(vcpu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
-			{
+			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
+				kvm_apic_accept_events(vcpu);
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
 					vcpu->arch.mp_state =
@@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 				case KVM_MP_STATE_RUNNABLE:
 					vcpu->arch.apf.halted = false;
 					break;
-				case KVM_MP_STATE_SIPI_RECEIVED:
+				case KVM_MP_STATE_INIT_RECEIVED:
+					break;
 				default:
 					r = -EINTR;
 					break;
@@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			++vcpu->stat.request_irq_exits;
 		}
 
-		kvm_check_async_pf_completion(vcpu);
+		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
+			kvm_check_async_pf_completion(vcpu);
 
 		if (signal_pending(current)) {
 			r = -EINTR;
@@ -6015,6 +6012,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
+		kvm_apic_accept_events(vcpu);
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		r = -EAGAIN;
 		goto out;
@@ -6171,6 +6169,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	kvm_apic_accept_events(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
 	return 0;
 }
@@ -6178,7 +6177,13 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	vcpu->arch.mp_state = mp_state->mp_state;
+	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+		if (!kvm_vcpu_has_lapic(vcpu))
+			return -EINVAL;
+		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+	} else
+		vcpu->arch.mp_state = mp_state->mp_state;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 0;
 }
@@ -6515,7 +6520,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6988,7 +6993,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
-		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+		|| kvm_apic_has_events(vcpu)
 		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));
A VCPU sending INIT or SIPI to some other VCPU races for setting the
remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
was overwritten by kvm_emulate_halt and, thus, got lost.

This introduces APIC events for those two signals, keeping them in
kvm_apic until kvm_apic_accept_events is run over the target vcpu
context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
are pending events, thus if vcpu blocking should end.

The patch comes with the side effect of effectively obsoleting
KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
state. That also means we no longer exit to user space after receiving a
SIPI event.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving
this honor to Paolo.

I didn't try porting any INIT handling for nested on top yet but I
think it should be feasible - once we know their semantics for sure, at
least on Intel...

 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/lapic.c            | 41 +++++++++++++++++++++++++++++++-------
 arch/x86/kvm/lapic.h            |  7 ++++++
 arch/x86/kvm/x86.c              | 39 ++++++++++++++++++++----------------
 4 files changed, 63 insertions(+), 25 deletions(-)