Message ID | 20210428173820.13051-1-jon@nutanix.com (mailing list archive) |
---|---|
State | New, archived |
Series | kvm: x86: move srcu lock out of kvm_vcpu_check_block |
On Wed, Apr 28, 2021, Jon Kohler wrote:
> To improve performance, this moves kvm->srcu lock logic from
> kvm_vcpu_check_block to kvm_vcpu_running and wraps directly around
> check_events. Also adds a hint for callers to tell
> kvm_vcpu_running whether or not to acquire srcu, which is useful in
> situations where the lock may already be held. With this in place, we
> see roughly 5% improvement in an internal benchmark [3] and no more
> impact from this lock on non-nested workloads.

...

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index efc7a82ab140..354f690cc982 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9273,10 +9273,24 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
> +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
>  {
> -	if (is_guest_mode(vcpu))
> -		kvm_x86_ops.nested_ops->check_events(vcpu);
> +	if (is_guest_mode(vcpu)) {
> +		if (acquire_srcu) {
> +			/*
> +			 * We need to lock because check_events could call
> +			 * nested_vmx_vmexit() which might need to resolve a
> +			 * valid memslot. We will have this lock only when
> +			 * called from vcpu_run but not when called from
> +			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
> +			 */
> +			int idx = srcu_read_lock(&vcpu->kvm->srcu);
> +			kvm_x86_ops.nested_ops->check_events(vcpu);
> +			srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +		} else {
> +			kvm_x86_ops.nested_ops->check_events(vcpu);
> +		}
> +	}

Obviously not your fault, but I absolutely detest calling check_events() from
kvm_vcpu_running. I would much prefer to make baby steps toward cleaning up the
existing mess instead of piling more weirdness on top.

Ideally, APICv support would be fixed to not require a deep probe into nested
events just to see if a vCPU can run. But, that's probably more than we want to
bite off at this time.

What if we add another nested_ops API to check if the vCPU has an event, but not
actually process the event? I think that would allow eliminating the SRCU lock,
and would get rid of the most egregious behavior of triggering a nested VM-Exit
in a seemingly innocuous helper.

If this works, we could even explore moving the call to nested_ops->has_events()
out of kvm_vcpu_running() and into kvm_vcpu_has_events(); I can't tell if the
side effects in vcpu_block() would get messed up with that change :-/

Incomplete patch...

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 00339d624c92..15f514891326 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3771,15 +3771,17 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 		to_vmx(vcpu)->nested.preemption_timer_expired;
 }
 
-static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+static int __vmx_check_nested_events(struct kvm_vcpu *vcpu, bool only_check)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qual;
-	bool block_nested_events =
-		vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
 	bool mtf_pending = vmx->nested.mtf_pending;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
+	bool block_nested_events = only_check ||
+		vmx->nested.nested_run_pending ||
+		kvm_event_needs_reinjection(vcpu);
+
 	/*
 	 * Clear the MTF state. If a higher priority VM-exit is delivered first,
 	 * this state is discarded.
@@ -3837,7 +3839,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 	}
 
 	if (vcpu->arch.exception.pending) {
-		if (vmx->nested.nested_run_pending)
+		if (vmx->nested.nested_run_pending || only_check)
 			return -EBUSY;
 		if (!nested_vmx_check_exception(vcpu, &exit_qual))
 			goto no_vmexit;
@@ -3886,10 +3888,23 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 	}
 
 no_vmexit:
-	vmx_complete_nested_posted_interrupt(vcpu);
+	if (!only_check)
+		vmx_complete_nested_posted_interrupt(vcpu);
+	else if (vmx->nested.pi_desc && vmx->nested.pi_pending)
+		return -EBUSY;
 	return 0;
 }
 
+static bool vmx_has_nested_event(struct kvm_vcpu *vcpu)
+{
+	return !!__vmx_check_nested_events(vcpu, true);
+}
+
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+{
+	return __vmx_check_nested_events(vcpu, false);
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
 	ktime_t remaining =
@@ -6627,6 +6642,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 }
 
 struct kvm_x86_nested_ops vmx_nested_ops = {
+	.has_event = vmx_has_nested_event,
 	.check_events = vmx_check_nested_events,
 	.hv_timer_pending = nested_vmx_preemption_timer_pending,
 	.triple_fault = nested_vmx_triple_fault,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a829f1ab60c3..5df01012cb1f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9310,6 +9310,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			update_cr8_intercept(vcpu);
 			kvm_lapic_sync_to_vapic(vcpu);
 		}
+	} else if (is_guest_mode(vcpu)) {
+		r = kvm_check_nested_events(vcpu);
+		if (r < 0)
+			req_immediate_exit = true;
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -9516,8 +9520,10 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu))
-		kvm_check_nested_events(vcpu);
+	if (is_guest_mode(vcpu) &&
+	    (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu) ||
+	     kvm_x86_ops.nested_ops->has_event(vcpu)))
+		return true;
 
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted);
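For context on the claim that a has_event() hook "would allow eliminating the SRCU
lock": once check_events() is no longer reachable from kvm_vcpu_running(), the
generic halt path presumably no longer needs its SRCU read-side section, ending up
roughly where Jon's patch at the bottom of this page already puts
kvm_vcpu_check_block(), minus the acquire_srcu plumbing. A minimal sketch of that
end state, not part of either patch:

static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
{
	/* No srcu_read_lock() needed once nothing below can touch memslots. */
	if (kvm_arch_vcpu_runnable(vcpu)) {
		kvm_make_request(KVM_REQ_UNHALT, vcpu);
		return -EINTR;
	}

	if (kvm_cpu_has_pending_timer(vcpu) || signal_pending(current))
		return -EINTR;

	return 0;
}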
On 30/04/21 22:45, Sean Christopherson wrote:
> On Wed, Apr 28, 2021, Jon Kohler wrote:
>> To improve performance, this moves kvm->srcu lock logic from
>> kvm_vcpu_check_block to kvm_vcpu_running and wraps directly around
>> check_events. Also adds a hint for callers to tell
>> kvm_vcpu_running whether or not to acquire srcu, which is useful in
>> situations where the lock may already be held. With this in place, we
>> see roughly 5% improvement in an internal benchmark [3] and no more
>> impact from this lock on non-nested workloads.
>
> ...
>
> Obviously not your fault, but I absolutely detest calling check_events() from
> kvm_vcpu_running. I would much prefer to make baby steps toward cleaning up the
> existing mess instead of piling more weirdness on top.
>
> Ideally, APICv support would be fixed to not require a deep probe into nested
> events just to see if a vCPU can run. But, that's probably more than we want to
> bite off at this time.
>
> What if we add another nested_ops API to check if the vCPU has an event, but not
> actually process the event? I think that would allow eliminating the SRCU lock,
> and would get rid of the most egregious behavior of triggering a nested VM-Exit
> in a seemingly innocuous helper.
>
> If this works, we could even explore moving the call to nested_ops->has_events()
> out of kvm_vcpu_running() and into kvm_vcpu_has_events(); I can't tell if the
> side effects in vcpu_block() would get messed up with that change :-/
>
> Incomplete patch...

I think it doesn't even have to be *nested* events. Most events are the
same inside or outside guest mode, as they already special case guest mode
inside the kvm_x86_ops callbacks (e.g. kvm_arch_interrupt_allowed is
already called by kvm_vcpu_has_events).

I think we only need to extend kvm_x86_ops.nested_ops->hv_timer_pending to
cover MTF, plus double check that INIT and SIPI are handled correctly, and
then the call to check_nested_events can go away.

Paolo
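A minimal sketch of the direction Paolo describes (an hv_timer_pending-style
callback widened to also report pending MTF), reusing the preemption-timer test
and the vmx->nested.mtf_pending flag that appear in the diffs above; the name
nested_vmx_event_pending() is illustrative only, not an existing helper:

static bool nested_vmx_event_pending(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Today's hv_timer_pending() check, i.e. an expired preemption timer... */
	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
	    vmx->nested.preemption_timer_expired)
		return true;

	/* ...extended to also report a pending Monitor Trap Flag event. */
	return vmx->nested.mtf_pending;
}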
> On May 1, 2021, at 9:05 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 30/04/21 22:45, Sean Christopherson wrote:
>> [...]
>
> I think it doesn't even have to be *nested* events. Most events are the
> same inside or outside guest mode, as they already special case guest mode
> inside the kvm_x86_ops callbacks (e.g. kvm_arch_interrupt_allowed is
> already called by kvm_vcpu_has_events).
>
> I think we only need to extend kvm_x86_ops.nested_ops->hv_timer_pending to
> cover MTF, plus double check that INIT and SIPI are handled correctly, and
> then the call to check_nested_events can go away.
>
> Paolo

[ resending as my editor switched to html :( ]

Thanks, Paolo and Sean, I appreciate the prompt response. Sorry for the slow
reply; I was out with a hand injury for a few days and am caught up now.

Just to confirm: in the spirit of baby steps, I'm happy to take up the idea
Sean mentioned; that makes sense to me. Perhaps we can do that small patch
and leave a TODO to do a tune-up of hv_timer_pending and the other double
checks Paolo mentioned.

Or would you rather skip that approach and go right to making
check_nested_events go away first? Guessing that might be a larger effort
with more nuances, though.

Happy to help, thanks again,
Jon
On Wed, May 05, 2021, Jon Kohler wrote:
> > On May 1, 2021, at 9:05 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> > On 30/04/21 22:45, Sean Christopherson wrote:
> > [...]
> >
> > I think it doesn't even have to be *nested* events. Most events are the
> > same inside or outside guest mode, as they already special case guest mode
> > inside the kvm_x86_ops callbacks (e.g. kvm_arch_interrupt_allowed is
> > already called by kvm_vcpu_has_events).
> >
> > I think we only need to extend kvm_x86_ops.nested_ops->hv_timer_pending to
> > cover MTF, plus double check that INIT and SIPI are handled correctly, and
> > then the call to check_nested_events can go away.
>
> Thanks, Paolo and Sean, I appreciate the prompt response. Sorry for the slow
> reply; I was out with a hand injury for a few days and am caught up now.
>
> Just to confirm: in the spirit of baby steps, I'm happy to take up the idea
> Sean mentioned; that makes sense to me. Perhaps we can do that small patch
> and leave a TODO to do a tune-up of hv_timer_pending and the other double
> checks Paolo mentioned.

Paolo was pointing out that kvm_vcpu_has_events() already checks hv_timer_pending,
and that we could add the few missing nested event cases to kvm_vcpu_has_events()
instead of wholesale checking everything that's in check_nested_events().

I believe that would work, as I suspect the underlying bug that was alluded to by
commit 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running")
has since been fixed. But, I'm not sure it makes much difference in practice since
we'll likely end up with nested_ops->has_events() either way.

Staring a bit more, I'm pretty sure hv_timer_pending() can be made obsolete and
dropped.

Unless Paolo objects, I still like my original proposal. I think the safest
approach from a bisection standpoint would be to do this in 3-4 stages:

  1. Refactor check_nested_events() to split out a has_events() helper.
  2. Move the has_events() call from kvm_vcpu_running() into kvm_vcpu_has_events().
  3. Drop the explicit hv_timer_pending() in inject_pending_event(). It should
     be dead code since it's just a pointer to nested_vmx_preemption_timer_pending(),
     which is handled by vmx_check_nested_events() and called earlier.
  4. Drop the explicit hv_timer_pending() in kvm_vcpu_has_events() for the same
     reasons as (3). This can also drop hv_timer_pending() entirely.

> Or would you rather skip that approach and go right to making
> check_nested_events go away first? Guessing that might be a larger effort
> with more nuances, though.
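A rough sketch of where steps (2) and (4) might land, assuming the has_event()
hook from the earlier incomplete patch; the existing body of
kvm_vcpu_has_events() is elided and only the new tail is shown:

static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
	/*
	 * ...existing exception/NMI/SMI/interrupt checks, minus the
	 * hv_timer_pending() call that step (4) removes...
	 */

	/* Step (2): the nested-event probe moves here from kvm_vcpu_running(). */
	if (is_guest_mode(vcpu) &&
	    (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu) ||
	     kvm_x86_ops.nested_ops->has_event(vcpu)))
		return true;

	return false;
}

static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
	/* No nested-event probing left here at all. */
	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
		!vcpu->arch.apf.halted);
}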
On 19/05/21 23:53, Sean Christopherson wrote:
>   1. Refactor check_nested_events() to split out a has_events() helper.
>   2. Move the has_events() call from kvm_vcpu_running() into kvm_vcpu_has_events().
>   3. Drop the explicit hv_timer_pending() in inject_pending_event(). It should
>      be dead code since it's just a pointer to nested_vmx_preemption_timer_pending(),
>      which is handled by vmx_check_nested_events() and called earlier.
>   4. Drop the explicit hv_timer_pending() in kvm_vcpu_has_events() for the same
>      reasons as (3). This can also drop hv_timer_pending() entirely.

Sounds good, except that I would do (3) first, since if I understand correctly
it's a valid cleanup in the current code as well, and do (2) and (4) at the same
time, since you're basically enlarging the scope of the existing hv_timer_pending
call to include all nested events.

Paolo
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index efc7a82ab140..354f690cc982 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9273,10 +9273,24 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
 {
-	if (is_guest_mode(vcpu))
-		kvm_x86_ops.nested_ops->check_events(vcpu);
+	if (is_guest_mode(vcpu)) {
+		if (acquire_srcu) {
+			/*
+			 * We need to lock because check_events could call
+			 * nested_vmx_vmexit() which might need to resolve a
+			 * valid memslot. We will have this lock only when
+			 * called from vcpu_run but not when called from
+			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
+			 */
+			int idx = srcu_read_lock(&vcpu->kvm->srcu);
+			kvm_x86_ops.nested_ops->check_events(vcpu);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		} else {
+			kvm_x86_ops.nested_ops->check_events(vcpu);
+		}
+	}
 
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted);
@@ -9291,7 +9305,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.l1tf_flush_l1d = true;
 
 	for (;;) {
-		if (kvm_vcpu_running(vcpu)) {
+		if (kvm_vcpu_running(vcpu, false)) {
 			r = vcpu_enter_guest(vcpu);
 		} else {
 			r = vcpu_block(kvm, vcpu);
@@ -10999,7 +11013,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+	return kvm_vcpu_running(vcpu, true) || kvm_vcpu_has_events(vcpu);
 }
 
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 383df23514b9..05e29aed35b5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2783,22 +2783,15 @@ static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
-	int ret = -EINTR;
-	int idx = srcu_read_lock(&vcpu->kvm->srcu);
-
 	if (kvm_arch_vcpu_runnable(vcpu)) {
 		kvm_make_request(KVM_REQ_UNHALT, vcpu);
-		goto out;
+		return -EINTR;
 	}
-	if (kvm_cpu_has_pending_timer(vcpu))
-		goto out;
-	if (signal_pending(current))
-		goto out;
 
-	ret = 0;
-out:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	return ret;
+	if (kvm_cpu_has_pending_timer(vcpu) || signal_pending(current))
+		return -EINTR;
+
+	return 0;
 }
 
 static inline void