diff mbox series

[v4,2/5] KVM: LAPIC: inject lapic timer interrupt by posted interrupt

Message ID 1560770687-23227-3-git-send-email-wanpengli@tencent.com (mailing list archive)
State New, archived
Headers show
Series KVM: LAPIC: Implement Exitless Timer | expand

Commit Message

Wanpeng Li June 17, 2019, 11:24 a.m. UTC
From: Wanpeng Li <wanpengli@tencent.com>

Dedicated instances are currently disturbed by unnecessary jitter due 
to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
There is no hardware virtual timer on Intel for guest like ARM. Both 
programming timer in guest and the emulated timer fires incur vmexits.
This patch tries to avoid vmexit which is incurred by the emulated 
timer fires in dedicated instance scenario. 

When nohz_full is enabled in dedicated instances scenario, the emulated 
timers can be offload to the nearest busy housekeeping cpus since APICv 
is really common in recent years. The guest timer interrupt is injected 
by posted-interrupt which is delivered by housekeeping cpu once the emulated 
timer fires. 

The host admin should fine tuned, e.g. dedicated instances scenario w/ 
nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus 
for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root  
mode, ~3% redis performance benefit can be observed on Skylake server.

w/o patch:

            VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time

EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )

w/ patch:

            VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time

EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
 arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
 arch/x86/kvm/lapic.h            |  1 +
 arch/x86/kvm/vmx/vmx.c          |  3 ++-
 arch/x86/kvm/x86.c              |  5 +++++
 arch/x86/kvm/x86.h              |  2 ++
 include/linux/sched/isolation.h |  2 ++
 kernel/sched/isolation.c        |  6 ++++++
 7 files changed, 44 insertions(+), 8 deletions(-)

Comments

Marcelo Tosatti June 18, 2019, 1:35 p.m. UTC | #1
On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> Dedicated instances are currently disturbed by unnecessary jitter due 
> to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> There is no hardware virtual timer on Intel for guest like ARM. Both 
> programming timer in guest and the emulated timer fires incur vmexits.
> This patch tries to avoid vmexit which is incurred by the emulated 
> timer fires in dedicated instance scenario. 
> 
> When nohz_full is enabled in dedicated instances scenario, the emulated 
> timers can be offload to the nearest busy housekeeping cpus since APICv 
> is really common in recent years. The guest timer interrupt is injected 
> by posted-interrupt which is delivered by housekeeping cpu once the emulated 
> timer fires. 
> 
> The host admin should fine tuned, e.g. dedicated instances scenario w/ 
> nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus 
> for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root  
> mode, ~3% redis performance benefit can be observed on Skylake server.
> 
> w/o patch:
> 
>             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> 
> EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> 
> w/ patch:
> 
>             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> 
> EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
>  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
>  arch/x86/kvm/lapic.h            |  1 +
>  arch/x86/kvm/vmx/vmx.c          |  3 ++-
>  arch/x86/kvm/x86.c              |  5 +++++
>  arch/x86/kvm/x86.h              |  2 ++
>  include/linux/sched/isolation.h |  2 ++
>  kernel/sched/isolation.c        |  6 ++++++
>  7 files changed, 44 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 87ecb56..9ceeee5 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
>  	return apic->vcpu->vcpu_id;
>  }
>  
> +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> +{
> +	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> +		kvm_hlt_in_guest(vcpu->kvm);
> +}
> +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);

Paolo, can you explain the reasoning behind this?

Should not be necessary...
Wanpeng Li June 19, 2019, 12:36 a.m. UTC | #2
On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > From: Wanpeng Li <wanpengli@tencent.com>
> >
> > Dedicated instances are currently disturbed by unnecessary jitter due
> > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > There is no hardware virtual timer on Intel for guest like ARM. Both
> > programming timer in guest and the emulated timer fires incur vmexits.
> > This patch tries to avoid vmexit which is incurred by the emulated
> > timer fires in dedicated instance scenario.
> >
> > When nohz_full is enabled in dedicated instances scenario, the emulated
> > timers can be offload to the nearest busy housekeeping cpus since APICv
> > is really common in recent years. The guest timer interrupt is injected
> > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > timer fires.
> >
> > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > mode, ~3% redis performance benefit can be observed on Skylake server.
> >
> > w/o patch:
> >
> >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> >
> > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> >
> > w/ patch:
> >
> >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> >
> > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> >
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > ---
> >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> >  arch/x86/kvm/lapic.h            |  1 +
> >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> >  arch/x86/kvm/x86.c              |  5 +++++
> >  arch/x86/kvm/x86.h              |  2 ++
> >  include/linux/sched/isolation.h |  2 ++
> >  kernel/sched/isolation.c        |  6 ++++++
> >  7 files changed, 44 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 87ecb56..9ceeee5 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> >       return apic->vcpu->vcpu_id;
> >  }
> >
> > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > +{
> > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > +             kvm_hlt_in_guest(vcpu->kvm);
> > +}
> > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
>
> Paolo, can you explain the reasoning behind this?
>
> Should not be necessary...

Here some new discussions:
https://lkml.org/lkml/2019/6/13/1423
https://lkml.org/lkml/2019/6/13/1420
Marcelo Tosatti June 19, 2019, 9:03 p.m. UTC | #3
Hi Li,

On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >
> > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > From: Wanpeng Li <wanpengli@tencent.com>
> > >
> > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > programming timer in guest and the emulated timer fires incur vmexits.
> > > This patch tries to avoid vmexit which is incurred by the emulated
> > > timer fires in dedicated instance scenario.
> > >
> > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > is really common in recent years. The guest timer interrupt is injected
> > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > timer fires.
> > >
> > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > >
> > > w/o patch:
> > >
> > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > >
> > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > >
> > > w/ patch:
> > >
> > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > >
> > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > >
> > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > ---
> > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > >  arch/x86/kvm/lapic.h            |  1 +
> > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > >  arch/x86/kvm/x86.c              |  5 +++++
> > >  arch/x86/kvm/x86.h              |  2 ++
> > >  include/linux/sched/isolation.h |  2 ++
> > >  kernel/sched/isolation.c        |  6 ++++++
> > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > index 87ecb56..9ceeee5 100644
> > > --- a/arch/x86/kvm/lapic.c
> > > +++ b/arch/x86/kvm/lapic.c
> > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > >       return apic->vcpu->vcpu_id;
> > >  }
> > >
> > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > +{
> > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > +}
> > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> >
> > Paolo, can you explain the reasoning behind this?
> >
> > Should not be necessary...
> 
> Here some new discussions:
> https://lkml.org/lkml/2019/6/13/1423

Not sure what this has to do with injecting timer
interrupts via posted interrupts ?

> https://lkml.org/lkml/2019/6/13/1420

Two things (unrelated to the above):

1) hrtimer_reprogram is unable to wakeup a remote vCPU, therefore 
i believe execution of apic_timer_expired can be delayed. 
Should wakeup the CPU which hosts apic_timer_expired.


        /*
         * If the timer is not on the current cpu, we cannot reprogram
         * the other cpus clock event device.
         */
        if (base->cpu_base != cpu_base)
                return;

2) Getting an oops when running cyclictest, debugging...
Wanpeng Li June 20, 2019, 12:52 a.m. UTC | #4
On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> Hi Li,
>
> On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > >
> > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > >
> > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > timer fires in dedicated instance scenario.
> > > >
> > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > is really common in recent years. The guest timer interrupt is injected
> > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > timer fires.
> > > >
> > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > >
> > > > w/o patch:
> > > >
> > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > >
> > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > >
> > > > w/ patch:
> > > >
> > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > >
> > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > >
> > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > ---
> > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > >  arch/x86/kvm/lapic.h            |  1 +
> > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > >  arch/x86/kvm/x86.h              |  2 ++
> > > >  include/linux/sched/isolation.h |  2 ++
> > > >  kernel/sched/isolation.c        |  6 ++++++
> > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > index 87ecb56..9ceeee5 100644
> > > > --- a/arch/x86/kvm/lapic.c
> > > > +++ b/arch/x86/kvm/lapic.c
> > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > >       return apic->vcpu->vcpu_id;
> > > >  }
> > > >
> > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > +{
> > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > >
> > > Paolo, can you explain the reasoning behind this?
> > >
> > > Should not be necessary...
> >
> > Here some new discussions:
> > https://lkml.org/lkml/2019/6/13/1423
>
> Not sure what this has to do with injecting timer
> interrupts via posted interrupts ?

Yeah, need more explain from Paolo! Ping Paolo,

>
> > https://lkml.org/lkml/2019/6/13/1420
>
> Two things (unrelated to the above):
>
> 1) hrtimer_reprogram is unable to wakeup a remote vCPU, therefore
> i believe execution of apic_timer_expired can be delayed.
> Should wakeup the CPU which hosts apic_timer_expired.
>
>
>         /*
>          * If the timer is not on the current cpu, we cannot reprogram
>          * the other cpus clock event device.
>          */
>         if (base->cpu_base != cpu_base)
>                 return;

If it is not the first expiring timer on the new target, we don't need
to reprogram. It can't be the first expired timer on the new target
since below:

/*
 * We switch the timer base to a power-optimized selected CPU target,
 * if:
 * - NO_HZ_COMMON is enabled
 * - timer migration is enabled
 * - the timer callback is not running
 * - the timer is not the first expiring timer on the new target
 *
 * If one of the above requirements is not fulfilled we move the timer
 * to the current CPU or leave it on the previously assigned CPU if
 * the timer callback is currently running.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
   int pinned)

>
> 2) Getting an oops when running cyclictest, debugging...

Radim point out one issue in patch 5/5, not sure that is cause.

Regards,
Wanpeng Li
Wanpeng Li June 21, 2019, 1:42 a.m. UTC | #5
On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> Hi Li,
>
> On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > >
> > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > >
> > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > timer fires in dedicated instance scenario.
> > > >
> > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > is really common in recent years. The guest timer interrupt is injected
> > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > timer fires.
> > > >
> > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > >
> > > > w/o patch:
> > > >
> > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > >
> > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > >
> > > > w/ patch:
> > > >
> > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > >
> > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > >
> > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > ---
> > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > >  arch/x86/kvm/lapic.h            |  1 +
> > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > >  arch/x86/kvm/x86.h              |  2 ++
> > > >  include/linux/sched/isolation.h |  2 ++
> > > >  kernel/sched/isolation.c        |  6 ++++++
> > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > index 87ecb56..9ceeee5 100644
> > > > --- a/arch/x86/kvm/lapic.c
> > > > +++ b/arch/x86/kvm/lapic.c
> > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > >       return apic->vcpu->vcpu_id;
> > > >  }
> > > >
> > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > +{
> > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > >
> > > Paolo, can you explain the reasoning behind this?
> > >
> > > Should not be necessary...

https://lkml.org/lkml/2019/6/5/436  "Here you need to check
kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
through kvm_apic_expired if the guest needs to be woken up from
kvm_vcpu_block."

I think we can still be woken up from kvm_vcpu_block() if pir is set.

Regards,
Wanpeng Li
Marcelo Tosatti June 21, 2019, 9:42 p.m. UTC | #6
On Fri, Jun 21, 2019 at 09:42:39AM +0800, Wanpeng Li wrote:
> On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >
> > Hi Li,
> >
> > On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > >
> > > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > > >
> > > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > > timer fires in dedicated instance scenario.
> > > > >
> > > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > > is really common in recent years. The guest timer interrupt is injected
> > > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > > timer fires.
> > > > >
> > > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > > >
> > > > > w/o patch:
> > > > >
> > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > > >
> > > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > > >
> > > > > w/ patch:
> > > > >
> > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > > >
> > > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > > >
> > > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > > ---
> > > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > > >  arch/x86/kvm/lapic.h            |  1 +
> > > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > > >  arch/x86/kvm/x86.h              |  2 ++
> > > > >  include/linux/sched/isolation.h |  2 ++
> > > > >  kernel/sched/isolation.c        |  6 ++++++
> > > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > > >
> > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > > index 87ecb56..9ceeee5 100644
> > > > > --- a/arch/x86/kvm/lapic.c
> > > > > +++ b/arch/x86/kvm/lapic.c
> > > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > > >       return apic->vcpu->vcpu_id;
> > > > >  }
> > > > >
> > > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > > +{
> > > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > > +}
> > > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > > >
> > > > Paolo, can you explain the reasoning behind this?
> > > >
> > > > Should not be necessary...
> 
> https://lkml.org/lkml/2019/6/5/436  "Here you need to check
> kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
> through kvm_apic_expired if the guest needs to be woken up from
> kvm_vcpu_block."

Ah, i think he means that a sleeping vcpu (in kvm_vcpu_block) must
be woken up, if it receives a timer interrupt.

But your patch will go through:

kvm_apic_inject_pending_timer_irqs
__apic_accept_irq -> 
vmx_deliver_posted_interrupt ->
kvm_vcpu_trigger_posted_interrupt returns false
(because vcpu->mode != IN_GUEST_MODE) ->
kvm_vcpu_kick

Which will wakeup the vcpu.

Apart from this oops, which triggers when running:
taskset -c 1 ./cyclictest -D 3600 -p 99 -t 1 -h 30 -m -n  -i 50000 -b 40

Timer interruption from housekeeping vcpus is normal to me 
(without requiring kvm_hlt_in_guest).

[ 1145.849646] BUG: kernel NULL pointer dereference, address:
0000000000000000
[ 1145.850481] #PF: supervisor instruction fetch in kernel mode
[ 1145.851161] #PF: error_code(0x0010) - not-present page
[ 1145.851772] PGD 80000002a9fa5067 P4D 80000002a9fa5067 PUD 2abcbb067
PMD 0 
[ 1145.852578] Oops: 0010 [#1] PREEMPT SMP PTI
[ 1145.853066] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.2.0-rc1+ #11
[ 1145.853809] Hardware name: Red Hat KVM, BIOS 1.11.0-2.el7 04/01/2014
[ 1145.854554] RIP: 0010:0x0
[ 1145.854879] Code: Bad RIP value.
[ 1145.855270] RSP: 0018:ffffc90001903e68 EFLAGS: 00010013
[ 1145.855902] RAX: 0000010ac9f60043 RBX: ffff8882b58a8320 RCX:
00000000c526b7c4              
[ 1145.856726] RDX: 0000000000000000 RSI: ffffffff820d9640 RDI:
ffff8882b58a8320              
[ 1145.857560] RBP: ffffffff820d9640 R08: 00000000c526b7c4 R09:
0000000000000832              
[ 1145.858390] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000000              
[ 1145.859222] R13: ffffffff820d9658 R14: ffff8881063b2880 R15:
0000000000000002              
[ 1145.860047] FS:  0000000000000000(0000) GS:ffff8882b5880000(0000)
knlGS:0000000000000000   
[ 1145.860994] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033                              
[ 1145.861692] CR2: ffffffffffffffd6 CR3: 00000002ab1de001 CR4:
0000000000160ee0              
[ 1145.862570] Call Trace:                                                                    
[ 1145.862877]  cpuidle_enter_state+0x7c/0x3e0                                                
[ 1145.863392]  cpuidle_enter+0x29/0x40                                                       


> I think we can still be woken up from kvm_vcpu_block() if pir is set.

Exactly.
Wanpeng Li June 24, 2019, 8:53 a.m. UTC | #7
On Sat, 22 Jun 2019 at 06:11, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> On Fri, Jun 21, 2019 at 09:42:39AM +0800, Wanpeng Li wrote:
> > On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > >
> > > Hi Li,
> > >
> > > On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > > > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > >
> > > > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > > > >
> > > > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > > > timer fires in dedicated instance scenario.
> > > > > >
> > > > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > > > is really common in recent years. The guest timer interrupt is injected
> > > > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > > > timer fires.
> > > > > >
> > > > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > > > >
> > > > > > w/o patch:
> > > > > >
> > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > > > >
> > > > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > > > >
> > > > > > w/ patch:
> > > > > >
> > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > > > >
> > > > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > > > >
> > > > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > > > ---
> > > > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > > > >  arch/x86/kvm/lapic.h            |  1 +
> > > > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > > > >  arch/x86/kvm/x86.h              |  2 ++
> > > > > >  include/linux/sched/isolation.h |  2 ++
> > > > > >  kernel/sched/isolation.c        |  6 ++++++
> > > > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > > > >
> > > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > > > index 87ecb56..9ceeee5 100644
> > > > > > --- a/arch/x86/kvm/lapic.c
> > > > > > +++ b/arch/x86/kvm/lapic.c
> > > > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > > > >       return apic->vcpu->vcpu_id;
> > > > > >  }
> > > > > >
> > > > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > > > +{
> > > > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > > > +}
> > > > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > > > >
> > > > > Paolo, can you explain the reasoning behind this?
> > > > >
> > > > > Should not be necessary...
> >
> > https://lkml.org/lkml/2019/6/5/436  "Here you need to check
> > kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
> > through kvm_apic_expired if the guest needs to be woken up from
> > kvm_vcpu_block."
>
> Ah, i think he means that a sleeping vcpu (in kvm_vcpu_block) must
> be woken up, if it receives a timer interrupt.
>
> But your patch will go through:
>
> kvm_apic_inject_pending_timer_irqs
> __apic_accept_irq ->
> vmx_deliver_posted_interrupt ->
> kvm_vcpu_trigger_posted_interrupt returns false
> (because vcpu->mode != IN_GUEST_MODE) ->
> kvm_vcpu_kick
>
> Which will wakeup the vcpu.

Hi Marcelo,

>
> Apart from this oops, which triggers when running:
> taskset -c 1 ./cyclictest -D 3600 -p 99 -t 1 -h 30 -m -n  -i 50000 -b 40

I try both host and guest use latest kvm/queue  w/ CONFIG_PREEMPT
enabled, and expose mwait as your config, however, there is no oops.
Can you reproduce steadily or encounter casually? Can you reproduce
w/o the patchset?

>
> Timer interruption from housekeeping vcpus is normal to me
> (without requiring kvm_hlt_in_guest).
>
> [ 1145.849646] BUG: kernel NULL pointer dereference, address:
> 0000000000000000
> [ 1145.850481] #PF: supervisor instruction fetch in kernel mode
> [ 1145.851161] #PF: error_code(0x0010) - not-present page
> [ 1145.851772] PGD 80000002a9fa5067 P4D 80000002a9fa5067 PUD 2abcbb067
> PMD 0
> [ 1145.852578] Oops: 0010 [#1] PREEMPT SMP PTI
> [ 1145.853066] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.2.0-rc1+ #11
> [ 1145.853809] Hardware name: Red Hat KVM, BIOS 1.11.0-2.el7 04/01/2014
> [ 1145.854554] RIP: 0010:0x0
> [ 1145.854879] Code: Bad RIP value.
> [ 1145.855270] RSP: 0018:ffffc90001903e68 EFLAGS: 00010013
> [ 1145.855902] RAX: 0000010ac9f60043 RBX: ffff8882b58a8320 RCX:
> 00000000c526b7c4
> [ 1145.856726] RDX: 0000000000000000 RSI: ffffffff820d9640 RDI:
> ffff8882b58a8320
> [ 1145.857560] RBP: ffffffff820d9640 R08: 00000000c526b7c4 R09:
> 0000000000000832
> [ 1145.858390] R10: 0000000000000000 R11: 0000000000000000 R12:
> 0000000000000000
> [ 1145.859222] R13: ffffffff820d9658 R14: ffff8881063b2880 R15:
> 0000000000000002
> [ 1145.860047] FS:  0000000000000000(0000) GS:ffff8882b5880000(0000)
> knlGS:0000000000000000
> [ 1145.860994] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1145.861692] CR2: ffffffffffffffd6 CR3: 00000002ab1de001 CR4:
> 0000000000160ee0
> [ 1145.862570] Call Trace:
> [ 1145.862877]  cpuidle_enter_state+0x7c/0x3e0
> [ 1145.863392]  cpuidle_enter+0x29/0x40
>
>
> > I think we can still be woken up from kvm_vcpu_block() if pir is set.
>
> Exactly.
>
Paolo Bonzini June 25, 2019, 5:02 p.m. UTC | #8
On 21/06/19 23:42, Marcelo Tosatti wrote:
> On Fri, Jun 21, 2019 at 09:42:39AM +0800, Wanpeng Li wrote:
>> On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>
>>> Hi Li,
>>>
>>> On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
>>>> On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>>>
>>>>> On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
>>>>>> From: Wanpeng Li <wanpengli@tencent.com>
>>>>>>
>>>>>> Dedicated instances are currently disturbed by unnecessary jitter due
>>>>>> to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
>>>>>> There is no hardware virtual timer on Intel for guest like ARM. Both
>>>>>> programming timer in guest and the emulated timer fires incur vmexits.
>>>>>> This patch tries to avoid vmexit which is incurred by the emulated
>>>>>> timer fires in dedicated instance scenario.
>>>>>>
>>>>>> When nohz_full is enabled in dedicated instances scenario, the emulated
>>>>>> timers can be offload to the nearest busy housekeeping cpus since APICv
>>>>>> is really common in recent years. The guest timer interrupt is injected
>>>>>> by posted-interrupt which is delivered by housekeeping cpu once the emulated
>>>>>> timer fires.
>>>>>>
>>>>>> The host admin should fine tuned, e.g. dedicated instances scenario w/
>>>>>> nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
>>>>>> for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
>>>>>> mode, ~3% redis performance benefit can be observed on Skylake server.
>>>>>>
>>>>>> w/o patch:
>>>>>>
>>>>>>             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
>>>>>>
>>>>>> EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
>>>>>>
>>>>>> w/ patch:
>>>>>>
>>>>>>             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
>>>>>>
>>>>>> EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
>>>>>>
>>>>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>>>>>> Cc: Radim Krčmář <rkrcmar@redhat.com>
>>>>>> Cc: Marcelo Tosatti <mtosatti@redhat.com>
>>>>>> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
>>>>>> ---
>>>>>>  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
>>>>>>  arch/x86/kvm/lapic.h            |  1 +
>>>>>>  arch/x86/kvm/vmx/vmx.c          |  3 ++-
>>>>>>  arch/x86/kvm/x86.c              |  5 +++++
>>>>>>  arch/x86/kvm/x86.h              |  2 ++
>>>>>>  include/linux/sched/isolation.h |  2 ++
>>>>>>  kernel/sched/isolation.c        |  6 ++++++
>>>>>>  7 files changed, 44 insertions(+), 8 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>>>>> index 87ecb56..9ceeee5 100644
>>>>>> --- a/arch/x86/kvm/lapic.c
>>>>>> +++ b/arch/x86/kvm/lapic.c
>>>>>> @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
>>>>>>       return apic->vcpu->vcpu_id;
>>>>>>  }
>>>>>>
>>>>>> +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
>>>>>> +{
>>>>>> +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
>>>>>> +             kvm_hlt_in_guest(vcpu->kvm);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
>>>>>
>>>>> Paolo, can you explain the reasoning behind this?
>>>>>
>>>>> Should not be necessary...
>>
>> https://lkml.org/lkml/2019/6/5/436  "Here you need to check
>> kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
>> through kvm_apic_expired if the guest needs to be woken up from
>> kvm_vcpu_block."
> 
> Ah, i think he means that a sleeping vcpu (in kvm_vcpu_block) must
> be woken up, if it receives a timer interrupt.

Yes, this is true.

Paolo

> But your patch will go through:
> 
> kvm_apic_inject_pending_timer_irqs
> __apic_accept_irq -> 
> vmx_deliver_posted_interrupt ->
> kvm_vcpu_trigger_posted_interrupt returns false
> (because vcpu->mode != IN_GUEST_MODE) ->
> kvm_vcpu_kick
> 
> Which will wakeup the vcpu.
> 
> Apart from this oops, which triggers when running:
> taskset -c 1 ./cyclictest -D 3600 -p 99 -t 1 -h 30 -m -n  -i 50000 -b 40
> 
> Timer interruption from housekeeping vcpus is normal to me 
> (without requiring kvm_hlt_in_guest).
> 
> [ 1145.849646] BUG: kernel NULL pointer dereference, address:
> 0000000000000000
> [ 1145.850481] #PF: supervisor instruction fetch in kernel mode
> [ 1145.851161] #PF: error_code(0x0010) - not-present page
> [ 1145.851772] PGD 80000002a9fa5067 P4D 80000002a9fa5067 PUD 2abcbb067
> PMD 0 
> [ 1145.852578] Oops: 0010 [#1] PREEMPT SMP PTI
> [ 1145.853066] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.2.0-rc1+ #11
> [ 1145.853809] Hardware name: Red Hat KVM, BIOS 1.11.0-2.el7 04/01/2014
> [ 1145.854554] RIP: 0010:0x0
> [ 1145.854879] Code: Bad RIP value.
> [ 1145.855270] RSP: 0018:ffffc90001903e68 EFLAGS: 00010013
> [ 1145.855902] RAX: 0000010ac9f60043 RBX: ffff8882b58a8320 RCX:
> 00000000c526b7c4              
> [ 1145.856726] RDX: 0000000000000000 RSI: ffffffff820d9640 RDI:
> ffff8882b58a8320              
> [ 1145.857560] RBP: ffffffff820d9640 R08: 00000000c526b7c4 R09:
> 0000000000000832              
> [ 1145.858390] R10: 0000000000000000 R11: 0000000000000000 R12:
> 0000000000000000              
> [ 1145.859222] R13: ffffffff820d9658 R14: ffff8881063b2880 R15:
> 0000000000000002              
> [ 1145.860047] FS:  0000000000000000(0000) GS:ffff8882b5880000(0000)
> knlGS:0000000000000000   
> [ 1145.860994] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033                              
> [ 1145.861692] CR2: ffffffffffffffd6 CR3: 00000002ab1de001 CR4:
> 0000000000160ee0              
> [ 1145.862570] Call Trace:                                                                    
> [ 1145.862877]  cpuidle_enter_state+0x7c/0x3e0                                                
> [ 1145.863392]  cpuidle_enter+0x29/0x40                                                       
> 
> 
>> I think we can still be woken up from kvm_vcpu_block() if pir is set.
> 
> Exactly.
>
Marcelo Tosatti June 25, 2019, 7 p.m. UTC | #9
On Mon, Jun 24, 2019 at 04:53:53PM +0800, Wanpeng Li wrote:
> On Sat, 22 Jun 2019 at 06:11, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >
> > On Fri, Jun 21, 2019 at 09:42:39AM +0800, Wanpeng Li wrote:
> > > On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > >
> > > > Hi Li,
> > > >
> > > > On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > > > > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > > > > >
> > > > > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > > > > timer fires in dedicated instance scenario.
> > > > > > >
> > > > > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > > > > is really common in recent years. The guest timer interrupt is injected
> > > > > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > > > > timer fires.
> > > > > > >
> > > > > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > > > > >
> > > > > > > w/o patch:
> > > > > > >
> > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > > > > >
> > > > > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > > > > >
> > > > > > > w/ patch:
> > > > > > >
> > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > > > > >
> > > > > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > > > > >
> > > > > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > > > > ---
> > > > > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > > > > >  arch/x86/kvm/lapic.h            |  1 +
> > > > > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > > > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > > > > >  arch/x86/kvm/x86.h              |  2 ++
> > > > > > >  include/linux/sched/isolation.h |  2 ++
> > > > > > >  kernel/sched/isolation.c        |  6 ++++++
> > > > > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > > > > >
> > > > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > > > > index 87ecb56..9ceeee5 100644
> > > > > > > --- a/arch/x86/kvm/lapic.c
> > > > > > > +++ b/arch/x86/kvm/lapic.c
> > > > > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > > > > >       return apic->vcpu->vcpu_id;
> > > > > > >  }
> > > > > > >
> > > > > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > > > > +{
> > > > > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > > > > +}
> > > > > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > > > > >
> > > > > > Paolo, can you explain the reasoning behind this?
> > > > > >
> > > > > > Should not be necessary...
> > >
> > > https://lkml.org/lkml/2019/6/5/436  "Here you need to check
> > > kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
> > > through kvm_apic_expired if the guest needs to be woken up from
> > > kvm_vcpu_block."
> >
> > Ah, i think he means that a sleeping vcpu (in kvm_vcpu_block) must
> > be woken up, if it receives a timer interrupt.
> >
> > But your patch will go through:
> >
> > kvm_apic_inject_pending_timer_irqs
> > __apic_accept_irq ->
> > vmx_deliver_posted_interrupt ->
> > kvm_vcpu_trigger_posted_interrupt returns false
> > (because vcpu->mode != IN_GUEST_MODE) ->
> > kvm_vcpu_kick
> >
> > Which will wakeup the vcpu.
> 
> Hi Marcelo,
> 
> >
> > Apart from this oops, which triggers when running:
> > taskset -c 1 ./cyclictest -D 3600 -p 99 -t 1 -h 30 -m -n  -i 50000 -b 40
> 
> I try both host and guest use latest kvm/queue  w/ CONFIG_PREEMPT
> enabled, and expose mwait as your config, however, there is no oops.
> Can you reproduce steadily or encounter casually? Can you reproduce
> w/o the patchset?

Hi Li,

Steadily.

Do you have this as well:

Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -129,8 +129,7 @@ static inline u32 kvm_x2apic_id(struct k

 bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
 {
-       return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
-               kvm_hlt_in_guest(vcpu->kvm);
+       return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
 }
 EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
Wanpeng Li June 26, 2019, 11:02 a.m. UTC | #10
On Wed, 26 Jun 2019 at 03:03, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> On Mon, Jun 24, 2019 at 04:53:53PM +0800, Wanpeng Li wrote:
> > On Sat, 22 Jun 2019 at 06:11, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > >
> > > On Fri, Jun 21, 2019 at 09:42:39AM +0800, Wanpeng Li wrote:
> > > > On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > >
> > > > > Hi Li,
> > > > >
> > > > > On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > > > > > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > > > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > > > > > >
> > > > > > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > > > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > > > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > > > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > > > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > > > > > timer fires in dedicated instance scenario.
> > > > > > > >
> > > > > > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > > > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > > > > > is really common in recent years. The guest timer interrupt is injected
> > > > > > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > > > > > timer fires.
> > > > > > > >
> > > > > > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > > > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > > > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > > > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > > > > > >
> > > > > > > > w/o patch:
> > > > > > > >
> > > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > > > > > >
> > > > > > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > > > > > >
> > > > > > > > w/ patch:
> > > > > > > >
> > > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > > > > > >
> > > > > > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > > > > > >
> > > > > > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > > > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > > > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > > > > > ---
> > > > > > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > > > > > >  arch/x86/kvm/lapic.h            |  1 +
> > > > > > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > > > > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > > > > > >  arch/x86/kvm/x86.h              |  2 ++
> > > > > > > >  include/linux/sched/isolation.h |  2 ++
> > > > > > > >  kernel/sched/isolation.c        |  6 ++++++
> > > > > > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > > > > > index 87ecb56..9ceeee5 100644
> > > > > > > > --- a/arch/x86/kvm/lapic.c
> > > > > > > > +++ b/arch/x86/kvm/lapic.c
> > > > > > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > > > > > >       return apic->vcpu->vcpu_id;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > > > > > +{
> > > > > > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > > > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > > > > > +}
> > > > > > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > > > > > >
> > > > > > > Paolo, can you explain the reasoning behind this?
> > > > > > >
> > > > > > > Should not be necessary...
> > > >
> > > > https://lkml.org/lkml/2019/6/5/436  "Here you need to check
> > > > kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
> > > > through kvm_apic_expired if the guest needs to be woken up from
> > > > kvm_vcpu_block."
> > >
> > > Ah, i think he means that a sleeping vcpu (in kvm_vcpu_block) must
> > > be woken up, if it receives a timer interrupt.
> > >
> > > But your patch will go through:
> > >
> > > kvm_apic_inject_pending_timer_irqs
> > > __apic_accept_irq ->
> > > vmx_deliver_posted_interrupt ->
> > > kvm_vcpu_trigger_posted_interrupt returns false
> > > (because vcpu->mode != IN_GUEST_MODE) ->
> > > kvm_vcpu_kick
> > >
> > > Which will wakeup the vcpu.
> >
> > Hi Marcelo,
> >
> > >
> > > Apart from this oops, which triggers when running:
> > > taskset -c 1 ./cyclictest -D 3600 -p 99 -t 1 -h 30 -m -n  -i 50000 -b 40
> >
> > I try both host and guest use latest kvm/queue  w/ CONFIG_PREEMPT
> > enabled, and expose mwait as your config, however, there is no oops.
> > Can you reproduce steadily or encounter casually? Can you reproduce
> > w/o the patchset?
>
> Hi Li,

Hi Marcelo,

>
> Steadily.
>
> Do you have this as well:

w/ or w/o below diff, testing on both SKX and HSW servers on hand, I
didn't see any oops. Could you observe the oops disappear when w/o
below diff? If the answer is yes, then the oops will not block to
merge the patchset since Paolo prefers to add the kvm_hlt_in_guest()
condition to guarantee be woken up from kvm_vcpu_block(). For the
exitless injection if the guest is running(DPDK style workloads that
busy-spin on network card) scenarios, we can find a solution later.

Regards,
Wanpeng Li

>
> Index: kvm/arch/x86/kvm/lapic.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.c
> +++ kvm/arch/x86/kvm/lapic.c
> @@ -129,8 +129,7 @@ static inline u32 kvm_x2apic_id(struct k
>
>  bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
>  {
> -       return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> -               kvm_hlt_in_guest(vcpu->kvm);
> +       return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
>  }
>  EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
Marcelo Tosatti June 26, 2019, 4:44 p.m. UTC | #11
On Wed, Jun 26, 2019 at 07:02:13PM +0800, Wanpeng Li wrote:
> On Wed, 26 Jun 2019 at 03:03, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >
> > On Mon, Jun 24, 2019 at 04:53:53PM +0800, Wanpeng Li wrote:
> > > On Sat, 22 Jun 2019 at 06:11, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > >
> > > > On Fri, Jun 21, 2019 at 09:42:39AM +0800, Wanpeng Li wrote:
> > > > > On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > > >
> > > > > > Hi Li,
> > > > > >
> > > > > > On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > > > > > > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > > > > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > > > > > > >
> > > > > > > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > > > > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > > > > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > > > > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > > > > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > > > > > > timer fires in dedicated instance scenario.
> > > > > > > > >
> > > > > > > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > > > > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > > > > > > is really common in recent years. The guest timer interrupt is injected
> > > > > > > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > > > > > > timer fires.
> > > > > > > > >
> > > > > > > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > > > > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > > > > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > > > > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > > > > > > >
> > > > > > > > > w/o patch:
> > > > > > > > >
> > > > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > > > > > > >
> > > > > > > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > > > > > > >
> > > > > > > > > w/ patch:
> > > > > > > > >
> > > > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > > > > > > >
> > > > > > > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > > > > > > >
> > > > > > > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > > > > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > > > > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > > > > > > ---
> > > > > > > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > > > > > > >  arch/x86/kvm/lapic.h            |  1 +
> > > > > > > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > > > > > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > > > > > > >  arch/x86/kvm/x86.h              |  2 ++
> > > > > > > > >  include/linux/sched/isolation.h |  2 ++
> > > > > > > > >  kernel/sched/isolation.c        |  6 ++++++
> > > > > > > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > > > > > > index 87ecb56..9ceeee5 100644
> > > > > > > > > --- a/arch/x86/kvm/lapic.c
> > > > > > > > > +++ b/arch/x86/kvm/lapic.c
> > > > > > > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > > > > > > >       return apic->vcpu->vcpu_id;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > > > > > > +{
> > > > > > > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > > > > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > > > > > > +}
> > > > > > > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > > > > > > >
> > > > > > > > Paolo, can you explain the reasoning behind this?
> > > > > > > >
> > > > > > > > Should not be necessary...
> > > > >
> > > > > https://lkml.org/lkml/2019/6/5/436  "Here you need to check
> > > > > kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
> > > > > through kvm_apic_expired if the guest needs to be woken up from
> > > > > kvm_vcpu_block."
> > > >
> > > > Ah, i think he means that a sleeping vcpu (in kvm_vcpu_block) must
> > > > be woken up, if it receives a timer interrupt.
> > > >
> > > > But your patch will go through:
> > > >
> > > > kvm_apic_inject_pending_timer_irqs
> > > > __apic_accept_irq ->
> > > > vmx_deliver_posted_interrupt ->
> > > > kvm_vcpu_trigger_posted_interrupt returns false
> > > > (because vcpu->mode != IN_GUEST_MODE) ->
> > > > kvm_vcpu_kick
> > > >
> > > > Which will wakeup the vcpu.
> > >
> > > Hi Marcelo,
> > >
> > > >
> > > > Apart from this oops, which triggers when running:
> > > > taskset -c 1 ./cyclictest -D 3600 -p 99 -t 1 -h 30 -m -n  -i 50000 -b 40
> > >
> > > I try both host and guest use latest kvm/queue  w/ CONFIG_PREEMPT
> > > enabled, and expose mwait as your config, however, there is no oops.
> > > Can you reproduce steadily or encounter casually? Can you reproduce
> > > w/o the patchset?
> >
> > Hi Li,
> 
> Hi Marcelo,
> 
> >
> > Steadily.
> >
> > Do you have this as well:
> 
> w/ or w/o below diff, testing on both SKX and HSW servers on hand, I
> didn't see any oops. Could you observe the oops disappear when w/o
> below diff? If the answer is yes, then the oops will not block to
> merge the patchset since Paolo prefers to add the kvm_hlt_in_guest()
> condition to guarantee be woken up from kvm_vcpu_block(). 

He agreed that its not necessary. Removing the HLT in guest widens 
the scope of the patch greatly.

> For the
> exitless injection if the guest is running(DPDK style workloads that
> busy-spin on network card) scenarios, we can find a solution later.

What is the use-case for HLT in guest again?

I'll find the source for the oops (or confirm can't reproduce with 
kvm/queue RSN).
Wanpeng Li June 28, 2019, 8:26 a.m. UTC | #12
On Thu, 27 Jun 2019 at 00:44, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> On Wed, Jun 26, 2019 at 07:02:13PM +0800, Wanpeng Li wrote:
> > On Wed, 26 Jun 2019 at 03:03, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > >
> > > On Mon, Jun 24, 2019 at 04:53:53PM +0800, Wanpeng Li wrote:
> > > > On Sat, 22 Jun 2019 at 06:11, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > >
> > > > > On Fri, Jun 21, 2019 at 09:42:39AM +0800, Wanpeng Li wrote:
> > > > > > On Thu, 20 Jun 2019 at 05:04, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > > > >
> > > > > > > Hi Li,
> > > > > > >
> > > > > > > On Wed, Jun 19, 2019 at 08:36:06AM +0800, Wanpeng Li wrote:
> > > > > > > > On Tue, 18 Jun 2019 at 21:36, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Jun 17, 2019 at 07:24:44PM +0800, Wanpeng Li wrote:
> > > > > > > > > > From: Wanpeng Li <wanpengli@tencent.com>
> > > > > > > > > >
> > > > > > > > > > Dedicated instances are currently disturbed by unnecessary jitter due
> > > > > > > > > > to the emulated lapic timers fire on the same pCPUs which vCPUs resident.
> > > > > > > > > > There is no hardware virtual timer on Intel for guest like ARM. Both
> > > > > > > > > > programming timer in guest and the emulated timer fires incur vmexits.
> > > > > > > > > > This patch tries to avoid vmexit which is incurred by the emulated
> > > > > > > > > > timer fires in dedicated instance scenario.
> > > > > > > > > >
> > > > > > > > > > When nohz_full is enabled in dedicated instances scenario, the emulated
> > > > > > > > > > timers can be offload to the nearest busy housekeeping cpus since APICv
> > > > > > > > > > is really common in recent years. The guest timer interrupt is injected
> > > > > > > > > > by posted-interrupt which is delivered by housekeeping cpu once the emulated
> > > > > > > > > > timer fires.
> > > > > > > > > >
> > > > > > > > > > The host admin should fine tuned, e.g. dedicated instances scenario w/
> > > > > > > > > > nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus
> > > > > > > > > > for busy housekeeping, disable mwait/hlt/pause vmexits to keep in non-root
> > > > > > > > > > mode, ~3% redis performance benefit can be observed on Skylake server.
> > > > > > > > > >
> > > > > > > > > > w/o patch:
> > > > > > > > > >
> > > > > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time   Avg time
> > > > > > > > > >
> > > > > > > > > > EXTERNAL_INTERRUPT    42916    49.43%   39.30%   0.47us   106.09us   0.71us ( +-   1.09% )
> > > > > > > > > >
> > > > > > > > > > w/ patch:
> > > > > > > > > >
> > > > > > > > > >             VM-EXIT  Samples  Samples%  Time%   Min Time  Max Time         Avg time
> > > > > > > > > >
> > > > > > > > > > EXTERNAL_INTERRUPT    6871     9.29%     2.96%   0.44us    57.88us   0.72us ( +-   4.02% )
> > > > > > > > > >
> > > > > > > > > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > > > > > > > > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > > > > > > > > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > > > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > > > > > > > > > ---
> > > > > > > > > >  arch/x86/kvm/lapic.c            | 33 ++++++++++++++++++++++++++-------
> > > > > > > > > >  arch/x86/kvm/lapic.h            |  1 +
> > > > > > > > > >  arch/x86/kvm/vmx/vmx.c          |  3 ++-
> > > > > > > > > >  arch/x86/kvm/x86.c              |  5 +++++
> > > > > > > > > >  arch/x86/kvm/x86.h              |  2 ++
> > > > > > > > > >  include/linux/sched/isolation.h |  2 ++
> > > > > > > > > >  kernel/sched/isolation.c        |  6 ++++++
> > > > > > > > > >  7 files changed, 44 insertions(+), 8 deletions(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > > > > > > > index 87ecb56..9ceeee5 100644
> > > > > > > > > > --- a/arch/x86/kvm/lapic.c
> > > > > > > > > > +++ b/arch/x86/kvm/lapic.c
> > > > > > > > > > @@ -122,6 +122,13 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
> > > > > > > > > >       return apic->vcpu->vcpu_id;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
> > > > > > > > > > +{
> > > > > > > > > > +     return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> > > > > > > > > > +             kvm_hlt_in_guest(vcpu->kvm);
> > > > > > > > > > +}
> > > > > > > > > > +EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
> > > > > > > > >
> > > > > > > > > Paolo, can you explain the reasoning behind this?
> > > > > > > > >
> > > > > > > > > Should not be necessary...
> > > > > >
> > > > > > https://lkml.org/lkml/2019/6/5/436  "Here you need to check
> > > > > > kvm_halt_in_guest, not kvm_mwait_in_guest, because you need to go
> > > > > > through kvm_apic_expired if the guest needs to be woken up from
> > > > > > kvm_vcpu_block."
> > > > >
> > > > > Ah, i think he means that a sleeping vcpu (in kvm_vcpu_block) must
> > > > > be woken up, if it receives a timer interrupt.
> > > > >
> > > > > But your patch will go through:
> > > > >
> > > > > kvm_apic_inject_pending_timer_irqs
> > > > > __apic_accept_irq ->
> > > > > vmx_deliver_posted_interrupt ->
> > > > > kvm_vcpu_trigger_posted_interrupt returns false
> > > > > (because vcpu->mode != IN_GUEST_MODE) ->
> > > > > kvm_vcpu_kick
> > > > >
> > > > > Which will wakeup the vcpu.
> > > >
> > > > Hi Marcelo,
> > > >
> > > > >
> > > > > Apart from this oops, which triggers when running:
> > > > > taskset -c 1 ./cyclictest -D 3600 -p 99 -t 1 -h 30 -m -n  -i 50000 -b 40
> > > >
> > > > I try both host and guest use latest kvm/queue  w/ CONFIG_PREEMPT
> > > > enabled, and expose mwait as your config, however, there is no oops.
> > > > Can you reproduce steadily or encounter casually? Can you reproduce
> > > > w/o the patchset?
> > >
> > > Hi Li,
> >
> > Hi Marcelo,
> >
> > >
> > > Steadily.
> > >
> > > Do you have this as well:
> >
> > w/ or w/o below diff, testing on both SKX and HSW servers on hand, I
> > didn't see any oops. Could you observe the oops disappear when w/o
> > below diff? If the answer is yes, then the oops will not block to
> > merge the patchset since Paolo prefers to add the kvm_hlt_in_guest()
> > condition to guarantee be woken up from kvm_vcpu_block().
>
> He agreed that its not necessary. Removing the HLT in guest widens
> the scope of the patch greatly.
>
> > For the
> > exitless injection if the guest is running(DPDK style workloads that
> > busy-spin on network card) scenarios, we can find a solution later.
>
> What is the use-case for HLT in guest again?

Together w/ mwait/hlt/pause no vmexits for dedicated instances. In
addition, hlt in guest will disable pv qspinlock which is not optimal
for dedicated instance. Refer to commit
b2798ba0b876 (KVM: X86: Choose qspinlock when dedicated physical CPUs
are available) and commit caa057a2cad64 (KVM: X86: Provide a
capability to disable HLT intercepts).

>
> I'll find the source for the oops (or confirm can't reproduce with
> kvm/queue RSN).

Looking forward to it, thanks Marcelo, hope we can catch the upcoming
merge window. :)

Regards,
Wanpeng Li
diff mbox series

Patch

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 87ecb56..9ceeee5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -122,6 +122,13 @@  static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 	return apic->vcpu->vcpu_id;
 }
 
+bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu)
+{
+	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+		kvm_hlt_in_guest(vcpu->kvm);
+}
+EXPORT_SYMBOL_GPL(posted_interrupt_inject_timer);
+
 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 		u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
 	switch (map->mode) {
@@ -1432,6 +1439,19 @@  static void apic_update_lvtt(struct kvm_lapic *apic)
 	}
 }
 
+static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
+{
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+
+	kvm_apic_local_deliver(apic, APIC_LVTT);
+	if (apic_lvtt_tscdeadline(apic))
+		ktimer->tscdeadline = 0;
+	if (apic_lvtt_oneshot(apic)) {
+		ktimer->tscdeadline = 0;
+		ktimer->target_expiration = 0;
+	}
+}
+
 static void apic_timer_expired(struct kvm_lapic *apic)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1441,6 +1461,11 @@  static void apic_timer_expired(struct kvm_lapic *apic)
 	if (atomic_read(&apic->lapic_timer.pending))
 		return;
 
+	if (posted_interrupt_inject_timer(apic->vcpu)) {
+		kvm_apic_inject_pending_timer_irqs(apic);
+		return;
+	}
+
 	atomic_inc(&apic->lapic_timer.pending);
 	kvm_set_pending_timer(vcpu);
 
@@ -2373,13 +2398,7 @@  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (atomic_read(&apic->lapic_timer.pending) > 0) {
-		kvm_apic_local_deliver(apic, APIC_LVTT);
-		if (apic_lvtt_tscdeadline(apic))
-			apic->lapic_timer.tscdeadline = 0;
-		if (apic_lvtt_oneshot(apic)) {
-			apic->lapic_timer.tscdeadline = 0;
-			apic->lapic_timer.target_expiration = 0;
-		}
+		kvm_apic_inject_pending_timer_irqs(apic);
 		atomic_set(&apic->lapic_timer.pending, 0);
 	}
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 3674717..e41936b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -236,6 +236,7 @@  void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8fbea03..f45c51e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7059,7 +7059,8 @@  static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
 	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
 
-	if (kvm_mwait_in_guest(vcpu->kvm))
+	if (kvm_mwait_in_guest(vcpu->kvm) ||
+		posted_interrupt_inject_timer(vcpu))
 		return -EOPNOTSUPP;
 
 	vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9450a16..dae18f7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@ 
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
 
 #include <trace/events/kvm.h>
@@ -155,6 +156,9 @@  EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 static bool __read_mostly force_emulation_prefix = false;
 module_param(force_emulation_prefix, bool, S_IRUGO);
 
+bool __read_mostly pi_inject_timer = 0;
+module_param(pi_inject_timer, bool, S_IRUGO | S_IWUSR);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -7032,6 +7036,7 @@  int kvm_arch_init(void *opaque)
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	kvm_lapic_init();
+	pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
 #ifdef CONFIG_X86_64
 	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e08a128..10b26f4 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -301,6 +301,8 @@  extern unsigned int min_timer_period_us;
 
 extern bool enable_vmware_backdoor;
 
+extern bool pi_inject_timer;
+
 extern struct static_key kvm_no_apic_vcpu;
 
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index b0fb144..6fc5407 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -19,6 +19,7 @@  enum hk_flags {
 DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
 extern int housekeeping_any_cpu(enum hk_flags flags);
 extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
+extern bool housekeeping_enabled(enum hk_flags flags);
 extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
 extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
 extern void __init housekeeping_init(void);
@@ -38,6 +39,7 @@  static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 static inline void housekeeping_affine(struct task_struct *t,
 				       enum hk_flags flags) { }
 static inline void housekeeping_init(void) { }
+static inline bool housekeeping_enabled(enum hk_flags flags) { }
 #endif /* CONFIG_CPU_ISOLATION */
 
 static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 123ea07..ccb2808 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -14,6 +14,12 @@  EXPORT_SYMBOL_GPL(housekeeping_overridden);
 static cpumask_var_t housekeeping_mask;
 static unsigned int housekeeping_flags;
 
+bool housekeeping_enabled(enum hk_flags flags)
+{
+	return !!(housekeeping_flags & flags);
+}
+EXPORT_SYMBOL_GPL(housekeeping_enabled);
+
 int housekeeping_any_cpu(enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))