Revert "KVM: x86: Unconditionally enable irqs in guest context"

Message ID 20210105192844.296277-1-nitesh@redhat.com (mailing list archive)
State New, archived
Series Revert "KVM: x86: Unconditionally enable irqs in guest context"

Commit Message

Nitesh Narayan Lal Jan. 5, 2021, 7:28 p.m. UTC
This reverts commit d7a08882a0a4b4e176691331ee3f492996579534.

After the introduction of the patch:

	87fa7f3e9: x86/kvm: Move context tracking where it belongs

guest_exit_irqoff() is now called much closer to VM-Exit, so explicitly
enabling IRQs to process pending interrupts should no longer be required
within vcpu_enter_guest().

Conflicts:
	arch/x86/kvm/svm.c

Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
---
 arch/x86/kvm/svm/svm.c |  9 +++++++++
 arch/x86/kvm/x86.c     | 11 -----------
 2 files changed, 9 insertions(+), 11 deletions(-)

Comments

Wanpeng Li Jan. 6, 2021, 12:42 a.m. UTC | #1
On Wed, 6 Jan 2021 at 06:30, Nitesh Narayan Lal <nitesh@redhat.com> wrote:
>
> This reverts commit d7a08882a0a4b4e176691331ee3f492996579534.
>
> After the introduction of the patch:
>
>         87fa7f3e9: x86/kvm: Move context tracking where it belongs
>
> since we have moved guest_exit_irqoff closer to the VM-Exit, explicit
> enabling of irqs to process pending interrupts should not be required
> within vcpu_enter_guest anymore.
>
> Conflicts:
>         arch/x86/kvm/svm.c
>
> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> ---
>  arch/x86/kvm/svm/svm.c |  9 +++++++++
>  arch/x86/kvm/x86.c     | 11 -----------
>  2 files changed, 9 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cce0143a6f80..c9b2fbb32484 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
>
>  static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
>  {
> +       kvm_before_interrupt(vcpu);
> +       local_irq_enable();
> +       /*
> +        * We must have an instruction with interrupts enabled, so
> +        * the timer interrupt isn't delayed by the interrupt shadow.
> +        */
> +       asm("nop");
> +       local_irq_disable();
> +       kvm_after_interrupt(vcpu);
>  }

Why do we need to reintroduce this part?

    Wanpeng
Sean Christopherson Jan. 6, 2021, 12:47 a.m. UTC | #2
+tglx

On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote:
> This reverts commit d7a08882a0a4b4e176691331ee3f492996579534.
> 
> After the introduction of the patch:
> 
> 	87fa7f3e9: x86/kvm: Move context tracking where it belongs
> 
> since we have moved guest_exit_irqoff closer to the VM-Exit, explicit
> enabling of irqs to process pending interrupts should not be required
> within vcpu_enter_guest anymore.

Ugh, except that commit completely broke tick-based accounting, on both Intel
and AMD.  With guest_exit_irqoff() being called immediately after VM-Exit, any
tick that happens after IRQs are disabled will be accounted to the host.  E.g.
on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't
processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has been
cleared.

CONFIG_VIRT_CPU_ACCOUNTING_GEN=y should still work (I didn't bother to verify).

Thomas, any clever ideas?  Handling IRQs in {vmx,svm}_vcpu_enter_exit() isn't an
option as KVM hasn't restored enough state to handle an IRQ, e.g. PKRU and XCR0
are still guest values.  Is it too heinous to fudge PF_VCPU across KVM's
"pending" IRQ handling?  E.g. this god-awful hack fixes the accounting:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 836912b42030..5a777fd35b4b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9028,6 +9028,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
 
+       current->flags |= PF_VCPU;
        kvm_x86_ops.handle_exit_irqoff(vcpu);
 
        /*
@@ -9042,6 +9043,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        ++vcpu->stat.exits;
        local_irq_disable();
        kvm_after_interrupt(vcpu);
+       current->flags &= ~PF_VCPU;
 
        if (lapic_in_kernel(vcpu)) {
                s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;

> Conflicts:
> 	arch/x86/kvm/svm.c
> 
> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> ---
>  arch/x86/kvm/svm/svm.c |  9 +++++++++
>  arch/x86/kvm/x86.c     | 11 -----------
>  2 files changed, 9 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cce0143a6f80..c9b2fbb32484 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
>  
>  static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
>  {
> +	kvm_before_interrupt(vcpu);
> +	local_irq_enable();
> +	/*
> +	 * We must have an instruction with interrupts enabled, so
> +	 * the timer interrupt isn't delayed by the interrupt shadow.
> +	 */
> +	asm("nop");
> +	local_irq_disable();
> +	kvm_after_interrupt(vcpu);
>  }
>  
>  static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3f7c1fc7a3ce..3e17c9ffcad8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9023,18 +9023,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	kvm_x86_ops.handle_exit_irqoff(vcpu);
>  
> -	/*
> -	 * Consume any pending interrupts, including the possible source of
> -	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
> -	 * An instruction is required after local_irq_enable() to fully unblock
> -	 * interrupts on processors that implement an interrupt shadow, the
> -	 * stat.exits increment will do nicely.
> -	 */
> -	kvm_before_interrupt(vcpu);
> -	local_irq_enable();
>  	++vcpu->stat.exits;
> -	local_irq_disable();
> -	kvm_after_interrupt(vcpu);
>  
>  	if (lapic_in_kernel(vcpu)) {
>  		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
> -- 
> 2.27.0
>
Nitesh Narayan Lal Jan. 6, 2021, 1:35 a.m. UTC | #3
On 1/5/21 7:47 PM, Sean Christopherson wrote:
> +tglx
>
> On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote:
>> This reverts commit d7a08882a0a4b4e176691331ee3f492996579534.
>>
>> After the introduction of the patch:
>>
>> 	87fa7f3e9: x86/kvm: Move context tracking where it belongs
>>
>> since we have moved guest_exit_irqoff closer to the VM-Exit, explicit
>> enabling of irqs to process pending interrupts should not be required
>> within vcpu_enter_guest anymore.
> Ugh, except that commit completely broke tick-based accounting, on both Intel
> and AMD.

I did notice some discrepancies in the system time reported after the
introduction of this patch, but I wrongly concluded that the behavior was correct.

I reported this yesterday [1], but I think I added your old email address in
that thread (sorry about that).

>   With guest_exit_irqoff() being called immediately after VM-Exit, any
> tick that happens after IRQs are disabled will be accounted to the host.  E.g.
> on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't
> processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has been
> cleared.

Right, that also explains the higher system time reported by the cpuacct stats.

>
> CONFIG_VIRT_CPU_ACCOUNTING_GEN=y should still work (I didn't bother to verify).

For the cpuacct stats that I have shared in the other thread, this config was
enabled.

>
> Thomas, any clever ideas?  Handling IRQs in {vmx,svm}_vcpu_enter_exit() isn't an
> option as KVM hasn't restored enough state to handle an IRQ, e.g. PKRU and XCR0
> are still guest values.  Is it too heinous to fudge PF_VCPU across KVM's
> "pending" IRQ handling?  E.g. this god-awful hack fixes the accounting:
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 836912b42030..5a777fd35b4b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9028,6 +9028,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         vcpu->mode = OUTSIDE_GUEST_MODE;
>         smp_wmb();
>  
> +       current->flags |= PF_VCPU;
>         kvm_x86_ops.handle_exit_irqoff(vcpu);
>  
>         /*
> @@ -9042,6 +9043,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         ++vcpu->stat.exits;
>         local_irq_disable();
>         kvm_after_interrupt(vcpu);
> +       current->flags &= ~PF_VCPU;
>  
>         if (lapic_in_kernel(vcpu)) {
>                 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
>

I can give this a try.
What is the right way to test this (via cpuacct stats maybe)?

[1] https://lore.kernel.org/lkml/12a1b9d4-8534-e23a-6bbd-736474928e6b@redhat.com/
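
(A minimal sketch of one way to check this: sample cpuacct.stat for the cgroup
that holds the vCPU threads before and after a test run and compare the
user/system split. The cgroup v1 layout and the "machine" cgroup name below are
assumptions; adjust them for the actual setup.)

#include <stdio.h>

int main(void)
{
	/* Assumed path: cgroup v1 cpuacct hierarchy, cgroup named "machine". */
	const char *path = "/sys/fs/cgroup/cpuacct/machine/cpuacct.stat";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Prints "user <ticks>" and "system <ticks>" in USER_HZ units. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}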
Vitaly Kuznetsov Jan. 6, 2021, 10:09 a.m. UTC | #4
Nitesh Narayan Lal <nitesh@redhat.com> writes:

> This reverts commit d7a08882a0a4b4e176691331ee3f492996579534.
>
> After the introduction of the patch:
>
> 	87fa7f3e9: x86/kvm: Move context tracking where it belongs
>
> since we have moved guest_exit_irqoff closer to the VM-Exit, explicit
> enabling of irqs to process pending interrupts should not be required
> within vcpu_enter_guest anymore.
>
> Conflicts:
> 	arch/x86/kvm/svm.c
>
> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> ---
>  arch/x86/kvm/svm/svm.c |  9 +++++++++
>  arch/x86/kvm/x86.c     | 11 -----------
>  2 files changed, 9 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cce0143a6f80..c9b2fbb32484 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
>  
>  static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
>  {
> +	kvm_before_interrupt(vcpu);
> +	local_irq_enable();
> +	/*
> +	 * We must have an instruction with interrupts enabled, so
> +	 * the timer interrupt isn't delayed by the interrupt shadow.
> +	 */
> +	asm("nop");
> +	local_irq_disable();
> +	kvm_after_interrupt(vcpu);
>  }
>  
>  static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3f7c1fc7a3ce..3e17c9ffcad8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9023,18 +9023,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	kvm_x86_ops.handle_exit_irqoff(vcpu);
>  
> -	/*
> -	 * Consume any pending interrupts, including the possible source of
> -	 * VM-Exit on SVM 

I kind of liked this part of the comment; the new (old) one in
svm_handle_exit_irqoff() doesn't actually explain what's going on.

> and any ticks that occur between VM-Exit and now.

Looking back, I don't quite understand why we wanted to account ticks
between vmexit and exiting guest context as 'guest' in the first place;
to my understanding 'guest time' is time spent within VMX non-root
operation, and the rest is KVM overhead (system). It seems to match how the
accounting is done nowadays after Tglx's 87fa7f3e98a1 ("x86/kvm: Move
context tracking where it belongs").

> -	 * An instruction is required after local_irq_enable() to fully unblock
> -	 * interrupts on processors that implement an interrupt shadow, the
> -	 * stat.exits increment will do nicely.
> -	 */
> -	kvm_before_interrupt(vcpu);
> -	local_irq_enable();
>  	++vcpu->stat.exits;
> -	local_irq_disable();
> -	kvm_after_interrupt(vcpu);
>  
>  	if (lapic_in_kernel(vcpu)) {
>  		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;

FWIW,

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Sean Christopherson Jan. 6, 2021, 5:11 p.m. UTC | #5
On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote:
> Nitesh Narayan Lal <nitesh@redhat.com> writes:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 3f7c1fc7a3ce..3e17c9ffcad8 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -9023,18 +9023,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  
> >  	kvm_x86_ops.handle_exit_irqoff(vcpu);
> >  
> > -	/*
> > -	 * Consume any pending interrupts, including the possible source of
> > -	 * VM-Exit on SVM 
> 
> I kind of liked this part of the comment, the new (old) one in
> svm_handle_exit_irqoff() doesn't actually explain what's going on.
> 
> > and any ticks that occur between VM-Exit and now.
> 
> Looking back, I don't quite understand why we wanted to account ticks
> between vmexit and exiting guest context as 'guest' in the first place;
> to my understanding 'guest time' is time spent within VMX non-root
> operation, the rest is KVM overhead (system).

With tick-based accounting, if the tick IRQ is received after PF_VCPU is cleared
then that tick will be accounted to the host/system.  The motivation for opening
an IRQ window after VM-Exit is to handle the case where the guest is constantly
exiting for a different reason _just_ before the tick arrives, e.g. if the guest
has its tick configured such that the guest and host ticks get synchronized
in a bad way.

This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least with a
stable TSC, as the accounting happens during guest_exit_irqoff() itself.
Accounting might be less-than-stellar if TSC is unstable, but I don't think it
would be as binary of a failure as tick-based accounting.
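
(A minimal, self-contained sketch of the decision described above: under
tick-based accounting, whether a tick is charged to the guest or to the host
hinges on whether PF_VCPU is still set on the vCPU task when the tick IRQ is
finally serviced. The task layout and flag value below are illustrative, not
the kernel's own code.)

#include <stdio.h>

#define PF_VCPU 0x1	/* illustrative flag bit */

struct task {
	unsigned int flags;
	unsigned long guest_ticks;
	unsigned long system_ticks;
};

/* A tick is charged to the guest only if PF_VCPU is still set when it lands. */
static void account_tick(struct task *t)
{
	if (t->flags & PF_VCPU)
		t->guest_ticks++;
	else
		t->system_ticks++;
}

int main(void)
{
	struct task vcpu_task = { 0, 0, 0 };

	/* Tick serviced while the task is still marked as running guest code. */
	vcpu_task.flags |= PF_VCPU;
	account_tick(&vcpu_task);

	/*
	 * guest_exit_irqoff() already ran right after VM-Exit, so PF_VCPU is
	 * clear by the time the pending tick is serviced; it is charged to
	 * the host/system instead.
	 */
	vcpu_task.flags &= ~PF_VCPU;
	account_tick(&vcpu_task);

	printf("guest=%lu system=%lu\n", vcpu_task.guest_ticks, vcpu_task.system_ticks);
	return 0;
}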

> It seems to match how the accounting is done nowadays after Tglx's
> 87fa7f3e98a1 ("x86/kvm: Move context tracking where it belongs").
> 
> > -	 * An instruction is required after local_irq_enable() to fully unblock
> > -	 * interrupts on processors that implement an interrupt shadow, the
> > -	 * stat.exits increment will do nicely.
> > -	 */
> > -	kvm_before_interrupt(vcpu);
> > -	local_irq_enable();
> >  	++vcpu->stat.exits;
> > -	local_irq_disable();
> > -	kvm_after_interrupt(vcpu);
> >  
> >  	if (lapic_in_kernel(vcpu)) {
> >  		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
> 
> FWIW,
> 
> Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> 
> -- 
> Vitaly
>
Vitaly Kuznetsov Jan. 7, 2021, 9:33 a.m. UTC | #6
Sean Christopherson <seanjc@google.com> writes:

> On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote:
>> 
>> Looking back, I don't quite understand why we wanted to account ticks
>> between vmexit and exiting guest context as 'guest' in the first place;
>> to my understanding 'guest time' is time spent within VMX non-root
>> operation, the rest is KVM overhead (system).
>
> With tick-based accounting, if the tick IRQ is received after PF_VCPU is cleared
> then that tick will be accounted to the host/system.  The motivation for opening
> an IRQ window after VM-Exit is to handle the case where the guest is constantly
> exiting for a different reason _just_ before the tick arrives, e.g. if the guest
> has its tick configured such that the guest and host ticks get synchronized
> in a bad way.
>
> This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least with a
> stable TSC, as the accounting happens during guest_exit_irqoff() itself.
> Accounting might be less-than-stellar if TSC is unstable, but I don't think it
> would be as binary of a failure as tick-based accounting.
>

Oh, yea, I vaguely remember we had to deal with a very similar problem
but for userspace/kernel accounting. It was possible to observe e.g. a
userspace task going 100% kernel while in reality it was just perfectly
synchronized with the tick and doing a syscall just before it arrives
(or something like that, I may be misremembering the details).

So depending on the frequency, it is probably possible to e.g. observe
'100% host' with tick-based accounting: the guest just has to
synchronize exiting to KVM in a way that the tick will always arrive
past guest_exit_irqoff().

It seems to me this is a fundamental problem whenever the frequency of
guest exits can match the frequency of the time-accounting tick.
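
(A toy simulation of that pathological synchronization, with made-up numbers:
the host tick fires every 1000us, and in each iteration the guest runs for
950us and then spends 50us handling an exit with PF_VCPU clear. Every tick
lands in the exit window, so tick-based accounting reports 100% host time even
though the guest used 95% of the CPU.)

#include <stdio.h>

int main(void)
{
	const long period_us = 1000;	/* tick period */
	const long guest_us  = 950;	/* time in guest per iteration (PF_VCPU set) */
	const long exit_us   = 50;	/* exit handling per iteration (PF_VCPU clear) */
	long guest_ticks = 0, host_ticks = 0;
	long now = 0, next_tick = period_us;

	for (int i = 0; i < 1000; i++) {
		now += guest_us;		/* guest runs; ticks here would hit the guest */
		while (next_tick <= now) {
			guest_ticks++;
			next_tick += period_us;
		}
		now += exit_us;			/* exit handling; ticks here hit the host */
		while (next_tick <= now) {
			host_ticks++;
			next_tick += period_us;
		}
	}
	printf("guest ticks: %ld, host ticks: %ld\n", guest_ticks, host_ticks);
	return 0;
}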
Wanpeng Li Jan. 7, 2021, 9:41 a.m. UTC | #7
On Thu, 7 Jan 2021 at 17:35, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>
> Sean Christopherson <seanjc@google.com> writes:
>
> > On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote:
> >>
> >> Looking back, I don't quite understand why we wanted to account ticks
> >> between vmexit and exiting guest context as 'guest' in the first place;
> >> to my understanding 'guest time' is time spent within VMX non-root
> >> operation, the rest is KVM overhead (system).
> >
> > With tick-based accounting, if the tick IRQ is received after PF_VCPU is cleared
> > then that tick will be accounted to the host/system.  The motivation for opening
> > an IRQ window after VM-Exit is to handle the case where the guest is constantly
> > exiting for a different reason _just_ before the tick arrives, e.g. if the guest
> > has its tick configured such that the guest and host ticks get synchronized
> > in a bad way.
> >
> > This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least with a
> > stable TSC, as the accounting happens during guest_exit_irqoff() itself.
> > Accounting might be less-than-stellar if TSC is unstable, but I don't think it
> > would be as binary of a failure as tick-based accounting.
> >
>
> Oh, yea, I vaguely remember we had to deal with a very similar problem
> but for userspace/kernel accounting. It was possible to observe e.g. a
> userspace task going 100% kernel while in reality it was just perfectly
> synchronized with the tick and doing a syscall just before it arrives
> (or something like that, I may be misremembering the details).

Yes. :)  commit 2a42eb9594a1 ("sched/cputime: Accumulate vtime on top
of nsec clocksource")

> So depending on the frequency, it is probably possible to e.g observe
> '100% host' with tick based accounting, the guest just has to
> synchronize exiting to KVM in a way that the tick will always arrive
> past guest_exit_irqoff().
>
> It seems to me this is a fundamental problem in case the frequency of
> guest exits can match the frequency of the time accounting tick.
>
> --
> Vitaly
>
Xinlong Lin Jan. 7, 2021, 10:55 a.m. UTC | #8
On 2021-01-07 at 01:11, Sean Christopherson wrote:
>On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote:
>> Nitesh Narayan Lal <nitesh@redhat.com> writes:
>> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> > index 3f7c1fc7a3ce..3e17c9ffcad8 100644
>> > --- a/arch/x86/kvm/x86.c
>> > +++ b/arch/x86/kvm/x86.c
>> > @@ -9023,18 +9023,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>> > 
>> >  kvm_x86_ops.handle_exit_irqoff(vcpu);
>> > 
>> > -	/*
>> > -	* Consume any pending interrupts, including the possible source of
>> > -	* VM-Exit on SVM
>>
>> I kind of liked this part of the comment, the new (old) one in
>> svm_handle_exit_irqoff() doesn't actually explain what's going on.
>>
>> > and any ticks that occur between VM-Exit and now.
>>
>> Looking back, I don't quite understand why we wanted to account ticks
>> between vmexit and exiting guest context as 'guest' in the first place;
>> to my understanding 'guest time' is time spent within VMX non-root
>> operation, the rest is KVM overhead (system).
>
>With tick-based accounting, if the tick IRQ is received after PF_VCPU is cleared
>then that tick will be accounted to the host/system.  The motivation for opening
>an IRQ window after VM-Exit is to handle the case where the guest is constantly
>exiting for a different reason _just_ before the tick arrives, e.g. if the guest
>has its tick configured such that the guest and host ticks get synchronized
>in a bad way.
>
>This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least with a
>stable TSC, as the accounting happens during guest_exit_irqoff() itself.
>Accounting might be less-than-stellar if TSC is unstable, but I don't think it
>would be as binary of a failure as tick-based accounting. 

If I don't specify "nohz_full" on the boot command line when using
CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, will the problem still exist?

>
>> It seems to match how the accounting is done nowadays after Tglx's
>> 87fa7f3e98a1 ("x86/kvm: Move context tracking where it belongs").
>>
>> > -	* An instruction is required after local_irq_enable() to fully unblock
>> > -	* interrupts on processors that implement an interrupt shadow, the
>> > -	* stat.exits increment will do nicely.
>> > -	*/
>> > -	kvm_before_interrupt(vcpu);
>> > -	local_irq_enable();
>> >  ++vcpu->stat.exits;
>> > -	local_irq_disable();
>> > -	kvm_after_interrupt(vcpu);
>> > 
>> >  if (lapic_in_kernel(vcpu)) {
>> >  s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
>>
>> FWIW,
>>
>> Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>>
>> --
>> Vitaly
>>
Nitesh Narayan Lal Jan. 12, 2021, 9:43 p.m. UTC | #9
On 1/7/21 4:33 AM, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
>
>> On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote:
>>> Looking back, I don't quite understand why we wanted to account ticks
>>> between vmexit and exiting guest context as 'guest' in the first place;
>>> to my understanding 'guest time' is time spent within VMX non-root
>>> operation, the rest is KVM overhead (system).
>> With tick-based accounting, if the tick IRQ is received after PF_VCPU is cleared
>> then that tick will be accounted to the host/system.  The motivation for opening
>> an IRQ window after VM-Exit is to handle the case where the guest is constantly
>> exiting for a different reason _just_ before the tick arrives, e.g. if the guest
>> has its tick configured such that the guest and host ticks get synchronized
>> in a bad way.
>>
>> This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least with a
>> stable TSC, as the accounting happens during guest_exit_irqoff() itself.
>> Accounting might be less-than-stellar if TSC is unstable, but I don't think it
>> would be as binary of a failure as tick-based accounting.
>>
> Oh, yea, I vaguely remember we had to deal with a very similar problem
> but for userspace/kernel accounting. It was possible to observe e.g. a
> userspace task going 100% kernel while in reality it was just perfectly
> synchronized with the tick and doing a syscall just before it arrives
> (or something like that, I may be misremembering the details).
>
> So depending on the frequency, it is probably possible to e.g observe
> '100% host' with tick based accounting, the guest just has to
> synchronize exiting to KVM in a way that the tick will always arrive
> past guest_exit_irqoff().
>
> It seems to me this is a fundamental problem in case the frequency of
> guest exits can match the frequency of the time accounting tick.
>

Just to make sure that I am understanding things correctly.
There are two issues:
1. The first issue is with the tick IRQs that arrive after PF_VCPU is
   cleared, as they are then accounted to the system context, at least on
   setups where CONFIG_VIRT_CPU_ACCOUNTING_GEN is not enabled. With the
   patch "KVM: x86: Unconditionally enable irqs in guest context", we are
   at least taking care of the scenario where the guest context is exiting
   constantly just before the arrival of the tick.
 
2. The second issue that Sean mentioned was introduced by moving
   guest_exit_irqoff() closer to VM-Exit. Due to this change, any ticks that
   happen after IRQs are disabled are incorrectly accounted to the system
   context. This is because we exit the guest context early, without
   ensuring that the state required to handle IRQs has been restored.
 
So, the increase in the system time (reported by cpuacct.stats) that I was
observing is not entirely correct after all.
Am I missing anything here?

--
Thanks
Nitesh
Sean Christopherson Jan. 12, 2021, 10:04 p.m. UTC | #10
On Tue, Jan 12, 2021, Nitesh Narayan Lal wrote:
> 
> On 1/7/21 4:33 AM, Vitaly Kuznetsov wrote:
> > Sean Christopherson <seanjc@google.com> writes:
> >
> >> On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote:
> >>> Looking back, I don't quite understand why we wanted to account ticks
> >>> between vmexit and exiting guest context as 'guest' in the first place;
> >>> to my understanding 'guest time' is time spent within VMX non-root
> >>> operation, the rest is KVM overhead (system).
> >> With tick-based accounting, if the tick IRQ is received after PF_VCPU is cleared
> >> then that tick will be accounted to the host/system.  The motivation for opening
> >> an IRQ window after VM-Exit is to handle the case where the guest is constantly
> >> exiting for a different reason _just_ before the tick arrives, e.g. if the guest
> >> has its tick configured such that the guest and host ticks get synchronized
> >> in a bad way.
> >>
> >> This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least with a
> >> stable TSC, as the accounting happens during guest_exit_irqoff() itself.
> >> Accounting might be less-than-stellar if TSC is unstable, but I don't think it
> >> would be as binary of a failure as tick-based accounting.
> >>
> > Oh, yea, I vaguely remember we had to deal with a very similar problem
> > but for userspace/kernel accounting. It was possible to observe e.g. a
> > userspace task going 100% kernel while in reality it was just perfectly
> > synchronized with the tick and doing a syscall just before it arrives
> > (or something like that, I may be misremembering the details).
> >
> > So depending on the frequency, it is probably possible to e.g observe
> > '100% host' with tick based accounting, the guest just has to
> > synchronize exiting to KVM in a way that the tick will always arrive
> > past guest_exit_irqoff().
> >
> > It seems to me this is a fundamental problem in case the frequency of
> > guest exits can match the frequency of the time accounting tick.
> >
> 
> Just to make sure that I am understanding things correctly.
> There are two issues:
> 1. The first issue is with the tick IRQs that arrive after PF_VCPU is
>    cleared as they are then accounted into the system context at least on
>    the setup where CONFIG_VIRT_CPU_ACCOUNTING_GEN is not enabled. With the
>    patch "KVM: x86: Unconditionally enable irqs in guest context", we are
>    at least taking care of the scenario where the guest context is exiting
>    constantly just before the arrival of the tick.

Yep.

> 2. The second issue that Sean mentioned was introduced because of moving
>    guest_exit_irqoff() closer to VM-exit. Due to this change, any ticks that
>    happen after IRQs are disabled are incorrectly accounted into the system
>    context. This is because we exit the guest context early without
>    ensuring if the required guest states to handle IRQs are restored.

Yep.

> So, the increase in the system time (reported by cpuacct.stats) that I was
> observing is not entirely correct after all.

It's correct, but iff CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, as that doesn't rely on
ticks and so closer to VM-Enter is better.  The problem is that it completely
breaks CONFIG_VIRT_CPU_ACCOUNTING_GEN=n (#2 above) because KVM will never
service an IRQ, ticks included, with PF_VCPU set.

> Am I missing anything here?
Wanpeng Li Jan. 15, 2021, 3:20 a.m. UTC | #11
On Wed, 6 Jan 2021 at 08:51, Sean Christopherson <seanjc@google.com> wrote:
>
> +tglx
>
> On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote:
> > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534.
> >
> > After the introduction of the patch:
> >
> >       87fa7f3e9: x86/kvm: Move context tracking where it belongs
> >
> > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit
> > enabling of irqs to process pending interrupts should not be required
> > within vcpu_enter_guest anymore.
>
> Ugh, except that commit completely broke tick-based accounting, on both Intel
> and AMD.  With guest_exit_irqoff() being called immediately after VM-Exit, any
> tick that happens after IRQs are disabled will be accounted to the host.  E.g.
> on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't
> processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has been
> cleared.
>

This issue can be 100% reproduced.
https://bugzilla.kernel.org/show_bug.cgi?id=204177

> CONFIG_VIRT_CPU_ACCOUNTING_GEN=y should still work (I didn't bother to verify).
>
> Thomas, any clever ideas?  Handling IRQs in {vmx,svm}_vcpu_enter_exit() isn't an
> option as KVM hasn't restored enough state to handle an IRQ, e.g. PKRU and XCR0
> are still guest values.  Is it too heinous to fudge PF_VCPU across KVM's
> "pending" IRQ handling?  E.g. this god-awful hack fixes the accounting:
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 836912b42030..5a777fd35b4b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9028,6 +9028,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         vcpu->mode = OUTSIDE_GUEST_MODE;
>         smp_wmb();
>
> +       current->flags |= PF_VCPU;
>         kvm_x86_ops.handle_exit_irqoff(vcpu);
>
>         /*
> @@ -9042,6 +9043,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         ++vcpu->stat.exits;
>         local_irq_disable();
>         kvm_after_interrupt(vcpu);
> +       current->flags &= ~PF_VCPU;
>
>         if (lapic_in_kernel(vcpu)) {
>                 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
>
> > Conflicts:
> >       arch/x86/kvm/svm.c
> >
> > Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> > ---
> >  arch/x86/kvm/svm/svm.c |  9 +++++++++
> >  arch/x86/kvm/x86.c     | 11 -----------
> >  2 files changed, 9 insertions(+), 11 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index cce0143a6f80..c9b2fbb32484 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
> >
> >  static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
> >  {
> > +     kvm_before_interrupt(vcpu);
> > +     local_irq_enable();
> > +     /*
> > +      * We must have an instruction with interrupts enabled, so
> > +      * the timer interrupt isn't delayed by the interrupt shadow.
> > +      */
> > +     asm("nop");
> > +     local_irq_disable();
> > +     kvm_after_interrupt(vcpu);
> >  }
> >
> >  static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 3f7c1fc7a3ce..3e17c9ffcad8 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -9023,18 +9023,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >
> >       kvm_x86_ops.handle_exit_irqoff(vcpu);
> >
> > -     /*
> > -      * Consume any pending interrupts, including the possible source of
> > -      * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
> > -      * An instruction is required after local_irq_enable() to fully unblock
> > -      * interrupts on processors that implement an interrupt shadow, the
> > -      * stat.exits increment will do nicely.
> > -      */
> > -     kvm_before_interrupt(vcpu);
> > -     local_irq_enable();
> >       ++vcpu->stat.exits;
> > -     local_irq_disable();
> > -     kvm_after_interrupt(vcpu);
> >
> >       if (lapic_in_kernel(vcpu)) {
> >               s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
> > --
> > 2.27.0
> >
Wanpeng Li Jan. 19, 2021, 1:27 a.m. UTC | #12
On Fri, 15 Jan 2021 at 11:20, Wanpeng Li <kernellwp@gmail.com> wrote:
>
> On Wed, 6 Jan 2021 at 08:51, Sean Christopherson <seanjc@google.com> wrote:
> >
> > +tglx
> >
> > On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote:
> > > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534.
> > >
> > > After the introduction of the patch:
> > >
> > >       87fa7f3e9: x86/kvm: Move context tracking where it belongs
> > >
> > > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit
> > > enabling of irqs to process pending interrupts should not be required
> > > within vcpu_enter_guest anymore.
> >
> > Ugh, except that commit completely broke tick-based accounting, on both Intel
> > and AMD.  With guest_exit_irqoff() being called immediately after VM-Exit, any
> > tick that happens after IRQs are disabled will be accounted to the host.  E.g.
> > on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't
> > processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has been
> > cleared.
> >
>
> This issue can be 100% reproduced.
> https://bugzilla.kernel.org/show_bug.cgi?id=204177

Sorry, the posted link should be
https://bugzilla.kernel.org/show_bug.cgi?id=209831

    Wanpeng

Patch

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cce0143a6f80..c9b2fbb32484 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4187,6 +4187,15 @@  static int svm_check_intercept(struct kvm_vcpu *vcpu,
 
 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
+	kvm_before_interrupt(vcpu);
+	local_irq_enable();
+	/*
+	 * We must have an instruction with interrupts enabled, so
+	 * the timer interrupt isn't delayed by the interrupt shadow.
+	 */
+	asm("nop");
+	local_irq_disable();
+	kvm_after_interrupt(vcpu);
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3f7c1fc7a3ce..3e17c9ffcad8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9023,18 +9023,7 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops.handle_exit_irqoff(vcpu);
 
-	/*
-	 * Consume any pending interrupts, including the possible source of
-	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
-	 * An instruction is required after local_irq_enable() to fully unblock
-	 * interrupts on processors that implement an interrupt shadow, the
-	 * stat.exits increment will do nicely.
-	 */
-	kvm_before_interrupt(vcpu);
-	local_irq_enable();
 	++vcpu->stat.exits;
-	local_irq_disable();
-	kvm_after_interrupt(vcpu);
 
 	if (lapic_in_kernel(vcpu)) {
 		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;