
[3/3] x86: kvm guest side support for KVM_HC_RT_PRIO hypercall

Message ID 20170921114039.466130276@redhat.com (mailing list archive)
State New, archived

Commit Message

Marcelo Tosatti Sept. 21, 2017, 11:38 a.m. UTC
Add hypercalls to the spinlock lock/unlock paths to set/unset FIFO
priority for the vcpu, protected by a static branch to avoid a
performance penalty on normal kernels.

Enable the option with the "kvmfifohc" kernel command line parameter
(disabled by default).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 arch/x86/kernel/kvm.c            |   31 +++++++++++++++++++++++++++++++
 include/linux/spinlock.h         |    2 +-
 include/linux/spinlock_api_smp.h |   17 +++++++++++++++++
 3 files changed, 49 insertions(+), 1 deletion(-)
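
For reference, the guest-side switch is just a boot parameter; a minimal
example of a guest kernel command line with the option enabled (the other
parameters are illustrative assumptions for a typical RT guest, not
requirements of this patch):

    root=/dev/vda1 isolcpus=1 nohz_full=1 rcu_nocbs=1 kvmfifohc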

Comments

Konrad Rzeszutek Wilk Sept. 21, 2017, 1:36 p.m. UTC | #1
On Thu, Sep 21, 2017 at 08:38:38AM -0300, Marcelo Tosatti wrote:
> Add hypercalls to spinlock/unlock to set/unset FIFO priority
> for the vcpu, protected by a static branch to avoid performance
> increase in the normal kernels.
> 
> Enable option by "kvmfifohc" kernel command line parameter (disabled
> by default).

Wouldn't it be better if there was a global 'kvm=' parameter which could
take the various overrides?

> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  arch/x86/kernel/kvm.c            |   31 +++++++++++++++++++++++++++++++
>  include/linux/spinlock.h         |    2 +-
>  include/linux/spinlock_api_smp.h |   17 +++++++++++++++++

Hm. It looks like you forgot to CC the maintainers:

$ scripts/get_maintainer.pl -f include/linux/spinlock.h
Peter Zijlstra <peterz@infradead.org> (maintainer:LOCKING PRIMITIVES)
Ingo Molnar <mingo@redhat.com> (maintainer:LOCKING PRIMITIVES)
linux-kernel@vger.kernel.org (open list:LOCKING PRIMITIVES)

Doing that for you.

>  3 files changed, 49 insertions(+), 1 deletion(-)
> 
> Index: kvm.fifopriohc-submit/arch/x86/kernel/kvm.c
> ===================================================================
> --- kvm.fifopriohc-submit.orig/arch/x86/kernel/kvm.c
> +++ kvm.fifopriohc-submit/arch/x86/kernel/kvm.c
> @@ -37,6 +37,7 @@
>  #include <linux/debugfs.h>
>  #include <linux/nmi.h>
>  #include <linux/swait.h>
> +#include <linux/static_key.h>
>  #include <asm/timer.h>
>  #include <asm/cpu.h>
>  #include <asm/traps.h>
> @@ -321,6 +322,36 @@ static notrace void kvm_guest_apic_eoi_w
>  	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
>  }
>  
> +static int kvmfifohc;
> +
> +static int parse_kvmfifohc(char *arg)
> +{
> +	kvmfifohc = 1;
> +	return 0;
> +}
> +
> +early_param("kvmfifohc", parse_kvmfifohc);
> +
> +DEFINE_STATIC_KEY_FALSE(kvm_fifo_hc_key);
> +
> +static void kvm_init_fifo_hc(void)
> +{
> +	long ret;
> +
> +	ret = kvm_hypercall1(KVM_HC_RT_PRIO, 0);
> +
> +	if (ret == 0 && kvmfifohc == 1)
> +		static_branch_enable(&kvm_fifo_hc_key);
> +}
> +
> +static __init int kvmguest_late_init(void)
> +{
> +	kvm_init_fifo_hc();
> +	return 0;
> +}
> +
> +late_initcall(kvmguest_late_init);
> +
>  static void kvm_guest_cpu_init(void)
>  {
>  	if (!kvm_para_available())
> Index: kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
> ===================================================================
> --- kvm.fifopriohc-submit.orig/include/linux/spinlock_api_smp.h
> +++ kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
> @@ -136,11 +136,28 @@ static inline void __raw_spin_lock_bh(ra
>  	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>  }
>  
> +#ifdef CONFIG_KVM_GUEST
> +DECLARE_STATIC_KEY_FALSE(kvm_fifo_hc_key);
> +#endif
> +
>  static inline void __raw_spin_lock(raw_spinlock_t *lock)
>  {
>  	preempt_disable();
> +
> +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
> +	/* enable FIFO priority */
> +	if (static_branch_unlikely(&kvm_fifo_hc_key))
> +		kvm_hypercall1(KVM_HC_RT_PRIO, 0x1);
> +#endif

I am assuming the reason you chose not to wrap this in pvops
or some other more hypervisor-agnostic structure is that only KVM
exposes this. But what if other hypervisors expose something
similar? Or some other mechanism similar to this?
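
For illustration, a hypervisor-agnostic hook along those lines might look
roughly like the sketch below (all names are hypothetical and not part of
this series; each hypervisor backend would fill in its own callbacks and
__raw_spin_lock() would call through the ops instead of calling
kvm_hypercall1() directly):

#include <linux/kvm_para.h>

struct pv_lock_prio_ops {
	void (*boost)(void);	/* raise vcpu priority on lock entry */
	void (*unboost)(void);	/* restore priority on lock exit */
};

/* KVM backend: forwards to the KVM_HC_RT_PRIO hypercall. */
static void kvm_lock_prio_boost(void)
{
	kvm_hypercall1(KVM_HC_RT_PRIO, 1);
}

static void kvm_lock_prio_unboost(void)
{
	kvm_hypercall1(KVM_HC_RT_PRIO, 0);
}

struct pv_lock_prio_ops pv_lock_prio_ops = {
	.boost	 = kvm_lock_prio_boost,
	.unboost = kvm_lock_prio_unboost,
};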

> +
>  	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>  	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
> +
> +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
> +	/* disable FIFO priority */
> +	if (static_branch_unlikely(&kvm_fifo_hc_key))
> +		kvm_hypercall1(KVM_HC_RT_PRIO, 0);
> +#endif
>  }
>  
>  #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */
> Index: kvm.fifopriohc-submit/include/linux/spinlock.h
> ===================================================================
> --- kvm.fifopriohc-submit.orig/include/linux/spinlock.h
> +++ kvm.fifopriohc-submit/include/linux/spinlock.h
> @@ -56,7 +56,7 @@
>  #include <linux/stringify.h>
>  #include <linux/bottom_half.h>
>  #include <asm/barrier.h>
> -
> +#include <uapi/linux/kvm_para.h>
>  
>  /*
>   * Must define these before including other files, inline functions need them
> 
>
Peter Zijlstra Sept. 21, 2017, 2:06 p.m. UTC | #2
On Thu, Sep 21, 2017 at 09:36:53AM -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Sep 21, 2017 at 08:38:38AM -0300, Marcelo Tosatti wrote:
> > Add hypercalls to spinlock/unlock to set/unset FIFO priority
> > for the vcpu, protected by a static branch to avoid performance
> > increase in the normal kernels.
> > 
> > Enable option by "kvmfifohc" kernel command line parameter (disabled
> > by default).

WTF kind of fudge is this? Changelog completely fails to explain the
problem this would solve. Why are you doing insane things like this?


NAK!

> > Index: kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
> > ===================================================================
> > --- kvm.fifopriohc-submit.orig/include/linux/spinlock_api_smp.h
> > +++ kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
> > @@ -136,11 +136,28 @@ static inline void __raw_spin_lock_bh(ra
> >  	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
> >  }
> >  
> > +#ifdef CONFIG_KVM_GUEST
> > +DECLARE_STATIC_KEY_FALSE(kvm_fifo_hc_key);
> > +#endif
> > +
> >  static inline void __raw_spin_lock(raw_spinlock_t *lock)
> >  {
> >  	preempt_disable();
> > +
> > +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
> > +	/* enable FIFO priority */
> > +	if (static_branch_unlikely(&kvm_fifo_hc_key))
> > +		kvm_hypercall1(KVM_HC_RT_PRIO, 0x1);
> > +#endif
> > +
> >  	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> >  	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
> > +
> > +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
> > +	/* disable FIFO priority */
> > +	if (static_branch_unlikely(&kvm_fifo_hc_key))
> > +		kvm_hypercall1(KVM_HC_RT_PRIO, 0);
> > +#endif
> >  }
> >  
> >  #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */
Marcelo Tosatti Sept. 22, 2017, 1:10 a.m. UTC | #3
On Thu, Sep 21, 2017 at 04:06:28PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 21, 2017 at 09:36:53AM -0400, Konrad Rzeszutek Wilk wrote:
> > On Thu, Sep 21, 2017 at 08:38:38AM -0300, Marcelo Tosatti wrote:
> > > Add hypercalls to spinlock/unlock to set/unset FIFO priority
> > > for the vcpu, protected by a static branch to avoid performance
> > > increase in the normal kernels.
> > > 
> > > Enable option by "kvmfifohc" kernel command line parameter (disabled
> > > by default).
> 
> WTF kind of fudge is this? Changelog completely fails to explain the
> problem this would solve. Why are you doing insane things like this?
> 
> 
> NAK!

Copy&pasting from the initial message, please point out whether this
explanation makes sense (better solutions to this problem are welcome):


When executing guest vcpu-0 with FIFO:1 priority, which is necessary
to
deal with the following situation:

VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)

raw_spin_lock(A)
interrupted, schedule task T-1          raw_spin_lock(A) (spin)

raw_spin_unlock(A)

Certain operations must interrupt guest vcpu-0 (see trace below).

To fix this issue, only change guest vcpu-0 to FIFO priority
on spinlock critical sections (see patch).

Hang trace
==========

Without FIFO priority:

qemu-kvm-6705  [002] ....1.. 767785.648964: kvm_exit: reason IO_INSTRUCTION rip 0xe8fe info 1f00039 0
qemu-kvm-6705  [002] ....1.. 767785.648965: kvm_exit: reason IO_INSTRUCTION rip 0xe911 info 3f60008 0
qemu-kvm-6705  [002] ....1.. 767785.648968: kvm_exit: reason IO_INSTRUCTION rip 0x8984 info 608000b 0
qemu-kvm-6705  [002] ....1.. 767785.648971: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0
qemu-kvm-6705  [002] ....1.. 767785.648974: kvm_exit: reason IO_INSTRUCTION rip 0xb514 info 3f60000 0
qemu-kvm-6705  [002] ....1.. 767785.648977: kvm_exit: reason PENDING_INTERRUPT rip 0x8052 info 0 0
qemu-kvm-6705  [002] ....1.. 767785.648980: kvm_exit: reason IO_INSTRUCTION rip 0xeee6 info 200040 0
qemu-kvm-6705  [002] ....1.. 767785.648999: kvm_exit: reason EPT_MISCONFIG rip 0x2120 info 0 0

With FIFO priority:

qemu-kvm-7636  [002] ....1.. 768218.205065: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0
qemu-kvm-7636  [002] ....1.. 768218.205068: kvm_exit: reason IO_INSTRUCTION rip 0x8984 info 608000b 0
qemu-kvm-7636  [002] ....1.. 768218.205071: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0
qemu-kvm-7636  [002] ....1.. 768218.205074: kvm_exit: reason IO_INSTRUCTION rip 0x8984 info 608000b 0
qemu-kvm-7636  [002] ....1.. 768218.205077: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0
..

Performance numbers (kernel compilation with make -j2)
======================================================

With hypercall: 4:40.  (make -j2)
Without hypercall: 3:38.  (make -j2)

Note that for NFV workloads spinlock performance is not relevant,
since DPDK should not enter the kernel (and housekeeping vcpu
performance is far from being a key factor).


> 
> > > Index: kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
> > > ===================================================================
> > > --- kvm.fifopriohc-submit.orig/include/linux/spinlock_api_smp.h
> > > +++ kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
> > > @@ -136,11 +136,28 @@ static inline void __raw_spin_lock_bh(ra
> > >  	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
> > >  }
> > >  
> > > +#ifdef CONFIG_KVM_GUEST
> > > +DECLARE_STATIC_KEY_FALSE(kvm_fifo_hc_key);
> > > +#endif
> > > +
> > >  static inline void __raw_spin_lock(raw_spinlock_t *lock)
> > >  {
> > >  	preempt_disable();
> > > +
> > > +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
> > > +	/* enable FIFO priority */
> > > +	if (static_branch_unlikely(&kvm_fifo_hc_key))
> > > +		kvm_hypercall1(KVM_HC_RT_PRIO, 0x1);
> > > +#endif
> > > +
> > >  	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> > >  	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
> > > +
> > > +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
> > > +	/* disable FIFO priority */
> > > +	if (static_branch_unlikely(&kvm_fifo_hc_key))
> > > +		kvm_hypercall1(KVM_HC_RT_PRIO, 0);
> > > +#endif
> > >  }
> > >  
> > >  #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */
Peter Zijlstra Sept. 22, 2017, 10 a.m. UTC | #4
On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> to
> deal with the following situation:
> 
> VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> 
> raw_spin_lock(A)
> interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> 
> raw_spin_unlock(A)
> 
> Certain operations must interrupt guest vcpu-0 (see trace below).

Those traces don't make any sense. All they include is kvm_exit and you
can't tell anything from that.

> To fix this issue, only change guest vcpu-0 to FIFO priority
> on spinlock critical sections (see patch).

This doesn't make sense. So you're saying that if you run all VCPUs as
FIFO things come apart? Why?

And why can't they still come apart when the guest holds a spinlock?
Peter Zijlstra Sept. 22, 2017, 10:56 a.m. UTC | #5
On Fri, Sep 22, 2017 at 12:00:04PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > to
> > deal with the following situation:
> > 
> > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > 
> > raw_spin_lock(A)
> > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > 
> > raw_spin_unlock(A)
> > 
> > Certain operations must interrupt guest vcpu-0 (see trace below).
> 
> Those traces don't make any sense. All they include is kvm_exit and you
> can't tell anything from that.
> 
> > To fix this issue, only change guest vcpu-0 to FIFO priority
> > on spinlock critical sections (see patch).
> 
> This doesn't make sense. So you're saying that if you run all VCPUs as
> FIFO things come apart? Why?
> 
> And why can't they still come apart when the guest holds a spinlock?

That is, running a RT guest and not having _all_ VCPUs being RT tasks on
the host is absolutely and completely insane and broken.

Fix whatever needs fixing to allow your VCPU0 to be RT, don't do insane
things like this.
Marcelo Tosatti Sept. 22, 2017, 12:16 p.m. UTC | #6
On Fri, Sep 22, 2017 at 12:00:05PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > to
> > deal with the following situation:
> > 
> > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > 
> > raw_spin_lock(A)
> > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > 
> > raw_spin_unlock(A)
> > 
> > Certain operations must interrupt guest vcpu-0 (see trace below).
> 
> Those traces don't make any sense. All they include is kvm_exit and you
> can't tell anything from that.

Hi Peter,

OK lets describe whats happening:

With QEMU emulator thread and vcpu-0 sharing a physical CPU
(which is a request from several NFV customers, to improve
guest packing), the following occurs when the guest generates 
the following pattern:

		1. submit IO.
		2. busy spin.

Hang trace
==========

Without FIFO priority:

qemu-kvm-6705  [002] ....1.. 767785.648964: kvm_exit: reason IO_INSTRUCTION rip 0xe8fe info 1f00039 0
qemu-kvm-6705  [002] ....1.. 767785.648965: kvm_exit: reason IO_INSTRUCTION rip 0xe911 info 3f60008 0
qemu-kvm-6705  [002] ....1.. 767785.648968: kvm_exit: reason IO_INSTRUCTION rip 0x8984 info 608000b 0
qemu-kvm-6705  [002] ....1.. 767785.648971: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0
qemu-kvm-6705  [002] ....1.. 767785.648974: kvm_exit: reason IO_INSTRUCTION rip 0xb514 info 3f60000 0
qemu-kvm-6705  [002] ....1.. 767785.648977: kvm_exit: reason PENDING_INTERRUPT rip 0x8052 info 0 0
qemu-kvm-6705  [002] ....1.. 767785.648980: kvm_exit: reason IO_INSTRUCTION rip 0xeee6 info 200040 0
qemu-kvm-6705  [002] ....1.. 767785.648999: kvm_exit: reason EPT_MISCONFIG rip 0x2120 info 0 0

The emulator thread is able to interrupt qemu vcpu0, since vcpu0 runs at
SCHED_NORMAL priority.

With FIFO priority:

Now, with qemu vcpu0 at SCHED_FIFO priority, which is necessary to
avoid the following scenario:

(*)
VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
 
raw_spin_lock(A)
interrupted, schedule task T-1          raw_spin_lock(A) (spin)
 
raw_spin_unlock(A)

And the following code pattern by vcpu0:

		1. submit IO.
		2. busy spin.

The emulator thread is unable to interrupt the vcpu0 thread
(vcpu0 busy spins at SCHED_FIFO, while the emulator thread runs at
SCHED_NORMAL), and you get a hang at boot as follows:

qemu-kvm-7636  [002] ....1.. 768218.205065: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0
qemu-kvm-7636  [002] ....1.. 768218.205068: kvm_exit: reason IO_INSTRUCTION rip 0x8984 info 608000b 0
qemu-kvm-7636  [002] ....1.. 768218.205071: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0
qemu-kvm-7636  [002] ....1.. 768218.205074: kvm_exit: reason IO_INSTRUCTION rip 0x8984 info 608000b 0
qemu-kvm-7636  [002] ....1.. 768218.205077: kvm_exit: reason IO_INSTRUCTION rip 0xb313 info 1f70008 0

So to fix this problem, the patchset changes the priority
of the VCPU thread (to fix (*)), only when taking spinlocks.

Does that make sense now?

> 
> > To fix this issue, only change guest vcpu-0 to FIFO priority
> > on spinlock critical sections (see patch).
> 
> This doesn't make sense. So you're saying that if you run all VCPUs as
> FIFO things come apart? Why?

Please see above.

> And why can't they still come apart when the guest holds a spinlock?

Hopefully the above makes sense.
Peter Zijlstra Sept. 22, 2017, 12:31 p.m. UTC | #7
On Fri, Sep 22, 2017 at 09:16:40AM -0300, Marcelo Tosatti wrote:
> On Fri, Sep 22, 2017 at 12:00:05PM +0200, Peter Zijlstra wrote:
> > On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > > to
> > > deal with the following situation:
> > > 
> > > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > > 
> > > raw_spin_lock(A)
> > > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > > 
> > > raw_spin_unlock(A)
> > > 
> > > Certain operations must interrupt guest vcpu-0 (see trace below).
> > 
> > Those traces don't make any sense. All they include is kvm_exit and you
> > can't tell anything from that.
> 
> Hi Peter,
> 
> OK lets describe whats happening:
> 
> With QEMU emulator thread and vcpu-0 sharing a physical CPU
> (which is a request from several NFV customers, to improve
> guest packing), the following occurs when the guest generates 
> the following pattern:
> 
> 		1. submit IO.
> 		2. busy spin.

User-space spinning is a bad idea in general and terminally broken in
a RT setup. Sounds like you need to go fix qemu to not suck.
Marcelo Tosatti Sept. 22, 2017, 12:33 p.m. UTC | #8
On Fri, Sep 22, 2017 at 12:56:09PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 22, 2017 at 12:00:04PM +0200, Peter Zijlstra wrote:
> > On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > > to
> > > deal with the following situation:
> > > 
> > > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > > 
> > > raw_spin_lock(A)
> > > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > > 
> > > raw_spin_unlock(A)
> > > 
> > > Certain operations must interrupt guest vcpu-0 (see trace below).
> > 
> > Those traces don't make any sense. All they include is kvm_exit and you
> > can't tell anything from that.
> > 
> > > To fix this issue, only change guest vcpu-0 to FIFO priority
> > > on spinlock critical sections (see patch).
> > 
> > This doesn't make sense. So you're saying that if you run all VCPUs as
> > FIFO things come apart? Why?
> > 
> > And why can't they still come apart when the guest holds a spinlock?
> 
> That is, running a RT guest and not having _all_ VCPUs being RT tasks on
> the host is absolutely and completely insane and broken.

Can you explain why, please?

> Fix whatever needs fixing to allow your VCPU0 to be RT, don't do insane
> things like this.

VCPU0 can be RT, but you'll get the following hang, if the emulator
thread is sharing a pCPU with VCPU0:

	1. submit IO.
	2. busy spin.

As executed by the guest vcpu (its a natural problem).

Do you have a better suggestion as how to fix the problem?

We can fix the BIOS, but userspace will still be allowed to
generate the code pattern above.

And increasing the priority of the emulator thread, at random times 
(so it can inject interrupts to vcpu-0), can cause it to interrupt 
vcpu-0 in a spinlock protected section.

The only other option is for customers to live with the decreased 
packing (that is, require one pcpu for each vcpu, and an additional pcpu
for emulator threads). Is that what you are suggesting?
Marcelo Tosatti Sept. 22, 2017, 12:36 p.m. UTC | #9
On Fri, Sep 22, 2017 at 02:31:07PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 22, 2017 at 09:16:40AM -0300, Marcelo Tosatti wrote:
> > On Fri, Sep 22, 2017 at 12:00:05PM +0200, Peter Zijlstra wrote:
> > > On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > > > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > > > to
> > > > deal with the following situation:
> > > > 
> > > > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > > > 
> > > > raw_spin_lock(A)
> > > > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > > > 
> > > > raw_spin_unlock(A)
> > > > 
> > > > Certain operations must interrupt guest vcpu-0 (see trace below).
> > > 
> > > Those traces don't make any sense. All they include is kvm_exit and you
> > > can't tell anything from that.
> > 
> > Hi Peter,
> > 
> > OK lets describe whats happening:
> > 
> > With QEMU emulator thread and vcpu-0 sharing a physical CPU
> > (which is a request from several NFV customers, to improve
> > guest packing), the following occurs when the guest generates 
> > the following pattern:
> > 
> > 		1. submit IO.
> > 		2. busy spin.
> 
> User-space spinning is a bad idea in general and terminally broken in
> a RT setup. Sounds like you need to go fix qemu to not suck.

One can run whatever application they want on the housekeeping
vcpus. This is why rteval exists.

This is not the realtime vcpu we are talking about.

We can fix the BIOS, which is hanging now, but userspace can 
do whatever it wants, on non realtime vcpus (again, this is why
rteval test exists and is used by the -RT community as 
a testcase).

I haven't understood what is the wrong with the patch? Are you trying
to avoid pollution of the spinlock codepath to keep it simple?
Marcelo Tosatti Sept. 22, 2017, 12:40 p.m. UTC | #10
On Fri, Sep 22, 2017 at 02:31:07PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 22, 2017 at 09:16:40AM -0300, Marcelo Tosatti wrote:
> > On Fri, Sep 22, 2017 at 12:00:05PM +0200, Peter Zijlstra wrote:
> > > On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > > > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > > > to
> > > > deal with the following situation:
> > > > 
> > > > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > > > 
> > > > raw_spin_lock(A)
> > > > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > > > 
> > > > raw_spin_unlock(A)
> > > > 
> > > > Certain operations must interrupt guest vcpu-0 (see trace below).
> > > 
> > > Those traces don't make any sense. All they include is kvm_exit and you
> > > can't tell anything from that.
> > 
> > Hi Peter,
> > 
> > OK lets describe whats happening:
> > 
> > With QEMU emulator thread and vcpu-0 sharing a physical CPU
> > (which is a request from several NFV customers, to improve
> > guest packing), the following occurs when the guest generates 
> > the following pattern:
> > 
> > 		1. submit IO.
> > 		2. busy spin.
> 
> User-space spinning is a bad idea in general and terminally broken in
> a RT setup. Sounds like you need to go fix qemu to not suck.

Are you arguing its invalid for the following application to execute on 
housekeeping vcpu of a realtime system:

void main(void)
{

    submit_IO();
    do {
       computation();
    } while (!interrupted());
}

Really?

Replace "busy spin" by "useful computation until interrupted".
Peter Zijlstra Sept. 22, 2017, 12:55 p.m. UTC | #11
On Fri, Sep 22, 2017 at 09:33:05AM -0300, Marcelo Tosatti wrote:
> > That is, running a RT guest and not having _all_ VCPUs being RT tasks on
> > the host is absolutely and completely insane and broken.
> 
> Can you explain why, please?

You just explained it yourself. If the thread that needs to complete
what you're waiting on has lower priority, it will _never_ get to run if
you're busy waiting on it.

This is _trivial_.

And even for !RT it can be quite costly, because you can end up having
to burn your entire slot of CPU time before you run the other task.

Userspace spinning is _bad_, do not do this.

(the one exception where it works is where you have a single thread per
cpu, because then there's effectively no scheduling).

> > Fix whatever needs fixing to allow your VCPU0 to be RT, don't do insane
> > things like this.
> 
> VCPU0 can be RT, but you'll get the following hang, if the emulator
> thread is sharing a pCPU with VCPU0:
> 
> 	1. submit IO.
> 	2. busy spin.
> 
> As executed by the guest vcpu (its a natural problem).
> 
> Do you have a better suggestion as how to fix the problem?

Yes, not busy wait. Go to sleep and make sure you're woken up once the
IO completes.

> We can fix the BIOS, but userspace will still be allowed to
> generate the code pattern above.

What does the BIOS have to do with anything?

> And increasing the priority of the emulator thread, at random times 
> (so it can inject interrupts to vcpu-0), can cause it to interrupt 
> vcpu-0 in a spinlock protected section.

You can equally boost the emulator thread while you're spin-waiting, but
that's ugly as heck too.

The normal, sane solution is to not spin-wait but block.
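
For illustration only, a minimal userspace sketch of that blocking
pattern (the completion fd and its semantics are assumptions, e.g. an
eventfd signalled when the IO completes):

#include <poll.h>
#include <stdint.h>
#include <unistd.h>

static void wait_for_io(int completion_fd)
{
	struct pollfd pfd = { .fd = completion_fd, .events = POLLIN };
	uint64_t count;

	/* Sleep instead of spinning: while this task blocks, the
	 * lower-priority thread that completes the IO can run. */
	poll(&pfd, 1, -1);
	read(completion_fd, &count, sizeof(count));
}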
Peter Zijlstra Sept. 22, 2017, 12:59 p.m. UTC | #12
On Fri, Sep 22, 2017 at 09:36:39AM -0300, Marcelo Tosatti wrote:
> On Fri, Sep 22, 2017 at 02:31:07PM +0200, Peter Zijlstra wrote:
> > On Fri, Sep 22, 2017 at 09:16:40AM -0300, Marcelo Tosatti wrote:
> > > On Fri, Sep 22, 2017 at 12:00:05PM +0200, Peter Zijlstra wrote:
> > > > On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > > > > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > > > > to
> > > > > deal with the following situation:
> > > > > 
> > > > > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > > > > 
> > > > > raw_spin_lock(A)
> > > > > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > > > > 
> > > > > raw_spin_unlock(A)
> > > > > 
> > > > > Certain operations must interrupt guest vcpu-0 (see trace below).
> > > > 
> > > > Those traces don't make any sense. All they include is kvm_exit and you
> > > > can't tell anything from that.
> > > 
> > > Hi Peter,
> > > 
> > > OK lets describe whats happening:
> > > 
> > > With QEMU emulator thread and vcpu-0 sharing a physical CPU
> > > (which is a request from several NFV customers, to improve
> > > guest packing), the following occurs when the guest generates 
> > > the following pattern:
> > > 
> > > 		1. submit IO.
> > > 		2. busy spin.
> > 
> > User-space spinning is a bad idea in general and terminally broken in
> > a RT setup. Sounds like you need to go fix qemu to not suck.
> 
> One can run whatever application they want on the housekeeping
> vcpus. This is why rteval exists.

Nobody cares about other tasks. The problem is between the VCPU and
emulator thread. They get a priority inversion and live-lock because of
spin-waiting.

> This is not the realtime vcpu we are talking about.

You're being confused, its a RT _guest_, all VCPUs _must_ be RT.
Because, as you ran into, the guest functions as a whole, not as a bunch
of individual CPUs.

> We can fix the BIOS, which is hanging now, but userspace can 
> do whatever it wants, on non realtime vcpus (again, this is why
> rteval test exists and is used by the -RT community as 
> a testcase).

But nobody cares what other tasks on the system do, all you care about
is that the VCPUs make deterministic forward progress.

> I haven't understood what is the wrong with the patch? Are you trying
> to avoid pollution of the spinlock codepath to keep it simple?

Your patch is voodoo programming. You don't solve the actual problem,
you try and paper over it.
Peter Zijlstra Sept. 22, 2017, 1:01 p.m. UTC | #13
On Fri, Sep 22, 2017 at 09:40:05AM -0300, Marcelo Tosatti wrote:

> Are you arguing its invalid for the following application to execute on 
> housekeeping vcpu of a realtime system:
> 
> void main(void)
> {
> 
>     submit_IO();
>     do {
>        computation();
>     } while (!interrupted());
> }
> 
> Really?

No. Nobody cares about random crap tasks.
Paolo Bonzini Sept. 23, 2017, 10:56 a.m. UTC | #14
On 22/09/2017 14:55, Peter Zijlstra wrote:
> You just explained it yourself. If the thread that needs to complete
> what you're waiting on has lower priority, it will _never_ get to run if
> you're busy waiting on it.
> 
> This is _trivial_.
> 
> And even for !RT it can be quite costly, because you can end up having
> to burn your entire slot of CPU time before you run the other task.
> 
> Userspace spinning is _bad_, do not do this.

This is not userspace spinning, it is guest spinning---which has
effectively the same effect but you cannot quite avoid.

But I agree that the solution is properly prioritizing threads that can
interrupt the VCPU, and using PI mutexes.

I'm not a priori opposed to paravirt scheduling primitives, but I am not
at all sure that it's required.

Paolo

> (the one exception where it works is where you have a single thread per
> cpu, because then there's effectively no scheduling).
Peter Zijlstra Sept. 23, 2017, 1:41 p.m. UTC | #15
On Sat, Sep 23, 2017 at 12:56:12PM +0200, Paolo Bonzini wrote:
> On 22/09/2017 14:55, Peter Zijlstra wrote:
> > You just explained it yourself. If the thread that needs to complete
> > what you're waiting on has lower priority, it will _never_ get to run if
> > you're busy waiting on it.
> > 
> > This is _trivial_.
> > 
> > And even for !RT it can be quite costly, because you can end up having
> > to burn your entire slot of CPU time before you run the other task.
> > 
> > Userspace spinning is _bad_, do not do this.
> 
> This is not userspace spinning, it is guest spinning---which has
> effectively the same effect but you cannot quite avoid.

So I'm virt illiterate and have no clue on how all this works; but
wasn't this a vmexit ? (that's what marcelo traced). And once you've
done a vmexit you're a regular task again, not a vcpu.

> But I agree that the solution is properly prioritizing threads that can
> interrupt the VCPU, and using PI mutexes.

Right, if you want to run RT VCPUs the whole emulator/vcpu interaction
needs to be designed for RT.

> I'm not a priori opposed to paravirt scheduling primitives, but I am not
> at all sure that it's required.

Problem is that the proposed thing doesn't solve anything. There is
nothing that prohibits the guest from triggering a vmexit while holding
a spinlock and landing in the self-same problems.
Paolo Bonzini Sept. 24, 2017, 1:05 p.m. UTC | #16
----- Original Message -----
> From: "Peter Zijlstra" <peterz@infradead.org>
> To: "Paolo Bonzini" <pbonzini@redhat.com>
> Cc: "Marcelo Tosatti" <mtosatti@redhat.com>, "Konrad Rzeszutek Wilk" <konrad.wilk@oracle.com>, mingo@redhat.com,
> kvm@vger.kernel.org, linux-kernel@vger.kernel.org, "Thomas Gleixner" <tglx@linutronix.de>
> Sent: Saturday, September 23, 2017 3:41:14 PM
> Subject: Re: [patch 3/3] x86: kvm guest side support for KVM_HC_RT_PRIO hypercall
> 
> On Sat, Sep 23, 2017 at 12:56:12PM +0200, Paolo Bonzini wrote:
> > On 22/09/2017 14:55, Peter Zijlstra wrote:
> > > You just explained it yourself. If the thread that needs to complete
> > > what you're waiting on has lower priority, it will _never_ get to run if
> > > you're busy waiting on it.
> > > 
> > > This is _trivial_.
> > > 
> > > And even for !RT it can be quite costly, because you can end up having
> > > to burn your entire slot of CPU time before you run the other task.
> > > 
> > > Userspace spinning is _bad_, do not do this.
> > 
> > This is not userspace spinning, it is guest spinning---which has
> > effectively the same effect but you cannot quite avoid.
> 
> So I'm virt illiterate and have no clue on how all this works; but
> wasn't this a vmexit ? (that's what marcelo traced). And once you've
> done a vmexit you're a regular task again, not a vcpu.

His trace simply shows that the timer tick happened and the SCHED_NORMAL
thread was preempted.  Bumping the vCPU thread to SCHED_FIFO drops
the scheduler tick (the system is NOHZ_FULL) and thus 1) the frequency
of EXTERNAL_INTERRUPT vmexits drops to one per second and 2) the thread
is not preempted anymore.

> > But I agree that the solution is properly prioritizing threads that can
> > interrupt the VCPU, and using PI mutexes.
> 
> Right, if you want to run RT VCPUs the whole emulator/vcpu interaction
> needs to be designed for RT.
> 
> > I'm not a priori opposed to paravirt scheduling primitives, but I am not
> > at all sure that it's required.
> 
> Problem is that the proposed thing doesn't solve anything. There is
> nothing that prohibits the guest from triggering a vmexit while holding
> a spinlock and landing in the self-same problems.

Well, part of configuring virt for RT is (at all levels: host hypervisor+QEMU
and guest kernel+userspace) is that vmexits while holding a spinlock are either
confined to one vCPU or are handled in the host hypervisor very quickly, like
less than 2000 clock cycles.

So I'm not denying that Marcelo's approach solves the problem, but it's very
heavyweight and it masks an important misconfiguration (as you write above,
everything needs to be RT and the priorities must be designed carefully).

_However_, even if you do this, you may want to put the less important vCPUs
and the emulator threads on the same physical CPU.  In that case, the vCPU
can be placed at SCHED_RR to avoid starvation (while the emulator thread needs
to stay at SCHED_FIFO and higher priority).  Some kind of trick that bumps
spinlock critical sections in that vCPU to SCHED_FIFO, for a limited time only,
might still be useful.
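
As an illustration of that placement (thread names and priority values
are assumptions, not QEMU code):

#include <pthread.h>
#include <sched.h>

static int set_policy(pthread_t tid, int policy, int prio)
{
	struct sched_param sp = { .sched_priority = prio };

	return pthread_setschedparam(tid, policy, &sp);
}

/* Emulator thread stays at SCHED_FIFO and higher priority; the
 * housekeeping vCPU runs at SCHED_RR on the shared physical CPU:
 *
 *	set_policy(emulator_tid, SCHED_FIFO, 2);
 *	set_policy(vcpu0_tid,    SCHED_RR,   1);
 */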

Paolo
Marcelo Tosatti Sept. 25, 2017, 1:52 a.m. UTC | #17
On Fri, Sep 22, 2017 at 02:59:51PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 22, 2017 at 09:36:39AM -0300, Marcelo Tosatti wrote:
> > On Fri, Sep 22, 2017 at 02:31:07PM +0200, Peter Zijlstra wrote:
> > > On Fri, Sep 22, 2017 at 09:16:40AM -0300, Marcelo Tosatti wrote:
> > > > On Fri, Sep 22, 2017 at 12:00:05PM +0200, Peter Zijlstra wrote:
> > > > > On Thu, Sep 21, 2017 at 10:10:41PM -0300, Marcelo Tosatti wrote:
> > > > > > When executing guest vcpu-0 with FIFO:1 priority, which is necessary
> > > > > > to
> > > > > > deal with the following situation:
> > > > > > 
> > > > > > VCPU-0 (housekeeping VCPU)              VCPU-1 (realtime VCPU)
> > > > > > 
> > > > > > raw_spin_lock(A)
> > > > > > interrupted, schedule task T-1          raw_spin_lock(A) (spin)
> > > > > > 
> > > > > > raw_spin_unlock(A)
> > > > > > 
> > > > > > Certain operations must interrupt guest vcpu-0 (see trace below).
> > > > > 
> > > > > Those traces don't make any sense. All they include is kvm_exit and you
> > > > > can't tell anything from that.
> > > > 
> > > > Hi Peter,
> > > > 
> > > > OK lets describe whats happening:
> > > > 
> > > > With QEMU emulator thread and vcpu-0 sharing a physical CPU
> > > > (which is a request from several NFV customers, to improve
> > > > guest packing), the following occurs when the guest generates 
> > > > the following pattern:
> > > > 
> > > > 		1. submit IO.
> > > > 		2. busy spin.
> > > 
> > > User-space spinning is a bad idea in general and terminally broken in
> > > a RT setup. Sounds like you need to go fix qemu to not suck.
> > 
> > One can run whatever application they want on the housekeeping
> > vcpus. This is why rteval exists.
> 
> Nobody cares about other tasks. The problem is between the VCPU and
> emulator thread. They get a priority inversion and live-lock because of
> spin-waiting.
> 
> > This is not the realtime vcpu we are talking about.
> 
> You're being confused, its a RT _guest_, all VCPUs _must_ be RT.
> Because, as you ran into, the guest functions as a whole, not as a bunch
> of individual CPUs.
> 
> > We can fix the BIOS, which is hanging now, but userspace can 
> > do whatever it wants, on non realtime vcpus (again, this is why
> > rteval test exists and is used by the -RT community as 
> > a testcase).
> 
> But nobody cares what other tasks on the system do, all you care about
> is that the VCPUs make deterministic forward progress.
> 
> > I haven't understood what is the wrong with the patch? Are you trying
> > to avoid pollution of the spinlock codepath to keep it simple?
> 
> Your patch is voodoo programming. You don't solve the actual problem,
> you try and paper over it.

Priority boosting on a particular section of code is voodoo programming?
Marcelo Tosatti Sept. 25, 2017, 2:22 a.m. UTC | #18
On Fri, Sep 22, 2017 at 03:01:41PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 22, 2017 at 09:40:05AM -0300, Marcelo Tosatti wrote:
> 
> > Are you arguing its invalid for the following application to execute on 
> > housekeeping vcpu of a realtime system:
> > 
> > void main(void)
> > {
> > 
> >     submit_IO();
> >     do {
> >        computation();
> >     } while (!interrupted());
> > }
> > 
> > Really?
> 
> No. Nobody cares about random crap tasks.

Nobody has control over all code that runs in userspace Peter. And not
supporting a valid sequence of steps because its "crap" (whatever your 
definition of crap is) makes no sense.

It might be that someone decides to do the above (i really can't see 
any actual reasoning i can follow and agree on your "its crap"
argument), this truly seems valid to me.

So lets follow the reasoning steps:

1) "NACK, because you didnt understand the problem".

	OK thats an invalid NACK, you did understand the problem
	later and now your argument is the following.

2) "NACK, because all VCPUs should be SCHED_FIFO all the time".

But the existence of this code path from userspace:

  submit_IO();
  do {
     computation();
  } while (!interrupted());

Its a supported code sequence, and works fine in a non-RT environment.
Therefore it should work on an -RT environment.
Think of any two applications, such as an IO application
and a CPU bound application. The IO application will be severely
impacted, or never execute, in such a scenario.

Is that combination of tasks "random crap tasks"? (No, it's not, which
makes me think you're just NACKing without giving enough thought to the
problem).

So please give me some logical reasoning for the NACK (people can live with
it, but it has to be good enough to justify the decreasing packing of 
guests in pCPUs):

1) "Voodoo programming" (its hard for me to parse what you mean with
that... do you mean you foresee this style of priority boosting causing
problems in the future? Can you give an example?).

Is there fundamentally wrong about priority boosting in spinlock
sections, or this particular style of priority boosting is wrong?

2) "Pollution of the kernel code path". That makes sense to me, if thats
whats your concerned about.

3) "Reduction of spinlock performance". Its true, but for NFV workloads
people don't care about.

4) "All vcpus should be SCHED_FIFO all the time". OK, why is that?
What dictates that to be true?

What the patch does is the following:
It reduces the window where SCHED_FIFO is applied vcpu0
to those were a spinlock is shared between -RT vcpus and vcpu0
(why: because otherwise, when the emulator thread is sharing a
pCPU with vcpu0, its unable to generate interrupts vcpu0).

And its being rejected because:
Please fill in.
Marcelo Tosatti Sept. 25, 2017, 2:57 a.m. UTC | #19
On Sun, Sep 24, 2017 at 09:05:44AM -0400, Paolo Bonzini wrote:
> 
> 
> ----- Original Message -----
> > From: "Peter Zijlstra" <peterz@infradead.org>
> > To: "Paolo Bonzini" <pbonzini@redhat.com>
> > Cc: "Marcelo Tosatti" <mtosatti@redhat.com>, "Konrad Rzeszutek Wilk" <konrad.wilk@oracle.com>, mingo@redhat.com,
> > kvm@vger.kernel.org, linux-kernel@vger.kernel.org, "Thomas Gleixner" <tglx@linutronix.de>
> > Sent: Saturday, September 23, 2017 3:41:14 PM
> > Subject: Re: [patch 3/3] x86: kvm guest side support for KVM_HC_RT_PRIO hypercall
> > 
> > On Sat, Sep 23, 2017 at 12:56:12PM +0200, Paolo Bonzini wrote:
> > > On 22/09/2017 14:55, Peter Zijlstra wrote:
> > > > You just explained it yourself. If the thread that needs to complete
> > > > what you're waiting on has lower priority, it will _never_ get to run if
> > > > you're busy waiting on it.
> > > > 
> > > > This is _trivial_.
> > > > 
> > > > And even for !RT it can be quite costly, because you can end up having
> > > > to burn your entire slot of CPU time before you run the other task.
> > > > 
> > > > Userspace spinning is _bad_, do not do this.
> > > 
> > > This is not userspace spinning, it is guest spinning---which has
> > > effectively the same effect but you cannot quite avoid.
> > 
> > So I'm virt illiterate and have no clue on how all this works; but
> > wasn't this a vmexit ? (that's what marcelo traced). And once you've
> > done a vmexit you're a regular task again, not a vcpu.
> 
> His trace simply shows that the timer tick happened and the SCHED_NORMAL
> thread was preempted.  Bumping the vCPU thread to SCHED_FIFO drops
> the scheduler tick (the system is NOHZ_FULL) and thus 1) the frequency
> of EXTERNAL_INTERRUPT vmexits drops to 1 second 2) the thread is not
> preempted anymore.
> 
> > > But I agree that the solution is properly prioritizing threads that can
> > > interrupt the VCPU, and using PI mutexes.

That's exactly what the patch does: the prioritization is not fixed in
time, but depends on whether or not vcpu-0 is in a spinlock protected
section.

Are you suggesting a different prioritization? Can you describe it
please, even if incomplete?

> > 
> > Right, if you want to run RT VCPUs the whole emulator/vcpu interaction
> > needs to be designed for RT.
> > 
> > > I'm not a priori opposed to paravirt scheduling primitives, but I am not
> > > at all sure that it's required.
> > 
> > Problem is that the proposed thing doesn't solve anything. There is
> > nothing that prohibits the guest from triggering a vmexit while holding
> > a spinlock and landing in the self-same problems.
> 
> Well, part of configuring virt for RT is (at all levels: host hypervisor+QEMU
> and guest kernel+userspace) is that vmexits while holding a spinlock are either
> confined to one vCPU or are handled in the host hypervisor very quickly, like
> less than 2000 clock cycles.
> 
> So I'm not denying that Marcelo's approach solves the problem, but it's very
> heavyweight and it masks an important misconfiguration (as you write above,
> everything needs to be RT and the priorities must be designed carefully).

I think you are missing the following point:

"vcpu0 can be interrupted when its not in a spinlock protected section, 
otherwise it can't."

So you _have_ to communicate to the host when the guest enters/leaves a
critical section.

So this point of "everything needs to be RT and the priorities must be
designed carefully", is this: 

	WHEN in spinlock protected section (more specifically, when 
	spinlock protected section _shared with realtime vcpus_),

	priority of vcpu0 > priority of emulator thread

	OTHERWISE

	priority of vcpu0 < priority of emulator thread.

(*)

So emulator thread can interrupt and inject interrupts to vcpu0.
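
(Roughly, on the host side the hypercall handler would do something like
the sketch below. This is only an illustration of the scheme, not the
actual patch 1-2/3, and the priority values are assumptions.)

#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

/* Illustrative sketch only: toggle the calling vcpu0 task between
 * SCHED_FIFO (above the emulator thread) and SCHED_NORMAL. */
static int handle_rt_prio_hypercall(struct task_struct *vcpu_task,
				    unsigned long enable)
{
	struct sched_param fifo = { .sched_priority = 2 };	/* assumption */
	struct sched_param normal = { .sched_priority = 0 };

	if (enable)
		return sched_setscheduler(vcpu_task, SCHED_FIFO, &fifo);

	return sched_setscheduler(vcpu_task, SCHED_NORMAL, &normal);
}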

> 
> _However_, even if you do this, you may want to put the less important vCPUs
> and the emulator threads on the same physical CPU.  In that case, the vCPU
> can be placed at SCHED_RR to avoid starvation (while the emulator thread needs
> to stay at SCHED_FIFO and higher priority).  Some kind of trick that bumps
> spinlock critical sections in that vCPU to SCHED_FIFO, for a limited time only,
> might still be useful.

Anything that violates (*) above is going to cause excessive latencies
in realtime vcpus, via:

PCPU-0:
	* vcpu-0 grabs spinlock A.
	* an event wakes up the emulator thread; vcpu-0 is scheduled out,
	  the emulator thread is scheduled in.
PCPU-1:
	* realtime vcpu tries to grab spinlock A and busy spins until the
	  emulator thread completes and vcpu-0 can release the lock.

So it's more than useful, it's necessary.

I'm open to suggestions for better ways to solve this problem
while sharing the emulator thread's pCPU with vcpu-0 (which is something
users are interested in, for obvious economic reasons), but:

	1) I don't get the point of Peter's rejection.

	2) I don't get how SCHED_RR can help the situation.
Peter Zijlstra Sept. 25, 2017, 8:35 a.m. UTC | #20
On Sun, Sep 24, 2017 at 10:52:58PM -0300, Marcelo Tosatti wrote:
> On Fri, Sep 22, 2017 at 02:59:51PM +0200, Peter Zijlstra wrote:

> > Your patch is voodoo programming. You don't solve the actual problem,
> > you try and paper over it.
> 
> Priority boosting on a particular section of code is voodoo programming? 

Yes, because there's nothing that prevents said section of code from
triggering the exact problem you're trying to avoid.

The real fix is making sure the problem cannot happen to begin with,
which is done by fixing the interaction between the VCPU and that
emulation thread thing.
Peter Zijlstra Sept. 25, 2017, 8:58 a.m. UTC | #21
On Sun, Sep 24, 2017 at 11:22:38PM -0300, Marcelo Tosatti wrote:
> On Fri, Sep 22, 2017 at 03:01:41PM +0200, Peter Zijlstra wrote:
> > On Fri, Sep 22, 2017 at 09:40:05AM -0300, Marcelo Tosatti wrote:
> > 
> > > Are you arguing its invalid for the following application to execute on 
> > > housekeeping vcpu of a realtime system:
> > > 
> > > void main(void)
> > > {
> > > 
> > >     submit_IO();
> > >     do {
> > >        computation();
> > >     } while (!interrupted());
> > > }
> > > 
> > > Really?
> > 
> > No. Nobody cares about random crap tasks.
> 
> Nobody has control over all code that runs in userspace Peter. And not
> supporting a valid sequence of steps because its "crap" (whatever your 
> definition of crap is) makes no sense.
> 
> It might be that someone decides to do the above (i really can't see 
> any actual reasoning i can follow and agree on your "its crap"
> argument), this truly seems valid to me.

We don't care what other tasks do. This isn't a hard thing to
understand. You're free to run whatever junk on your CPUs. This doesn't
(much) affect the correct functioning of RT tasks that you also run
there.

> So lets follow the reasoning steps:
> 
> 1) "NACK, because you didnt understand the problem".
> 
> 	OK thats an invalid NACK, you did understand the problem
> 	later and now your argument is the following.

It was a NACK because you wrote a shit changelog that didn't explain the
problem. But yes.

> 2) "NACK, because all VCPUs should be SCHED_FIFO all the time".

Very much, if you want a RT guest, all VCPU's should run at RT prio and
the interaction between the VCPUs and all supporting threads should be
designed for RT.

> But the existence of this code path from userspace:
> 
>   submit_IO();
>   do {
>      computation();
>   } while (!interrupted());
> 
> Its a supported code sequence, and works fine in a non-RT environment.

Who cares about that chunk of code? Have you forgotten to mention that
this is the form of the emulation thread?

> Therefore it should work on an -RT environment.

No, this is where you're wrong. That code works on -RT as long as you
don't expect it to be a valid RT program. -RT kernels will run !RT stuff
just fine.

But the moment you run a program as RT (FIFO/RR/DEADLINE) it had better
damn well be a valid RT program, and that excludes a lot of code.

> So please give me some logical reasoning for the NACK (people can live with
> it, but it has to be good enough to justify the decreasing packing of 
> guests in pCPUs):
> 
> 1) "Voodoo programming" (its hard for me to parse what you mean with
> that... do you mean you foresee this style of priority boosting causing
> problems in the future? Can you give an example?).

Your 'solution' only works if you sacrifice a goat on a full moon,
because only that ensures the guest doesn't VM_EXIT and cause the
self-same problem while you've boosted it.

Because you've _not_ fixed the actual problem!

> Is there fundamentally wrong about priority boosting in spinlock
> sections, or this particular style of priority boosting is wrong?

Yes, its fundamentally crap, because it doesn't guarantee anything.

RT is about making guarantees. An RT program needs a provable forward
progress guarantee at the very least. It including a priority inversion
disqualifies it from being sane.

> 2) "Pollution of the kernel code path". That makes sense to me, if thats
> whats your concerned about.

Also..

> 3) "Reduction of spinlock performance". Its true, but for NFV workloads
> people don't care about.

I've no idea what an NFV is.

> 4) "All vcpus should be SCHED_FIFO all the time". OK, why is that?
> What dictates that to be true?

Solid engineering. Does the guest kernel function as a bunch of
independent CPUs or does it assume all CPUs are equal and have strong
inter-cpu connections? Linux is the latter, therefore if one VCPU is RT
they all should be.

Dammit, you even recognise this in the spin-owner preemption issue
you're hacking around, but then go arse-about-face 'solving' it.

> What the patch does is the following:
> It reduces the window where SCHED_FIFO is applied vcpu0
> to those were a spinlock is shared between -RT vcpus and vcpu0
> (why: because otherwise, when the emulator thread is sharing a
> pCPU with vcpu0, its unable to generate interrupts vcpu0).
> 
> And its being rejected because:

Its not fixing the actual problem. The real problem is the prio
inversion between the VCPU and the emulation thread, _That_ is what
needs fixing.

Rewrite that VCPU/emulator interaction to be a proper RT construct.

Then you can run the VCPU at RT prio as you should, and the guest can
issue all the VM_EXIT things it wants at any time and still function
correctly.
Peter Zijlstra Sept. 25, 2017, 9:13 a.m. UTC | #22
On Sun, Sep 24, 2017 at 11:57:53PM -0300, Marcelo Tosatti wrote:
> I think you are missing the following point:
> 
> "vcpu0 can be interrupted when its not in a spinlock protected section, 
> otherwise it can't."
> 
> So you _have_ to communicate to the host when the guest enters/leaves a
> critical section.
> 
> So this point of "everything needs to be RT and the priorities must be
> designed carefully", is this: 
> 
> 	WHEN in spinlock protected section (more specifically, when 
> 	spinlock protected section _shared with realtime vcpus_),
> 
> 	priority of vcpu0 > priority of emulator thread
> 
> 	OTHERWISE
> 
> 	priority of vcpu0 < priority of emulator thread.
> 
> (*)
> 
> So emulator thread can interrupt and inject interrupts to vcpu0.

spinlock protected regions are not everything. What about lock-free
constructs where CPU's spin-wait on one another (there's plenty).

And I'm clearly ignorant of how this emulation thread works, but why
would it run for a long time? Either it is needed for forward progress
of the VCPU or its not. If its not, it shouldn't run.
Thomas Gleixner Sept. 25, 2017, 10:41 a.m. UTC | #23
On Sun, 24 Sep 2017, Marcelo Tosatti wrote:
> On Fri, Sep 22, 2017 at 03:01:41PM +0200, Peter Zijlstra wrote:
> What the patch does is the following:
> It reduces the window where SCHED_FIFO is applied vcpu0
> to those were a spinlock is shared between -RT vcpus and vcpu0
> (why: because otherwise, when the emulator thread is sharing a
> pCPU with vcpu0, its unable to generate interrupts vcpu0).
> 
> And its being rejected because:
> Please fill in.

Your patch is just papering over one particular problem, but it's not
fixing the root cause. That's the worst engineering approach and we all
know how fast this kind of crap falls over.

There are enough other issues which can cause starvation of the RT VCPUs
when the housekeeping VCPU is preempted, not just the particular problem
which you observed.

Back then when I did the first prototype of RT in KVM, I made it entirely
clear, that you have to spend one physical CPU for _each_ VCPU, independent
whether the VCPU is reserved for RT workers or the housekeeping VCPU. The
emulator thread needs to run on a separate physical CPU.

If you want to run the housekeeping VCPU and the emulator thread on the
same physical CPU then you have to make sure that both the emulator and the
housekeeper side of affairs are designed and implemented with RT in
mind. As long as that is not the case, you simply cannot run them on the
same physical CPU. RT is about guarantees and guarantees cannot be achieved
with bandaid engineering.

It's that simple, end of story.

Thanks,

	tglx
Paolo Bonzini Sept. 25, 2017, 3:12 p.m. UTC | #24
On 25/09/2017 11:13, Peter Zijlstra wrote:
> On Sun, Sep 24, 2017 at 11:57:53PM -0300, Marcelo Tosatti wrote:
>> I think you are missing the following point:
>>
>> "vcpu0 can be interrupted when its not in a spinlock protected section, 
>> otherwise it can't."

Who says that?  Certainly a driver can dedicate a single VCPU to
periodic polling of the device, in such a way that the polling does not
require a spinlock.

>> So you _have_ to communicate to the host when the guest enters/leaves a
>> critical section.
>>
>> So this point of "everything needs to be RT and the priorities must be
>> designed carefully", is this: 
>>
>> 	WHEN in spinlock protected section (more specifically, when 
>> 	spinlock protected section _shared with realtime vcpus_),
>>
>> 	priority of vcpu0 > priority of emulator thread
>>
>> 	OTHERWISE
>>
>> 	priority of vcpu0 < priority of emulator thread.

This is _not_ designed carefully, this is messy.

The emulator thread can interrupt the VCPU thread, so it has to be at
higher RT priority (+ priority inheritance of mutexes).  Once you have
done that we can decide on other approaches that e.g. let you get more
sharing by placing housekeeping VCPUs at SCHED_NORMAL or SCHED_RR.

>> So emulator thread can interrupt and inject interrupts to vcpu0.
> 
> spinlock protected regions are not everything. What about lock-free
> constructs where CPU's spin-wait on one another (there's plenty).
> 
> And I'm clearly ignorant of how this emulation thread works, but why
> would it run for a long time? Either it is needed for forward progress
> of the VCPU or its not. If its not, it shouldn't run.

The emulator thread 1) should not run for long periods of time indeed,
and 2) is needed for forward progress of the VCPU.  So it has to be
at a higher RT priority.  I agree with Peter, sorry.  Spinlocks are a red
herring here.

Paolo
Konrad Rzeszutek Wilk Sept. 25, 2017, 4:20 p.m. UTC | #25
> I think you are missing the following point:
> 
> "vcpu0 can be interrupted when its not in a spinlock protected section, 
> otherwise it can't."
> 
> So you _have_ to communicate to the host when the guest enters/leaves a
> critical section.


How would this work for Windows or FreeBSD?
Jan Kiszka Sept. 25, 2017, 6:28 p.m. UTC | #26
On 2017-09-25 12:41, Thomas Gleixner wrote:
> On Sun, 24 Sep 2017, Marcelo Tosatti wrote:
>> On Fri, Sep 22, 2017 at 03:01:41PM +0200, Peter Zijlstra wrote:
>> What the patch does is the following:
>> It reduces the window where SCHED_FIFO is applied vcpu0
>> to those were a spinlock is shared between -RT vcpus and vcpu0
>> (why: because otherwise, when the emulator thread is sharing a
>> pCPU with vcpu0, its unable to generate interrupts vcpu0).
>>
>> And its being rejected because:
>> Please fill in.
> 
> Your patch is just papering over one particular problem, but it's not
> fixing the root cause. That's the worst engineering approach and we all
> know how fast this kind of crap falls over.
> 
> There are enough other issues which can cause starvation of the RT VCPUs
> when the housekeeping VCPU is preempted, not just the particular problem
> which you observed.
> 
> Back then when I did the first prototype of RT in KVM, I made it entirely
> clear, that you have to spend one physical CPU for _each_ VCPU, independent
> whether the VCPU is reserved for RT workers or the housekeeping VCPU. The
> emulator thread needs to run on a separate physical CPU.
> 
> If you want to run the housekeeping VCPU and the emulator thread on the
> same physical CPU then you have to make sure that both the emulator and the
> housekeeper side of affairs are designed and implemented with RT in
> mind. As long as that is not the case, you simply cannot run them on the
> same physical CPU. RT is about guarantees and guarantees cannot be achieved
> with bandaid engineering.

It's even more complicated for the guest: It needs to be aware of the
latencies its interaction with a VM - instead of a real machine - may
cause while being in whatever critical sections. That's an additional
design dimension that would be very hard to establish and maintain, even
in Linux.

The only way around that is to truly decouple guest CPUs via full core
isolation inside the Linux guest and have your RT guest application
exploit this partitioning, e.g. by using lock-less inter-core
communication without kernel help.
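
For illustration, the kind of lock-less inter-core channel meant here can
be as simple as a single-producer/single-consumer ring in memory shared
between two isolated guest CPUs (a minimal sketch, all names made up):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define RING_SIZE 256	/* power of two; size is an assumption */

struct spsc_ring {
	_Atomic unsigned int head;	/* written by the producer core */
	_Atomic unsigned int tail;	/* written by the consumer core */
	void *slots[RING_SIZE];
};

static bool ring_push(struct spsc_ring *r, void *msg)
{
	unsigned int head = atomic_load_explicit(&r->head, memory_order_relaxed);
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (head - tail == RING_SIZE)
		return false;		/* full */
	r->slots[head & (RING_SIZE - 1)] = msg;
	atomic_store_explicit(&r->head, head + 1, memory_order_release);
	return true;
}

static void *ring_pop(struct spsc_ring *r)
{
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
	unsigned int head = atomic_load_explicit(&r->head, memory_order_acquire);
	void *msg;

	if (tail == head)
		return NULL;		/* empty */
	msg = r->slots[tail & (RING_SIZE - 1)];
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
	return msg;
}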

The reason I was playing with PV-sched back then was to explore how you
could map the guest's task prio dynamically on its host vcpu. That
involved boosting whenever en event (aka irq) came in for the guest
vcpu. It turned out to be a more or less working solution looking for a
real-world problem.

Jan
Marcelo Tosatti Sept. 26, 2017, 10:49 p.m. UTC | #27
On Mon, Sep 25, 2017 at 05:12:42PM +0200, Paolo Bonzini wrote:
> On 25/09/2017 11:13, Peter Zijlstra wrote:
> > On Sun, Sep 24, 2017 at 11:57:53PM -0300, Marcelo Tosatti wrote:
> >> I think you are missing the following point:
> >>
> >> "vcpu0 can be interrupted when its not in a spinlock protected section, 
> >> otherwise it can't."
> 
> Who says that?  Certainly a driver can dedicate a single VCPU to
> periodic polling of the device, in such a way that the polling does not
> require a spinlock.

This sequence:


VCPU-0					VCPU-1 (running realtime workload)

takes spinlock A
scheduled out				
					spinlock(A) (busy spins until
						     VCPU-0 is scheduled
						     back in)
scheduled in
finishes execution of 
code under protected section
releases spinlock(A)			

					takes spinlock(A)

You get that point, right?

(*)

> >> So you _have_ to communicate to the host when the guest enters/leaves a
> >> critical section.
> >>
> >> So this point of "everything needs to be RT and the priorities must be
> >> designed carefully", is this: 
> >>
> >> 	WHEN in spinlock protected section (more specifically, when 
> >> 	spinlock protected section _shared with realtime vcpus_),
> >>
> >> 	priority of vcpu0 > priority of emulator thread
> >>
> >> 	OTHERWISE
> >>
> >> 	priority of vcpu0 < priority of emulator thread.
> 
> This is _not_ designed carefully, this is messy.

This is very precise to me. What is "messy" about it? (It's clearly
defined.)

> The emulator thread can interrupt the VCPU thread, so it has to be at
> higher RT priority (+ priority inheritance of mutexes).  

It can only do that _when_ the VCPU thread is not running a critical
section which a higher priority task depends on.

> Once you have
> done that we can decide on other approaches that e.g. let you get more
> sharing by placing housekeeping VCPUs at SCHED_NORMAL or SCHED_RR.

Well, if someone looks at (*), he sees that if the interruption delay
(the length between "scheduled out" and "scheduled in" in that diagram)
exceeds a given threshold, that causes the realtime vcpu1 to also
exceed the latency threshold of its realtime task.

So when you say "The emulator thread can interrupt the VCPU thread", 
you're saying that it has to be modified to interrupt for a maximum
amount of time (say 15us).

Is that what you are suggesting?

> >> So emulator thread can interrupt and inject interrupts to vcpu0.
> > 
> > spinlock protected regions are not everything. What about lock-free
> > constructs where CPU's spin-wait on one another (there's plenty).
> > 
> > And I'm clearly ignorant of how this emulation thread works, but why
> > would it run for a long time? Either it is needed for forward progress
> > of the VCPU or its not. If its not, it shouldn't run.
> 
> The emulator thread 1) should not run for long period of times indeed,
> and 2) it is needed for forward progress of the VCPU.  So it has to be
> at higher RT priority.  I agree with Peter, sorry.  Spinlocks are a red
> herring here.
> 
> Paolo

Paolo, you don't control how many interruptions of the emulator thread
happen per second. So if you let the emulator thread interrupt the
vcpu thread at all times, without some kind of bound on
these interruptions per time unit, you have a similar
problem as in (*) (where the realtime task is scheduled).

Another approach to the problem was suggested to OpenStack.
Marcelo Tosatti Sept. 26, 2017, 11:22 p.m. UTC | #28
On Mon, Sep 25, 2017 at 11:13:16AM +0200, Peter Zijlstra wrote:
> On Sun, Sep 24, 2017 at 11:57:53PM -0300, Marcelo Tosatti wrote:
> > I think you are missing the following point:
> > 
> > "vcpu0 can be interrupted when its not in a spinlock protected section, 
> > otherwise it can't."
> > 
> > So you _have_ to communicate to the host when the guest enters/leaves a
> > critical section.
> > 
> > So this point of "everything needs to be RT and the priorities must be
> > designed carefully", is this: 
> > 
> > 	WHEN in spinlock protected section (more specifically, when 
> > 	spinlock protected section _shared with realtime vcpus_),
> > 
> > 	priority of vcpu0 > priority of emulator thread
> > 
> > 	OTHERWISE
> > 
> > 	priority of vcpu0 < priority of emulator thread.
> > 
> > (*)
> > 
> > So emulator thread can interrupt and inject interrupts to vcpu0.
> 
> spinlock protected regions are not everything. What about lock-free
> constructs where CPU's spin-wait on one another (there's plenty).

True. Could add the "I am in a critical section" notifier to those
constructs as well, which would call the hypercall.
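
For reference, such a notifier could be a pair of trivial wrappers
around the same hypercall and static key used in the patch below (a
sketch with made-up helper names, mirroring what __raw_spin_lock()
does in the patch):

	#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
	/* Sketch only: hypothetical helpers, not part of the posted patch. */
	static inline void kvm_rt_critical_begin(void)
	{
		/* raise this vcpu to FIFO priority on the host */
		if (static_branch_unlikely(&kvm_fifo_hc_key))
			kvm_hypercall1(KVM_HC_RT_PRIO, 0x1);
	}

	static inline void kvm_rt_critical_end(void)
	{
		/* drop back to normal priority */
		if (static_branch_unlikely(&kvm_fifo_hc_key))
			kvm_hypercall1(KVM_HC_RT_PRIO, 0);
	}
	#else
	static inline void kvm_rt_critical_begin(void) { }
	static inline void kvm_rt_critical_end(void) { }
	#endif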

> And I'm clearly ignorant of how this emulation thread works, but why
> would it run for a long time? Either it is needed for forward progress
> of the VCPU or its not. If its not, it shouldn't run.

It is needed only when not in a critical section.
And when in a critical section, the vcpu should not get interrupted.

But the solution of reserving one pCPU per socket to run all emulator
threads achieves reasonable packing numbers without the downsides
of the hypercall.
Paolo Bonzini Sept. 27, 2017, 9:37 a.m. UTC | #29
On 27/09/2017 00:49, Marcelo Tosatti wrote:
> On Mon, Sep 25, 2017 at 05:12:42PM +0200, Paolo Bonzini wrote:
>> On 25/09/2017 11:13, Peter Zijlstra wrote:
>>> On Sun, Sep 24, 2017 at 11:57:53PM -0300, Marcelo Tosatti wrote:
>>>> I think you are missing the following point:
>>>>
>>>> "vcpu0 can be interrupted when its not in a spinlock protected section, 
>>>> otherwise it can't."
>>
>> Who says that?  Certainly a driver can dedicate a single VCPU to
>> periodic polling of the device, in such a way that the polling does not
>> require a spinlock.
> 
> This sequence:
> 
> 
> VCPU-0					VCPU-1 (running realtime workload)
> 
> takes spinlock A
> scheduled out				
> 					spinlock(A) (busy spins until
> 						     VCPU-0 is scheduled
> 						     back in)
> scheduled in
> finishes execution of 
> code under protected section
> releases spinlock(A)			
> 
> 					takes spinlock(A)

Yes, but then you have

                                        busy waits for flag to be set
  polls device
  scheduled out
  device receives data
                                        ...
  scheduled in
  set flag

or

  check for work to do
  scheduled out
                                        submit work
                                        busy waits for work to complete
  scheduled in
  do work

None of which have anything to do with spinlocks.  They're just
different ways to get priority inversion, and this...

>>>> So this point of "everything needs to be RT and the priorities must be
>>>> designed carefully", is this: 
>>>>
>>>> 	WHEN in spinlock protected section (more specifically, when 
>>>> 	spinlock protected section _shared with realtime vcpus_),
>>>>
>>>> 	priority of vcpu0 > priority of emulator thread
>>>>
>>>> 	OTHERWISE
>>>>
>>>> 	priority of vcpu0 < priority of emulator thread.
>>
>> This is _not_ designed carefully, this is messy.
> 
> This is very precise to me. What is "messy" about it? (its clearly
> defined).

... it's not how you design RT systems to avoid priority inversion.
It's just solving an instance of the issue.

>> The emulator thread can interrupt the VCPU thread, so it has to be at
>> higher RT priority (+ priority inheritance of mutexes).  
> 
> It can only do that _when_ the VCPU thread is not running a critical
> section which a higher priority task depends on.

All VCPUs must have the same priority.  There's no such thing as a
housekeeping VCPU.

There could be one sacrificial VCPU that you place on the same physical
CPU as the emulator thread, but that's it.  The whole system must run
smoothly even if you place those on the same physical CPU, without any
PV hacks.

> So when you say "The emulator thread can interrupt the VCPU thread", 
> you're saying that it has to be modified to interrupt for a maximum
> amount of time (say 15us).
> 
> Is that what you are suggesting?

This is correct.  But I think you are missing a fundamental point that
is not specific to virt.  If thread 1 can trigger an event in thread 2,
and thread 2 runs at priority N, thread 1 must be placed at priority >N.
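
In plain pthread terms (a sketch only, with placeholder thread handles;
this is not QEMU or libvirt code), the rule looks like:

	#include <pthread.h>
	#include <sched.h>

	/* Sketch: vcpu_thread and emulator_thread are placeholders for
	 * threads created elsewhere; error handling omitted for brevity. */
	static void set_rt_priorities(pthread_t vcpu_thread, pthread_t emulator_thread)
	{
		struct sched_param vcpu_prio = { .sched_priority = 1 };
		struct sched_param emu_prio  = { .sched_priority = 2 };

		/* the VCPU runs at SCHED_FIFO priority N ... */
		pthread_setschedparam(vcpu_thread, SCHED_FIFO, &vcpu_prio);
		/* ... and the thread that can wake it runs at priority > N */
		pthread_setschedparam(emulator_thread, SCHED_FIFO, &emu_prio);
	}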

In this case, the emulator thread can signal I/O completion to the VCPU:
it doesn't matter if the VCPU is polling or using interrupts, the
emulator thread must be placed at higher priority than the VCPU.  This
is the root cause of the SeaBIOS issue that you were seeing.  Yes, it
was me who suggested moving VCPU0 and the emulator thread to
SCHED_NORMAL, but that was just a bandaid until the real fix was done
(which is to set up SCHED_FIFO priorities correctly and use PI mutexes).
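
For completeness, making a userspace mutex priority-inheriting is a
one-line attribute change (standard pthread API; a sketch, not a QEMU
excerpt):

	#include <pthread.h>

	/* Sketch: a PI mutex boosts a low-priority holder while a SCHED_FIFO
	 * thread is blocked on it, avoiding unbounded priority inversion. */
	static int init_pi_mutex(pthread_mutex_t *m)
	{
		pthread_mutexattr_t attr;
		int ret;

		pthread_mutexattr_init(&attr);
		ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
		if (!ret)
			ret = pthread_mutex_init(m, &attr);
		pthread_mutexattr_destroy(&attr);
		return ret;
	}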

In fact, this is not specific to the emulator thread.  It applies just
as well to vhost threads, to QEMU iothreads, even to the KVM PIT
emulation thread if the guest were using it.

> Paolo, you don't control how many interruptions of the emulator thread
> happen per second.

Indeed these non-VCPU threads should run rarely and for a small
amount of time only, to achieve bounded latencies in the VCPUs.  But if
they don't, those are simply bugs and we fix them.  In fact, sources of
frequent interruptions have all been fixed or moved outside the main
thread; for example, disks can use separate iothreads.  Configuration
problems are also bugs, just in a different place.

For a very basic VM that I've just tried (whatever QEMU does by default,
the only tweak was not using the QEMU GUI), there is exactly one
interruption per second when the VM is idle.  That interruption in turn
is caused by udev periodically probing the guest CDROM (!).  So that's a
configuration problem: remove the CDROM, and it does one interruption
every 10-30 seconds, again all of them guest triggered.

Again: if you have many interruptions, it's not a flaw in KVM or QEMU's
design, it's just that someone is doing something stupid.  It could be
the guest (e.g. unnecessary devices or daemons as in the example above),
QEMU (e.g. the RTC emulation used to trigger QEMU timers twice a second
just to increment the clock), or the management (e.g. polling "is the VM
running" 50 times per second).  But it can and must be fixed.

The design should be the basic one: attribute the right priority to each
thread and things just work.

Paolo

> So if you let the emulator thread interrupt the
> emulator thread at all times, without some kind of bounding 
> of these interruptions per time unit, you have a similar
> problem as (*) (where the realtime task is scheduled).
>
Marcelo Tosatti Sept. 28, 2017, 12:44 a.m. UTC | #30
On Wed, Sep 27, 2017 at 11:37:48AM +0200, Paolo Bonzini wrote:
> On 27/09/2017 00:49, Marcelo Tosatti wrote:
> > On Mon, Sep 25, 2017 at 05:12:42PM +0200, Paolo Bonzini wrote:
> >> On 25/09/2017 11:13, Peter Zijlstra wrote:
> >>> On Sun, Sep 24, 2017 at 11:57:53PM -0300, Marcelo Tosatti wrote:
> >>>> I think you are missing the following point:
> >>>>
> >>>> "vcpu0 can be interrupted when its not in a spinlock protected section, 
> >>>> otherwise it can't."
> >>
> >> Who says that?  Certainly a driver can dedicate a single VCPU to
> >> periodic polling of the device, in such a way that the polling does not
> >> require a spinlock.
> > 
> > This sequence:
> > 
> > 
> > VCPU-0					VCPU-1 (running realtime workload)
> > 
> > takes spinlock A
> > scheduled out				
> > 					spinlock(A) (busy spins until
> > 						     VCPU-0 is scheduled
> > 						     back in)
> > scheduled in
> > finishes execution of 
> > code under protected section
> > releases spinlock(A)			
> > 
> > 					takes spinlock(A)
> 
> Yes, but then you have
> 
>                                         busy waits for flag to be set
>   polls device
>   scheduled out
>   device receives data
>                                         ...
>   scheduled in
>   set flag
> 
> or
> 
>   check for work to do
>   scheduled out
>                                         submit work
>                                         busy waits for work to complete
>   scheduled in
>   do work
> 
> None of which have anything to do with spinlocks.  They're just
> different ways to get priority inversion, and this...
> 
> >>>> So this point of "everything needs to be RT and the priorities must be
> >>>> designed carefully", is this: 
> >>>>
> >>>> 	WHEN in spinlock protected section (more specifically, when 
> >>>> 	spinlock protected section _shared with realtime vcpus_),
> >>>>
> >>>> 	priority of vcpu0 > priority of emulator thread
> >>>>
> >>>> 	OTHERWISE
> >>>>
> >>>> 	priority of vcpu0 < priority of emulator thread.
> >>
> >> This is _not_ designed carefully, this is messy.
> > 
> > This is very precise to me. What is "messy" about it? (its clearly
> > defined).
> 
> ... it's not how you design RT systems to avoid priority inversion.
> It's just solving an instance of the issue.
> 
> >> The emulator thread can interrupt the VCPU thread, so it has to be at
> >> higher RT priority (+ priority inheritance of mutexes).  
> > 
> > It can only do that _when_ the VCPU thread is not running a critical
> > section which a higher priority task depends on.
> 
> All VCPUs must have the same priority.  There's no such thing as a
> housekeeping VCPU.
> 
> There could be one sacrificial VCPU that you place on the same physical
> CPU as the emulator thread, but that's it.  The whole system must run
> smoothly even if you place those on the same physical CPU, without any
> PV hacks.
> 
> > So when you say "The emulator thread can interrupt the VCPU thread", 
> > you're saying that it has to be modified to interrupt for a maximum
> > amount of time (say 15us).
> > 
> > Is that what you are suggesting?
> 
> This is correct.  But I think you are missing a fundamental point, that
> is not specific to virt.  If a thread 1 can trigger an event in thread
> 2, and thread 2 runs at priority N, thread 1 must be placed at priority >N.
> 
> In this case, the emulator thread can signal I/O completion to the VCPU:
> it doesn't matter if the VCPU is polling or using interrupts, the
> emulator thread must be placed at higher priority than the VCPU.  This
> is the root cause of the SeaBIOS issue that you were seeing.  Yes, it
> was me who suggested moving VCPU0 and the emulator thread to
> SCHED_NORMAL, but that was just a bandaid until the real fix was done
> (which is to set up SCHED_FIFO priorities correctly and use PI mutexes).
> 
> In fact, this is not specific to the emulator thread.  It applies just
> as well to vhost threads, to QEMU iothreads, even to the KVM PIT
> emulation thread if the guest were using it.
> 
> > Paolo, you don't control how many interruptions of the emulator thread
> > happen per second.
> 
> Indeed these non-VCPU threads should indeed run rarely and for a small
> amount of time only to achieve bounded latencies in the VCPUs.  But if
> they don't, those are simply bugs and we fix them.  In fact, sources of
> frequent interruptions have all been fixed or moved outside the main
> thread; for example, disks can use separate iothreads.  Configuration
> problems are also bugs, just in a different place.


> 
> For a very basic VM that I've just tried (whatever QEMU does by default,
> only tweak was not using the QEMU GUI), there is exactly one
> interruption per second when the VM is idle.  That interruption in turn
> is caused by udev periodically probing the guest CDROM (!).  So that's a
> configuration problem: remove the CDROM, and it does one interruption
> every 10-30 seconds, again all of them guest triggered.
> 
> Again: if you have many interruptions, it's not a flaw in KVM or QEMU's
> design, it's just that someone is doing something stupid.  It could be
> the guest (e.g. unnecessary devices or daemons as in the example above),
> QEMU (e.g. the RTC emulation used to trigger QEMU timers twice a second
> just to increment the clock), or the management (e.g. polling "is the VM
> running" 50 times per second).  But it can and must be fixed.

No, I mean you can run anything on VCPU-0 (it is valid to do that).
And that "anything" can generate 1 interrupt per second, 1,000 or 10,000
interrupts per second, which are all valid things to do.

"I can't run a kernel compilation on VCPU-0 because that will impact
latency on the realtime VCPU-1" is not acceptable.

> The design should be the basic one: attribute the right priority to each
> thread and things just work.
> 
> Paolo
> 
> > So if you let the emulator thread interrupt the
> > emulator thread at all times, without some kind of bounding 
> > of these interruptions per time unit, you have a similar
> > problem as (*) (where the realtime task is scheduled).
> >
Paolo Bonzini Sept. 28, 2017, 7:22 a.m. UTC | #31
On 28/09/2017 02:44, Marcelo Tosatti wrote:
>> Again: if you have many interruptions, it's not a flaw in KVM or QEMU's
>> design, it's just that someone is doing something stupid.  It could be
>> the guest (e.g. unnecessary devices or daemons as in the example above),
>> QEMU (e.g. the RTC emulation used to trigger QEMU timers twice a second
>> just to increment the clock), or the management (e.g. polling "is the VM
>> running" 50 times per second).  But it can and must be fixed.
>
> No, i mean you can run anything in VCPU-0 (it is valid to do that).
> And that "anything" can generate 1 interrupt per second, 1000 or 10.000
> interrupts per second. Which are all valid things to be done.
> 
> "I can't run a kernel compilation on VCPU-0 because that will impact
> latency on the realtime VCPU-1" is not acceptable.

That shouldn't happen.  Sources of frequent interruptions have all been
fixed or moved outside the main thread.

If there are more left, report the bug and we'll see how to fix it in
userspace.

Paolo
Marcelo Tosatti Sept. 28, 2017, 9:35 p.m. UTC | #32
On Thu, Sep 28, 2017 at 09:22:02AM +0200, Paolo Bonzini wrote:
> On 28/09/2017 02:44, Marcelo Tosatti wrote:
> >> Again: if you have many interruptions, it's not a flaw in KVM or QEMU's
> >> design, it's just that someone is doing something stupid.  It could be
> >> the guest (e.g. unnecessary devices or daemons as in the example above),
> >> QEMU (e.g. the RTC emulation used to trigger QEMU timers twice a second
> >> just to increment the clock), or the management (e.g. polling "is the VM
> >> running" 50 times per second).  But it can and must be fixed.
> >
> > No, i mean you can run anything in VCPU-0 (it is valid to do that).
> > And that "anything" can generate 1 interrupt per second, 1000 or 10.000
> > interrupts per second. Which are all valid things to be done.
> > 
> > "I can't run a kernel compilation on VCPU-0 because that will impact
> > latency on the realtime VCPU-1" is not acceptable.
> 
> That shouldn't happen.  Sources of frequent interruptions have all been
> fixed or moved outside the main thread.
> 
> If there are more left, report the bug and we'll see how to fix it in
> userspace.
> 
> Paolo

What should not happen? The generation of 10,000 interrupts per second
(say, disk I/O completion) on a given workload?
Marcelo Tosatti Sept. 28, 2017, 9:41 p.m. UTC | #33
On Thu, Sep 28, 2017 at 06:35:08PM -0300, Marcelo Tosatti wrote:
> On Thu, Sep 28, 2017 at 09:22:02AM +0200, Paolo Bonzini wrote:
> > On 28/09/2017 02:44, Marcelo Tosatti wrote:
> > >> Again: if you have many interruptions, it's not a flaw in KVM or QEMU's
> > >> design, it's just that someone is doing something stupid.  It could be
> > >> the guest (e.g. unnecessary devices or daemons as in the example above),
> > >> QEMU (e.g. the RTC emulation used to trigger QEMU timers twice a second
> > >> just to increment the clock), or the management (e.g. polling "is the VM
> > >> running" 50 times per second).  But it can and must be fixed.
> > >
> > > No, i mean you can run anything in VCPU-0 (it is valid to do that).
> > > And that "anything" can generate 1 interrupt per second, 1000 or 10.000
> > > interrupts per second. Which are all valid things to be done.
> > > 
> > > "I can't run a kernel compilation on VCPU-0 because that will impact
> > > latency on the realtime VCPU-1" is not acceptable.
> > 
> > That shouldn't happen.  Sources of frequent interruptions have all been
> > fixed or moved outside the main thread.
> > 
> > If there are more left, report the bug and we'll see how to fix it in
> > userspace.
> > 
> > Paolo
> 
> What should not happen? The generation of 10.000 interrupts per second
> (say disk IO completion) on a given workload ?

Are you suggesting that workloads in vcpu-0 should be limited in the number
of interrupts (and the duration of each interruption), so that the realtime
vcpu-1's latency requirement is met?

I don't see how that suggestion can work: even if you make each
exit small, the frequency of them will cause a latency violation on
vcpu-1.
Paolo Bonzini Sept. 29, 2017, 8:18 a.m. UTC | #34
On 28/09/2017 23:35, Marcelo Tosatti wrote:
> On Thu, Sep 28, 2017 at 09:22:02AM +0200, Paolo Bonzini wrote:
>> On 28/09/2017 02:44, Marcelo Tosatti wrote:
>>>> Again: if you have many interruptions, it's not a flaw in KVM or QEMU's
>>>> design, it's just that someone is doing something stupid.  It could be
>>>> the guest (e.g. unnecessary devices or daemons as in the example above),
>>>> QEMU (e.g. the RTC emulation used to trigger QEMU timers twice a second
>>>> just to increment the clock), or the management (e.g. polling "is the VM
>>>> running" 50 times per second).  But it can and must be fixed.
>>>
>>> No, i mean you can run anything in VCPU-0 (it is valid to do that).
>>> And that "anything" can generate 1 interrupt per second, 1000 or 10.000
>>> interrupts per second. Which are all valid things to be done.
>>>
>>> "I can't run a kernel compilation on VCPU-0 because that will impact
>>> latency on the realtime VCPU-1" is not acceptable.
>>
>> That shouldn't happen.  Sources of frequent interruptions have all been
>> fixed or moved outside the main thread.
>>
>> If there are more left, report the bug and we'll see how to fix it in
>> userspace.
> 
> What should not happen? The generation of 10.000 interrupts per second
> (say disk IO completion) on a given workload ?

If you know you have this kind of disk workload, you must use virtio-blk or
virtio-scsi with iothreads and place the iothreads on their own physical
CPUs.
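
For example (illustrative command line only; image, drive and iothread
names are placeholders), such a disk would be wired to a dedicated
iothread:

	qemu-system-x86_64 ... \
	    -object iothread,id=iothread0 \
	    -drive file=disk.img,format=raw,if=none,id=drive0 \
	    -device virtio-blk-pci,drive=drive0,iothread=iothread0

with iothread0 then pinned to a housekeeping physical CPU from the host
side (for instance via libvirt's <iothreadpin>, or by affining the
thread id reported by QMP's query-iothreads).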

Among "run arbitrary workloads", "run real-time workloads", "pack stuff
into as few physical CPUs as possible", you can only pick two.

Paolo
Marcelo Tosatti Sept. 29, 2017, 4:40 p.m. UTC | #35
On Fri, Sep 29, 2017 at 10:18:25AM +0200, Paolo Bonzini wrote:
> On 28/09/2017 23:35, Marcelo Tosatti wrote:
> > On Thu, Sep 28, 2017 at 09:22:02AM +0200, Paolo Bonzini wrote:
> >> On 28/09/2017 02:44, Marcelo Tosatti wrote:
> >>>> Again: if you have many interruptions, it's not a flaw in KVM or QEMU's
> >>>> design, it's just that someone is doing something stupid.  It could be
> >>>> the guest (e.g. unnecessary devices or daemons as in the example above),
> >>>> QEMU (e.g. the RTC emulation used to trigger QEMU timers twice a second
> >>>> just to increment the clock), or the management (e.g. polling "is the VM
> >>>> running" 50 times per second).  But it can and must be fixed.
> >>>
> >>> No, i mean you can run anything in VCPU-0 (it is valid to do that).
> >>> And that "anything" can generate 1 interrupt per second, 1000 or 10.000
> >>> interrupts per second. Which are all valid things to be done.
> >>>
> >>> "I can't run a kernel compilation on VCPU-0 because that will impact
> >>> latency on the realtime VCPU-1" is not acceptable.
> >>
> >> That shouldn't happen.  Sources of frequent interruptions have all been
> >> fixed or moved outside the main thread.
> >>
> >> If there are more left, report the bug and we'll see how to fix it in
> >> userspace.
> > 
> > What should not happen? The generation of 10.000 interrupts per second
> > (say disk IO completion) on a given workload ?
> 
> If you know you have this kind disk workload, you must use virtio-blk or
> virtio-scsi with iothreads and place the iothreads on their own physical
> CPUs.
> 
> Among "run arbitrary workloads", "run real-time workloads", "pack stuff
> into as few physical CPUs as possible", you can only pick two.
> 
> Paolo

That's not the state of things (userspace in vcpu-0 is not specially tailored
to avoid violating latencies in vcpu-1): that is, not all user-triggered
actions can be verified.

Think "updatedb", and so on...
Paolo Bonzini Sept. 29, 2017, 5:05 p.m. UTC | #36
On 29/09/2017 18:40, Marcelo Tosatti wrote:
>> If you know you have this kind disk workload, you must use virtio-blk or
>> virtio-scsi with iothreads and place the iothreads on their own physical
>> CPUs.
>>
>> Among "run arbitrary workloads", "run real-time workloads", "pack stuff
>> into as few physical CPUs as possible", you can only pick two.
>
> Thats not the state of things (userspace in vcpu-0 is not specially tailored
> to not violate latencies in vcpu-1): that is not all user triggered
> actions can be verified.
> 
> Think "updatedb", and so on...

_Which_ spinlock is it that can cause unwanted latency while running
updatedb on VCPU0 and a real-time workload on VCPU1, and does so only on
virt because of the emulator thread?  Is this still broken if you set up
priorities for the emulator thread correctly and use PI mutexes in QEMU?
And if so, what is the cause of interruptions in the emulator thread
and how are these interruptions causing the jitter?

Priorities and priority inheritance (or lack of them) is a _known_
issue.  Jan was doing his KVM-RT things in 2009 and he was talking about
priorities[1] back then.  The effect of correct priorities is to _lower_
jitter, not to make it worse, and anyway certainly not worse than
SCHED_NORMAL I/O thread.  Once that's fixed, we can look at other problems.

Paolo

[1] http://static.lwn.net/images/conf/rtlws11/papers/proc/p18.pdf which
also mentions pv scheduling
Marcelo Tosatti Sept. 29, 2017, 8:17 p.m. UTC | #37
On Fri, Sep 29, 2017 at 07:05:41PM +0200, Paolo Bonzini wrote:
> On 29/09/2017 18:40, Marcelo Tosatti wrote:
> >> If you know you have this kind disk workload, you must use virtio-blk or
> >> virtio-scsi with iothreads and place the iothreads on their own physical
> >> CPUs.
> >>
> >> Among "run arbitrary workloads", "run real-time workloads", "pack stuff
> >> into as few physical CPUs as possible", you can only pick two.
> >
> > Thats not the state of things (userspace in vcpu-0 is not specially tailored
> > to not violate latencies in vcpu-1): that is not all user triggered
> > actions can be verified.
> > 
> > Think "updatedb", and so on...
> 
> _Which_ spinlock is it that can cause unwanted latency while running
> updatedb on VCPU0 and a real-time workload on VCPU1, and only so on virt
> because of the emulator thread?

Hundreds of them (the one being hit is in timer_interrupt), but I went
to check and there are hundreds of raw spinlocks shared between the
kernel threads that run on isolated CPUs and vcpu-0.

>  Is this still broken if you set up
> priorities for the emulator thread correctly and use PI mutexes in QEMU?

I don't see why it would not, if you have to schedule the emulator
thread to process and inject I/O interrupts for example.

>  And if so, what is the cause of interruptions in the emulator thread
> and how are these interruptions causing the jitter?

Interrupt injections.

> Priorities and priority inheritance (or lack of them) is a _known_
> issue.  Jan was doing his KVM-RT things in 2009 and he was talking about
> priorities[1] back then.  The effect of correct priorities is to _lower_
> jitter, not to make it worse, and anyway certainly not worse than
> SCHED_NORMAL I/O thread.  Once that's fixed, we can look at other problems.
> 
> Paolo
> 
> [1] http://static.lwn.net/images/conf/rtlws11/papers/proc/p18.pdf which
> also mentions pv scheduling
Paolo Bonzini Oct. 2, 2017, 12:30 p.m. UTC | #38
On 29/09/2017 22:17, Marcelo Tosatti wrote:
> On Fri, Sep 29, 2017 at 07:05:41PM +0200, Paolo Bonzini wrote:
>> On 29/09/2017 18:40, Marcelo Tosatti wrote:
>>> Thats not the state of things (userspace in vcpu-0 is not specially tailored
>>> to not violate latencies in vcpu-1): that is not all user triggered
>>> actions can be verified.
>>>
>>> Think "updatedb", and so on...
>>
>> _Which_ spinlock is it that can cause unwanted latency while running
>> updatedb on VCPU0 and a real-time workload on VCPU1, and only so on virt
>> because of the emulator thread?
> 
> Hundreds of them (the one being hit is in timer_interrupt), but i went 
> to check and there are hundreds of raw spinlocks shared between the
> kernel threads that run on isolated CPUs and vcpu-0.
> 
>>  Is this still broken if you set up
>> priorities for the emulator thread correctly and use PI mutexes in QEMU?
> 
> I don't see why it would not, if you have to schedule the emulator
> thread to process and inject I/O interrupts for example.

Yes, you're right if it's interrupt injections.  If it's unexpected disk
accesses, you can just add a QEMU I/O thread on a different physical
CPU.  The same physical CPU can host I/O threads for different guests if
you expect them to do little.

I don't understand why it is correct to delay interrupt injection just
because VCPU0 is running in a spinlock-protected region?  I just cannot
see the reason why it's safe and not a recipe for priority inversions.

Paolo

>>  And if so, what is the cause of interruptions in the emulator thread
>> and how are these interruptions causing the jitter?
> 
> Interrupt injections.
> 
>> Priorities and priority inheritance (or lack of them) is a _known_
>> issue.  Jan was doing his KVM-RT things in 2009 and he was talking about
>> priorities[1] back then.  The effect of correct priorities is to _lower_
>> jitter, not to make it worse, and anyway certainly not worse than
>> SCHED_NORMAL I/O thread.  Once that's fixed, we can look at other problems.
>>
>> Paolo
>>
>> [1] http://static.lwn.net/images/conf/rtlws11/papers/proc/p18.pdf which
>> also mentions pv scheduling
Peter Zijlstra Oct. 2, 2017, 12:48 p.m. UTC | #39
On Mon, Oct 02, 2017 at 02:30:33PM +0200, Paolo Bonzini wrote:
> I don't understand why is it correct to delay interrupt injection just
> because VCPU0 is running in a spinlock-protected region?  I just cannot
> see the reason why it's safe and not a recipe for priority inversions.

It is indeed not right. Something like:

	raw_spin_lock(&some_lock);

	/* do crud */

	raw_spin_unlock(&some_lock);

Should not hold off the interrupt that tells you your finger is in
imminent danger of becoming detached. Only when we do
local_irq_disable() (i.e. raw_spin_lock_irq*() and the like) should we
avoid interrupt delivery.
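
For contrast, the variant that legitimately holds off delivery is the
irq-disabling one (same style of sketch as above):

	unsigned long flags;

	raw_spin_lock_irqsave(&some_lock, flags);

	/* interrupts are off on this CPU, so delivery is deferred anyway */

	raw_spin_unlock_irqrestore(&some_lock, flags);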

This whole fixation on spinlock regions is misguided and must stop; it's
wrong on all levels.
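
The guest half of the series is reproduced below.  The host half
(patches 1/3 and 2/3) is not part of this message; very roughly, and
only as a sketch of the idea rather than the actual code, the hypercall
handler toggles the scheduling class of the calling vcpu task:

	/* Sketch only, not the code from patches 1/3 and 2/3.  The handler
	 * runs in the context of the vcpu task, so "current" is the vcpu
	 * thread whose policy is toggled. */
	static int kvm_hc_rt_prio(unsigned long enable)
	{
		struct sched_param param = {
			.sched_priority = enable ? 1 : 0,
		};

		return sched_setscheduler_nocheck(current,
						  enable ? SCHED_FIFO : SCHED_NORMAL,
						  &param);
	}
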
diff mbox

Patch

Index: kvm.fifopriohc-submit/arch/x86/kernel/kvm.c
===================================================================
--- kvm.fifopriohc-submit.orig/arch/x86/kernel/kvm.c
+++ kvm.fifopriohc-submit/arch/x86/kernel/kvm.c
@@ -37,6 +37,7 @@ 
 #include <linux/debugfs.h>
 #include <linux/nmi.h>
 #include <linux/swait.h>
+#include <linux/static_key.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -321,6 +322,36 @@  static notrace void kvm_guest_apic_eoi_w
 	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
 }
 
+static int kvmfifohc;
+
+static int parse_kvmfifohc(char *arg)
+{
+	kvmfifohc = 1;
+	return 0;
+}
+
+early_param("kvmfifohc", parse_kvmfifohc);
+
+DEFINE_STATIC_KEY_FALSE(kvm_fifo_hc_key);
+
+static void kvm_init_fifo_hc(void)
+{
+	long ret;
+
+	ret = kvm_hypercall1(KVM_HC_RT_PRIO, 0);
+
+	if (ret == 0 && kvmfifohc == 1)
+		static_branch_enable(&kvm_fifo_hc_key);
+}
+
+static __init int kvmguest_late_init(void)
+{
+	kvm_init_fifo_hc();
+	return 0;
+}
+
+late_initcall(kvmguest_late_init);
+
 static void kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
Index: kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
===================================================================
--- kvm.fifopriohc-submit.orig/include/linux/spinlock_api_smp.h
+++ kvm.fifopriohc-submit/include/linux/spinlock_api_smp.h
@@ -136,11 +136,28 @@  static inline void __raw_spin_lock_bh(ra
 	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
 }
 
+#ifdef CONFIG_KVM_GUEST
+DECLARE_STATIC_KEY_FALSE(kvm_fifo_hc_key);
+#endif
+
 static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	preempt_disable();
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
+	/* enable FIFO priority */
+	if (static_branch_unlikely(&kvm_fifo_hc_key))
+		kvm_hypercall1(KVM_HC_RT_PRIO, 0x1);
+#endif
+
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_SMP)
+	/* disable FIFO priority */
+	if (static_branch_unlikely(&kvm_fifo_hc_key))
+		kvm_hypercall1(KVM_HC_RT_PRIO, 0);
+#endif
 }
 
 #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */
Index: kvm.fifopriohc-submit/include/linux/spinlock.h
===================================================================
--- kvm.fifopriohc-submit.orig/include/linux/spinlock.h
+++ kvm.fifopriohc-submit/include/linux/spinlock.h
@@ -56,7 +56,7 @@ 
 #include <linux/stringify.h>
 #include <linux/bottom_half.h>
 #include <asm/barrier.h>
-
+#include <uapi/linux/kvm_para.h>
 
 /*
  * Must define these before including other files, inline functions need them