
[v3,7/7] x86/tlb: use Xen L0 assisted TLB flush when available

Message ID 20200127181115.82709-8-roger.pau@citrix.com (mailing list archive)
State Superseded
Series x86: improve assisted tlb flush and use it in guest mode

Commit Message

Roger Pau Monné Jan. 27, 2020, 6:11 p.m. UTC
Use Xen's L0 HVMOP_flush_tlbs hypercall in order to perform flushes.
This greatly increases the performance of TLB flushes when running
with a high number of vCPUs as a Xen guest, and is especially important
when running in shim mode.

The following figures are from a PV guest running `make -j32 xen` in
shim mode with 32 vCPUs and HAP.

Using x2APIC and ALLBUT shorthand:
real	4m35.973s
user	4m35.110s
sys	36m24.117s

Using L0 assisted flush:
real    1m2.596s
user    4m34.818s
sys     5m16.374s

The implementation adds a new hook to hypervisor_ops so other
enlightenments can also implement such an assisted flush simply by
filling in the hook. Note that the Xen implementation completely
ignores the dirty CPU mask and the linear address passed in, and
always performs a global TLB flush on all vCPUs.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v1:
 - Add an L0 assisted hook to hypervisor ops.
---
 xen/arch/x86/guest/hypervisor.c        | 10 ++++++++++
 xen/arch/x86/guest/xen/xen.c           |  6 ++++++
 xen/arch/x86/smp.c                     | 11 +++++++++++
 xen/include/asm-x86/guest/hypervisor.h | 17 +++++++++++++++++
 4 files changed, 44 insertions(+)
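
The hook-based design means another enlightenment only has to supply a
function with the matching signature. As a minimal sketch of that idea
(the hv_* names below are hypothetical, not code from this series):

    /* Hypothetical sketch: how a different enlightenment could fill in
     * the flush_tlb hook added by this series. hv_flush_hypercall() is
     * an assumed helper, not a real interface. */
    static int hv_flush_tlb(const cpumask_t *mask, const void *va,
                            unsigned int order)
    {
        /* Returning an error makes flush_area_mask() fall back to the
         * IPI-based flush path. */
        return hv_flush_hypercall(mask, va, order);
    }

    static const struct hypervisor_ops hv_ops = {
        .name = "Hyper-V",
        .flush_tlb = hv_flush_tlb,
    };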

Comments

Wei Liu Jan. 28, 2020, 2:17 p.m. UTC | #1
On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
[...]
>  
>  const struct hypervisor_ops *__init xg_probe(void)
> diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
> index 65eb7cbda8..9bc925616a 100644
> --- a/xen/arch/x86/smp.c
> +++ b/xen/arch/x86/smp.c
> @@ -15,6 +15,7 @@
>  #include <xen/perfc.h>
>  #include <xen/spinlock.h>
>  #include <asm/current.h>
> +#include <asm/guest.h>
>  #include <asm/smp.h>
>  #include <asm/mc146818rtc.h>
>  #include <asm/flushtlb.h>
> @@ -256,6 +257,16 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
>      if ( (flags & ~FLUSH_ORDER_MASK) &&
>           !cpumask_subset(mask, cpumask_of(cpu)) )
>      {
> +        if ( cpu_has_hypervisor &&
> +             !(flags & ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_VA_VALID |
> +                         FLUSH_ORDER_MASK)) &&
> +             !hypervisor_flush_tlb(mask, va, flags & FLUSH_ORDER_MASK) )
> +        {
> +            if ( tlb_clk_enabled )
> +                tlb_clk_enabled = false;

You may delete the if here to make the generated machine code shorter.

OOI why isn't tlb_clk_enabled set to false when Xen determines to use L0
assisted flush?

(Sorry I haven't read previous patches in detail)

Wei.
Roger Pau Monné Jan. 28, 2020, 2:57 p.m. UTC | #2
On Tue, Jan 28, 2020 at 02:17:36PM +0000, Wei Liu wrote:
> On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
> [...]
> >  
> >  const struct hypervisor_ops *__init xg_probe(void)
> > diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
> > index 65eb7cbda8..9bc925616a 100644
> > --- a/xen/arch/x86/smp.c
> > +++ b/xen/arch/x86/smp.c
> > @@ -15,6 +15,7 @@
> >  #include <xen/perfc.h>
> >  #include <xen/spinlock.h>
> >  #include <asm/current.h>
> > +#include <asm/guest.h>
> >  #include <asm/smp.h>
> >  #include <asm/mc146818rtc.h>
> >  #include <asm/flushtlb.h>
> > @@ -256,6 +257,16 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
> >      if ( (flags & ~FLUSH_ORDER_MASK) &&
> >           !cpumask_subset(mask, cpumask_of(cpu)) )
> >      {
> > +        if ( cpu_has_hypervisor &&
> > +             !(flags & ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_VA_VALID |
> > +                         FLUSH_ORDER_MASK)) &&
> > +             !hypervisor_flush_tlb(mask, va, flags & FLUSH_ORDER_MASK) )
> > +        {
> > +            if ( tlb_clk_enabled )
> > +                tlb_clk_enabled = false;
> 
> You may delete the if here to make the generated machine code shorter.

Hm, but tlb_clk_enabled is marked as read_mostly, which would no
longer be true then, and that would likely have a performance impact.

> OOI why isn't tlb_clk_enabled set to false when Xen determines to use L0
> assisted flush?

L0 assisted flush can fail (ie: return an error), and in that case Xen
is better off continuing to use the timestamped TLB, as it can
avoid some flushes.

Thanks, Roger.
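
(The read_mostly point refers to the usual write-avoidance pattern: a
__read_mostly variable is expected to stay in a clean, shared cache
line, so the guard only stores to it when the value actually changes.
A minimal sketch of the pattern, not the exact Xen code:)

    extern bool tlb_clk_enabled;        /* __read_mostly in Xen */

    static void maybe_disable_tlb_clock(void)
    {
        /* Plain read: cheap, leaves the cache line shared. */
        if ( tlb_clk_enabled )
            /* Store only on the first transition to false. */
            tlb_clk_enabled = false;
    }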
Wei Liu Jan. 28, 2020, 4:24 p.m. UTC | #3
On Tue, Jan 28, 2020 at 03:57:04PM +0100, Roger Pau Monné wrote:
> On Tue, Jan 28, 2020 at 02:17:36PM +0000, Wei Liu wrote:
> > On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
> > [...]
> > > @@ -256,6 +257,16 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
> > >      if ( (flags & ~FLUSH_ORDER_MASK) &&
> > >           !cpumask_subset(mask, cpumask_of(cpu)) )
> > >      {
> > > +        if ( cpu_has_hypervisor &&
> > > +             !(flags & ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_VA_VALID |
> > > +                         FLUSH_ORDER_MASK)) &&
> > > +             !hypervisor_flush_tlb(mask, va, flags & FLUSH_ORDER_MASK) )
> > > +        {
> > > +            if ( tlb_clk_enabled )
> > > +                tlb_clk_enabled = false;
> > 
> > You may delete the if here to make the generated machine code shorter.
> 
> Hm, but tlb_clk_enabled is marked as read_mostly, which would no
> longer be true then, and that would likely have a performance impact.

OK. Fair enough.

> 
> > OOI why isn't tlb_clk_enabled set to false when Xen determines to use L0
> > assisted flush?
> 
> L0 assisted flush can fail (ie: return an error), and in that case Xen
> is better off continuing to use the timestamped TLB, as it can
> avoid some flushes.

Do you need to set tlb_clk_enabled in that case?

Wei.

> 
> Thanks, Roger.
Roger Pau Monné Jan. 28, 2020, 5:16 p.m. UTC | #4
On Tue, Jan 28, 2020 at 04:24:24PM +0000, Wei Liu wrote:
> On Tue, Jan 28, 2020 at 03:57:04PM +0100, Roger Pau Monné wrote:
> > On Tue, Jan 28, 2020 at 02:17:36PM +0000, Wei Liu wrote:
> > > On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
> > > [...]
> > > > @@ -256,6 +257,16 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
> > > >      if ( (flags & ~FLUSH_ORDER_MASK) &&
> > > >           !cpumask_subset(mask, cpumask_of(cpu)) )
> > > >      {
> > > > +        if ( cpu_has_hypervisor &&
> > > > +             !(flags & ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_VA_VALID |
> > > > +                         FLUSH_ORDER_MASK)) &&
> > > > +             !hypervisor_flush_tlb(mask, va, flags & FLUSH_ORDER_MASK) )
> > > > +        {
> > > > +            if ( tlb_clk_enabled )
> > > > +                tlb_clk_enabled = false;
> > > 
> > > You may delete the if here to make the generated machine code shorter.
> > 
> > Hm, but tlb_clk_enabled is marked as read_mostly, which would no
> > longer be true then, and that would likely have a performance impact.
> 
> OK. Fair enough.
> 
> > 
> > > OOI why isn't tlb_clk_enabled set to false when Xen determines to use L0
> > > assisted flush?
> > 
> > L0 assisted flush can fail (ie: return an error), and in that case Xen
> > is better off continuing to use the timestamped TLB, as it can
> > avoid some flushes.
> 
> Do you need to set tlb_clk_enabled in that case?

AFAICT it's safe to enable the TLB timestamps after being disabled,
but hypervisor_flush_tlb could fail intermittently with EBUSY in the
Xen implementation, and I don't really want that to cause spurious
enabling of the timestamps periodically.

My expectation would be that such assistance could fail sporadically,
but it shouldn't re-enable the timestamped TLB as failure would be
infrequent.

Thanks, Roger.
Andrew Cooper Jan. 28, 2020, 5:20 p.m. UTC | #5
On 28/01/2020 17:16, Roger Pau Monné wrote:
>>>> OOI why isn't tlb_clk_enabled set to false when Xen determines to use L0
>>>> assisted flush?
>>> L0 assisted flush can fail (ie: return an error), and in that case Xen
>>> is better off continuing to use the timestamped TLB, as it can
>>> avoid some flushes.
>> Do you need to set tlb_clk_enabled in that case?
> AFAICT it's safe to enable the TLB timestamps after being disabled,
> but hypervisor_flush_tlb could fail intermittently with EBUSY in the
> Xen implementation, and I don't really want that to cause spurious
> enabling of the timestamps periodically.

What causes -EBUSY?

I don't think it is reasonable for the hypercall to fail like that.
There is no circumstance ever where a TLB flush is wanted and it is
able to be deferred.

Forcing all callers to loop while busy is a terrible interface.

~Andrew
Roger Pau Monné Jan. 28, 2020, 5:38 p.m. UTC | #6
On Tue, Jan 28, 2020 at 05:20:25PM +0000, Andrew Cooper wrote:
> On 28/01/2020 17:16, Roger Pau Monné wrote:
> >>>> OOI why isn't tlb_clk_enabled set to false when Xen determines to use L0
> >>>> assisted flush?
> >>> L0 assisted flush can fail (ie: return an error), and in that case Xen
> >>> is better off continuing to use the timestamped TLB, as it can
> >>> avoid some flushes.
> >> Do you need to set tlb_clk_enabled in that case?
> > AFAICT it's safe to enable the TLB timestamps after being disabled,
> > but hypervisor_flush_tlb could fail intermittently with EBUSY in the
> > Xen implementation, and I don't really want that to cause spurious
> > enabling of the timestamps periodically.
> 
> What causes -EBUSY?

My bad, it's not EBUSY, it's ERESTART, but that won't be returned to
the caller.

> I don't think it is reasonable for the hypercall to fail like that.
> There is no circumstance ever where a TLB flush is wanted and it is
> able to be deferred.

After this series ERESTART is only used by the shadow flush
implementation, and I'm not sure I'm confident enough to try to change
the shadow code so that it doesn't do that, also because the interest
is likely much lower than for the HAP case.

> Forcing all callers to loop while busy is a terrible interface.

Well, the whole implementation of hvm_flush_vcpu_tlb is quite clumsy
because it attempts to pause each vCPU to be flushed, which for guests
with a high number of vCPUs makes it likely slower than just using a
shorthand IPI inside the guest.

Thanks, Roger.
Wei Liu Feb. 6, 2020, 1:49 p.m. UTC | #7
On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
> [...]
> @@ -64,6 +65,15 @@ void hypervisor_resume(void)
>          ops->resume();
>  }
>  
> +int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
> +                         unsigned int order)
> +{
> +    if ( ops && ops->flush_tlb )
> +        return ops->flush_tlb(mask, va, order);
> +

Is there a way to make this an alternative call? I consider TLB flush a
frequent operation which could use some optimisation.

This can be done as a later improvement if it is too difficult though.
This patch already has some substantial improvement.

> +    return -ENOSYS;
> +}
> [...]
> @@ -256,6 +257,16 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
>      if ( (flags & ~FLUSH_ORDER_MASK) &&
>           !cpumask_subset(mask, cpumask_of(cpu)) )
>      {
> +        if ( cpu_has_hypervisor &&
> +             !(flags & ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_VA_VALID |
> +                         FLUSH_ORDER_MASK)) &&
> +             !hypervisor_flush_tlb(mask, va, flags & FLUSH_ORDER_MASK) )
> +        {
> +            if ( tlb_clk_enabled )
> +                tlb_clk_enabled = false;
> +            return;
> +        }
> +

Per my understanding, not turning tlb_clk_enabled back to true after an
assisted flush fails is okay, because the effect of tlb_clk_enabled
being false is to always make NEED_FLUSH return true. That causes
excessive flushing, but it is okay in terms of correctness. Do I
understand it correctly?

Wei.
Roger Pau Monné Feb. 6, 2020, 2:09 p.m. UTC | #8
On Thu, Feb 06, 2020 at 01:49:35PM +0000, Wei Liu wrote:
> On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
> > [...]
> > @@ -64,6 +65,15 @@ void hypervisor_resume(void)
> >          ops->resume();
> >  }
> >  
> > +int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
> > +                         unsigned int order)
> > +{
> > +    if ( ops && ops->flush_tlb )
> > +        return ops->flush_tlb(mask, va, order);
> > +
> 
> Is there a way to make this an alternative call? I consider TLB flush a
> frequent operation which could use some optimisation.
> 
> This can be done as a later improvement if it is too difficult though.
> This patch already has some substantial improvement.

I can look into making this an alternative call; if it turns out to be
too complex I will leave it for a separate patch.

> > +    return -ENOSYS;
> > +}
> > [...]
> > @@ -256,6 +257,16 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
> >      if ( (flags & ~FLUSH_ORDER_MASK) &&
> >           !cpumask_subset(mask, cpumask_of(cpu)) )
> >      {
> > +        if ( cpu_has_hypervisor &&
> > +             !(flags & ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_VA_VALID |
> > +                         FLUSH_ORDER_MASK)) &&
> > +             !hypervisor_flush_tlb(mask, va, flags & FLUSH_ORDER_MASK) )
> > +        {
> > +            if ( tlb_clk_enabled )
> > +                tlb_clk_enabled = false;
> > +            return;
> > +        }
> > +
> 
> Per my understanding, not turning tlb_clk_enabled back to true after an
> assisted flush fails is okay, because the effect of tlb_clk_enabled
> being false is to always make NEED_FLUSH return true. That causes
> excessive flushing, but it is okay in terms of correctness. Do I
> understand it correctly?

Yes, that's my understanding. The timestamped TLB is an optimization
to avoid flushes. Keeping it always off is safe; OTOH having it on
without properly tracking the flushes can indeed cause issues.

For the time being I would rather leave it off once it's been
disabled (ie: not turn it on if the assisted flush fails), as that's
safe.

Thanks, Roger.
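
(To make the semantics concrete: with the TLB clock disabled, every
check conservatively requests a flush. An illustrative sketch, not the
exact NEED_FLUSH logic in Xen, which also has to deal with timestamp
wraparound:)

    static bool tlb_clk_enabled;    /* stays false after assisted flush */

    static inline bool need_flush(uint32_t cpu_stamp,
                                  uint32_t lastuse_stamp)
    {
        /* Clock off: always flush. Safe, but possibly excessive. */
        if ( !tlb_clk_enabled )
            return true;

        /* Clock on: flush only if the page was used at or after this
         * CPU's most recent flush. */
        return lastuse_stamp >= cpu_stamp;
    }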
Jan Beulich Feb. 6, 2020, 2:46 p.m. UTC | #9
On 06.02.2020 15:09, Roger Pau Monné wrote:
> On Thu, Feb 06, 2020 at 01:49:35PM +0000, Wei Liu wrote:
>> On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
>>> [...]
>>> @@ -64,6 +65,15 @@ void hypervisor_resume(void)
>>>          ops->resume();
>>>  }
>>>  
>>> +int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
>>> +                         unsigned int order)
>>> +{
>>> +    if ( ops && ops->flush_tlb )
>>> +        return ops->flush_tlb(mask, va, order);
>>> +
>>
>> Is there a way to make this an alternative call? I consider TLB flush a
>> frequent operation which could use some optimisation.
>>
>> This can be done as a later improvement if it is too difficult though.
>> This patch already has some substantial improvement.
> 
> I can look into making this an alternative call; if it turns out to be
> too complex I will leave it for a separate patch.

It'll be two steps - make a global struct hypervisor_ops instance
which the per-hypervisor instances get _copied_ into upon boot
(at that point all of those can go into .init.* sections), and
then switch the call(s) of interest. I.e. while the 2nd step can
of course be done right here, the first will want to be in a
prereq patch.

Jan
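
(A rough sketch of the two steps Jan outlines, assuming Xen's
alternative_call() infrastructure is used to patch the hot call site;
the names and placement here are assumptions, not the eventual
committed code:)

    /* Step 1: a single boot-filled instance, so the per-hypervisor ops
     * structures can move to .init.* and be copied from at probe time. */
    static struct hypervisor_ops __read_mostly hops;

    void __init hypervisor_set_ops(const struct hypervisor_ops *probed)
    {
        if ( probed )
            hops = *probed;
    }

    /* Step 2: patch the frequent call via alternative_call() so the
     * indirect call becomes a direct one once alternatives are applied. */
    int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
                             unsigned int order)
    {
        if ( hops.flush_tlb )
            return alternative_call(hops.flush_tlb, mask, va, order);

        return -ENOSYS;
    }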
Roger Pau Monné Feb. 6, 2020, 3:37 p.m. UTC | #10
On Thu, Feb 06, 2020 at 03:46:56PM +0100, Jan Beulich wrote:
> On 06.02.2020 15:09, Roger Pau Monné wrote:
> > On Thu, Feb 06, 2020 at 01:49:35PM +0000, Wei Liu wrote:
> >> On Mon, Jan 27, 2020 at 07:11:15PM +0100, Roger Pau Monne wrote:
> >>> [...]
> >>> @@ -64,6 +65,15 @@ void hypervisor_resume(void)
> >>>          ops->resume();
> >>>  }
> >>>  
> >>> +int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
> >>> +                         unsigned int order)
> >>> +{
> >>> +    if ( ops && ops->flush_tlb )
> >>> +        return ops->flush_tlb(mask, va, order);
> >>> +
> >>
> >> Is there a way to make this an alternative call? I consider TLB flush a
> >> frequent operation which could use some optimisation.
> >>
> >> This can be done as a later improvement if it is too difficult though.
> >> This patch already has some substantial improvement.
> > 
> > I can look into making this an alternative call; if it turns out to be
> > too complex I will leave it for a separate patch.
> 
> It'll be two steps - make a global struct hypervisor_ops instance
> which the per-hypervisor instances get _copied_ into upon boot
> (at that point all of those can go into .init.* sections), and
> then switch the call(s) of interest. I.e. while the 2nd step can
> of course be done right here, the first will want to be in a
> prereq patch.

Done. I've only switched the flush_tlb operation, since the rest are
not relevant from a performance PoV.

Will wait until next week before posting a new version.

Thanks, Roger.

Patch

diff --git a/xen/arch/x86/guest/hypervisor.c b/xen/arch/x86/guest/hypervisor.c
index 4f27b98740..4085b19734 100644
--- a/xen/arch/x86/guest/hypervisor.c
+++ b/xen/arch/x86/guest/hypervisor.c
@@ -18,6 +18,7 @@ 
  *
  * Copyright (c) 2019 Microsoft.
  */
+#include <xen/cpumask.h>
 #include <xen/init.h>
 #include <xen/types.h>
 
@@ -64,6 +65,15 @@  void hypervisor_resume(void)
         ops->resume();
 }
 
+int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
+                         unsigned int order)
+{
+    if ( ops && ops->flush_tlb )
+        return ops->flush_tlb(mask, va, order);
+
+    return -ENOSYS;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/guest/xen/xen.c b/xen/arch/x86/guest/xen/xen.c
index 6dbc5f953f..639a2a4b32 100644
--- a/xen/arch/x86/guest/xen/xen.c
+++ b/xen/arch/x86/guest/xen/xen.c
@@ -310,11 +310,17 @@  static void resume(void)
         pv_console_init();
 }
 
+static int flush_tlb(const cpumask_t *mask, const void *va, unsigned int order)
+{
+    return xen_hypercall_hvm_op(HVMOP_flush_tlbs, NULL);
+}
+
 static const struct hypervisor_ops ops = {
     .name = "Xen",
     .setup = setup,
     .ap_setup = ap_setup,
     .resume = resume,
+    .flush_tlb = flush_tlb,
 };
 
 const struct hypervisor_ops *__init xg_probe(void)
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 65eb7cbda8..9bc925616a 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -15,6 +15,7 @@ 
 #include <xen/perfc.h>
 #include <xen/spinlock.h>
 #include <asm/current.h>
+#include <asm/guest.h>
 #include <asm/smp.h>
 #include <asm/mc146818rtc.h>
 #include <asm/flushtlb.h>
@@ -256,6 +257,16 @@  void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
     if ( (flags & ~FLUSH_ORDER_MASK) &&
          !cpumask_subset(mask, cpumask_of(cpu)) )
     {
+        if ( cpu_has_hypervisor &&
+             !(flags & ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_VA_VALID |
+                         FLUSH_ORDER_MASK)) &&
+             !hypervisor_flush_tlb(mask, va, flags & FLUSH_ORDER_MASK) )
+        {
+            if ( tlb_clk_enabled )
+                tlb_clk_enabled = false;
+            return;
+        }
+
         spin_lock(&flush_lock);
         cpumask_and(&flush_cpumask, mask, &cpu_online_map);
         cpumask_clear_cpu(cpu, &flush_cpumask);
diff --git a/xen/include/asm-x86/guest/hypervisor.h b/xen/include/asm-x86/guest/hypervisor.h
index 392f4b90ae..e230a5d065 100644
--- a/xen/include/asm-x86/guest/hypervisor.h
+++ b/xen/include/asm-x86/guest/hypervisor.h
@@ -19,6 +19,8 @@ 
 #ifndef __X86_HYPERVISOR_H__
 #define __X86_HYPERVISOR_H__
 
+#include <xen/cpumask.h>
+
 struct hypervisor_ops {
     /* Name of the hypervisor */
     const char *name;
@@ -28,6 +30,8 @@  struct hypervisor_ops {
     void (*ap_setup)(void);
     /* Resume from suspension */
     void (*resume)(void);
+    /* L0 assisted TLB flush */
+    int (*flush_tlb)(const cpumask_t *mask, const void *va, unsigned int order);
 };
 
 #ifdef CONFIG_GUEST
@@ -36,6 +40,14 @@  const char *hypervisor_probe(void);
 void hypervisor_setup(void);
 void hypervisor_ap_setup(void);
 void hypervisor_resume(void);
+/*
+ * L0 assisted TLB flush.
+ * mask: cpumask of the dirty vCPUs that should be flushed.
+ * va: linear address to flush, or NULL for global flushes.
+ * order: order of the linear address pointed by va.
+ */
+int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
+                         unsigned int order);
 
 #else
 
@@ -46,6 +58,11 @@  static inline const char *hypervisor_probe(void) { return NULL; }
 static inline void hypervisor_setup(void) { ASSERT_UNREACHABLE(); }
 static inline void hypervisor_ap_setup(void) { ASSERT_UNREACHABLE(); }
 static inline void hypervisor_resume(void) { ASSERT_UNREACHABLE(); }
+static inline int hypervisor_flush_tlb(const cpumask_t *mask, const void *va,
+                                       unsigned int order)
+{
+    return -ENOSYS;
+}
 
 #endif  /* CONFIG_GUEST */