diff mbox series

[4/4] x86/hyperv: L0 assisted TLB flush

Message ID 20200212160918.18470-5-liuwe@microsoft.com (mailing list archive)
State Superseded
Headers show
Series Xen on Hyper-V: Implement L0 assisted TLB flush | expand

Commit Message

Wei Liu Feb. 12, 2020, 4:09 p.m. UTC
Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
of several hypercalls:

 * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
 * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
 * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
 * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX

Pick the most efficient hypercalls available.

Signed-off-by: Wei Liu <liuwe@microsoft.com>
---
 xen/arch/x86/guest/hyperv/Makefile  |   1 +
 xen/arch/x86/guest/hyperv/private.h |   9 ++
 xen/arch/x86/guest/hyperv/tlb.c     | 172 +++++++++++++++++++++++++++-
 xen/arch/x86/guest/hyperv/util.c    |  72 ++++++++++++
 4 files changed, 253 insertions(+), 1 deletion(-)
 create mode 100644 xen/arch/x86/guest/hyperv/util.c

Comments

Roger Pau Monné Feb. 12, 2020, 5:43 p.m. UTC | #1
On Wed, Feb 12, 2020 at 04:09:18PM +0000, Wei Liu wrote:
> Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
> of several hypercalls:
> 
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX
> 
> Pick the most efficient hypercalls available.
> 
> Signed-off-by: Wei Liu <liuwe@microsoft.com>
> ---
>  xen/arch/x86/guest/hyperv/Makefile  |   1 +
>  xen/arch/x86/guest/hyperv/private.h |   9 ++
>  xen/arch/x86/guest/hyperv/tlb.c     | 172 +++++++++++++++++++++++++++-
>  xen/arch/x86/guest/hyperv/util.c    |  72 ++++++++++++
>  4 files changed, 253 insertions(+), 1 deletion(-)
>  create mode 100644 xen/arch/x86/guest/hyperv/util.c
> 
> diff --git a/xen/arch/x86/guest/hyperv/Makefile b/xen/arch/x86/guest/hyperv/Makefile
> index 18902c33e9..0e39410968 100644
> --- a/xen/arch/x86/guest/hyperv/Makefile
> +++ b/xen/arch/x86/guest/hyperv/Makefile
> @@ -1,2 +1,3 @@
>  obj-y += hyperv.o
>  obj-y += tlb.o
> +obj-y += util.o
> diff --git a/xen/arch/x86/guest/hyperv/private.h b/xen/arch/x86/guest/hyperv/private.h
> index 78e52f74ce..311f060495 100644
> --- a/xen/arch/x86/guest/hyperv/private.h
> +++ b/xen/arch/x86/guest/hyperv/private.h
> @@ -24,12 +24,21 @@
>  
>  #include <xen/cpumask.h>
>  #include <xen/percpu.h>
> +#include <xen/types.h>
>  
>  DECLARE_PER_CPU(void *, hv_input_page);
>  DECLARE_PER_CPU(void *, hv_vp_assist);
>  DECLARE_PER_CPU(uint32_t, hv_vp_index);
>  
> +static inline uint32_t hv_vp_index(int cpu)

unsigned int for cpu.

> +{
> +    return per_cpu(hv_vp_index, cpu);
> +}
> +
>  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
>                       unsigned int flags);
>  
> +/* Returns number of banks, -ev if error */
> +int cpumask_to_vpset(struct hv_vpset *vpset, const cpumask_t *mask);
> +
>  #endif /* __XEN_HYPERV_PRIVIATE_H__  */
> diff --git a/xen/arch/x86/guest/hyperv/tlb.c b/xen/arch/x86/guest/hyperv/tlb.c
> index 48f527229e..99b789d9e9 100644
> --- a/xen/arch/x86/guest/hyperv/tlb.c
> +++ b/xen/arch/x86/guest/hyperv/tlb.c
> @@ -19,15 +19,185 @@
>   * Copyright (c) 2020 Microsoft.
>   */
>  
> +#include <xen/cpu.h>
>  #include <xen/cpumask.h>
>  #include <xen/errno.h>
>  
> +#include <asm/guest/hyperv.h>
> +#include <asm/guest/hyperv-hcall.h>
> +#include <asm/guest/hyperv-tlfs.h>
> +
>  #include "private.h"
>  
> +/*
> + * It is possible to encode up to 4096 pages using the lower 12 bits
> + * in an element of gva_list
> + */
> +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
> +#define ORDER_TO_BYTES(order) ((1ul << (order)) * PAGE_SIZE)

There are already some conversion functions in xen/mm.h
(get_order_from_{bytes/pages}), maybe you could add a
get_bytes_from_order helper there?

> +
> +static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
> +                                  unsigned int order)
> +{
> +    unsigned long start = (unsigned long)va;
> +    unsigned long end = start + ORDER_TO_BYTES(order) - 1;
> +    unsigned int n = 0;
> +
> +    do {
> +        unsigned long remain = end > start ? end - start : 0;

I don't think you can get here with end == start?

As that's the condition of the loop, and order 0 is going to set
end = start + 4096 - 1.

> +
> +        gva_list[n] = start & PAGE_MASK;
> +
> +        /*
> +         * Use lower 12 bits to encode the number of additional pages
> +         * to flush
> +         */
> +        if ( remain >= HV_TLB_FLUSH_UNIT )
> +        {
> +            gva_list[n] |= ~PAGE_MASK;
> +            start += HV_TLB_FLUSH_UNIT;
> +        }
> +        else if ( remain )
> +        {
> +            gva_list[n] |= (remain - 1) >> PAGE_SHIFT;
> +            start = end;
> +        }
> +
> +        n++;
> +    } while ( start < end );
> +
> +    return n;
> +}
> +
> +static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
> +                             unsigned int flags)
> +{
> +    struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
> +    int nr_banks;
> +    unsigned int max_gvas;
> +    unsigned int order = flags & FLUSH_ORDER_MASK;
> +    uint64_t ret;
> +
> +    ASSERT(flush);
> +    ASSERT(!local_irq_is_enabled());

Can you turn this into an if condition with ASSERT_UNREACHABLE and
return ~0ULL? (as I think that signals an error).

> +
> +    if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
> +        return ~0ULL;
> +
> +    flush->address_space = 0;
> +    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> +    if ( !(flags & FLUSH_TLB_GLOBAL) )
> +        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> +
> +    flush->hv_vp_set.valid_bank_mask = 0;
> +    flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
> +
> +    nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
> +    if ( nr_banks < 0 )
> +        return ~0ULL;
> +
> +    max_gvas =
> +        (PAGE_SIZE - sizeof(*flush) - nr_banks *
> +         sizeof(flush->hv_vp_set.bank_contents[0])) /
> +        sizeof(uint64_t);       /* gva is represented as uint64_t */
> +
> +    /*
> +     * Flush the entire address space if va is NULL or if there is not
> +     * enough space for gva_list.
> +     */
> +    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
> +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 0,
> +                                  nr_banks, virt_to_maddr(flush), 0);
> +    else
> +    {
> +        uint64_t *gva_list = (uint64_t *)flush + sizeof(*flush) + nr_banks;

Don't you need nr_banks * sizeof(flush->hv_vp_set.bank_contents) in
order to calculate the position of the gva_list?

> +        unsigned int gvas = fill_gva_list(gva_list, va, order);
> +
> +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
> +                                  gvas, nr_banks, virt_to_maddr(flush), 0);
> +    }
> +
> +    return ret;
> +}
> +
>  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
>                       unsigned int flags)
>  {
> -    return -EOPNOTSUPP;
> +    unsigned long irq_flags;
> +    struct hv_tlb_flush *flush = this_cpu(hv_input_page);
> +    uint64_t ret;
> +    unsigned int order = flags & FLUSH_ORDER_MASK;
> +    unsigned int max_gvas;
> +
> +    ASSERT(flush);
> +    ASSERT(!cpumask_empty(mask));
> +
> +    local_irq_save(irq_flags);
> +
> +    flush->address_space = 0;
> +    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> +    flush->processor_mask = 0;
> +    if ( !(flags & FLUSH_TLB_GLOBAL) )
> +        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> +
> +    if ( cpumask_equal(mask, &cpu_online_map) )
> +        flush->flags |= HV_FLUSH_ALL_PROCESSORS;
> +    else
> +    {
> +        int cpu;

unsigned int.

> +
> +        /*
> +         * Normally VP indices are in ascending order and match Xen's
> +         * idea of CPU ids. Check the last index to see if VP index is
> +         * >= 64. If so, we can skip setting up parameters for
> +         * non-applicable hypercalls without looking further.
> +         */
> +        if ( hv_vp_index(cpumask_last(mask)) >= 64 )
> +            goto do_ex_hypercall;
> +
> +        for_each_cpu ( cpu, mask )
> +        {
> +            uint32_t vpid = hv_vp_index(cpu);
> +
> +            if ( vpid > ms_hyperv.max_vp_index )
> +            {
> +                local_irq_restore(irq_flags);
> +                return -ENXIO;
> +            }
> +
> +            if ( vpid >= 64 )
> +                goto do_ex_hypercall;
> +
> +            __set_bit(vpid, &flush->processor_mask);
> +        }
> +    }
> +
> +    max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]);
> +
> +    /*
> +     * Flush the entire address space if va is NULL or if there is not
> +     * enough space for gva_list.
> +     */
> +    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
> +        ret = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> +                              virt_to_maddr(flush), 0);
> +    else
> +    {
> +        unsigned int gvas = fill_gva_list(flush->gva_list, va, order);
> +
> +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, gvas, 0,
> +                                  virt_to_maddr(flush), 0);
> +    }
> +
> +    goto done;
> +
> + do_ex_hypercall:
> +    ret = flush_tlb_ex(mask, va, flags);
> +
> + done:
> +    local_irq_restore(irq_flags);
> +
> +    return ret & HV_HYPERCALL_RESULT_MASK;

Will this return an error code that uses the same space as Xen's errno
values?

>  }
>  
>  /*
> diff --git a/xen/arch/x86/guest/hyperv/util.c b/xen/arch/x86/guest/hyperv/util.c
> new file mode 100644
> index 0000000000..9d0b5f4a46
> --- /dev/null
> +++ b/xen/arch/x86/guest/hyperv/util.c
> @@ -0,0 +1,72 @@
> +/******************************************************************************
> + * arch/x86/guest/hyperv/util.c
> + *
> + * Hyper-V utility functions
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
> + *
> + * Copyright (c) 2020 Microsoft.
> + */
> +
> +#include <xen/cpu.h>
> +#include <xen/cpumask.h>
> +#include <xen/errno.h>
> +
> +#include <asm/guest/hyperv.h>
> +#include <asm/guest/hyperv-tlfs.h>
> +
> +#include "private.h"
> +
> +int cpumask_to_vpset(struct hv_vpset *vpset,
> +                     const cpumask_t *mask)
> +{
> +    int nr = 1, cpu, vcpu_bank, vcpu_offset;
> +    int max_banks = ms_hyperv.max_vp_index / 64;

I think nr wants to be int (to match the function return type), but
the rest should be unsigned ints, specially because they are used as
array indexes.

> +
> +    /* Up to 64 banks can be represented by valid_bank_mask */
> +    if ( max_banks >= 64 )
> +        return -1;

E2BIG or some such?

> +
> +    /* Clear all banks to avoid flushing unwanted CPUs */
> +    for ( vcpu_bank = 0; vcpu_bank <= max_banks; vcpu_bank++ )
> +        vpset->bank_contents[vcpu_bank] = 0;
> +
> +    vpset->valid_bank_mask = 0;
> +
> +    for_each_cpu ( cpu, mask )
> +    {
> +        int vcpu = hv_vp_index(cpu);

unsigned int or uint32_t (which is the type that hv_vp_index
returns).

Thanks, Roger.
Jan Beulich Feb. 13, 2020, 9:49 a.m. UTC | #2
On 12.02.2020 18:43, Roger Pau Monné wrote:
> On Wed, Feb 12, 2020 at 04:09:18PM +0000, Wei Liu wrote:
>> Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
>> of several hypercalls:
>>
>>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
>>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
>>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
>>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX
>>
>> Pick the most efficient hypercalls available.
>>
>> Signed-off-by: Wei Liu <liuwe@microsoft.com>
>> ---
>>  xen/arch/x86/guest/hyperv/Makefile  |   1 +
>>  xen/arch/x86/guest/hyperv/private.h |   9 ++
>>  xen/arch/x86/guest/hyperv/tlb.c     | 172 +++++++++++++++++++++++++++-
>>  xen/arch/x86/guest/hyperv/util.c    |  72 ++++++++++++
>>  4 files changed, 253 insertions(+), 1 deletion(-)
>>  create mode 100644 xen/arch/x86/guest/hyperv/util.c
>>
>> diff --git a/xen/arch/x86/guest/hyperv/Makefile b/xen/arch/x86/guest/hyperv/Makefile
>> index 18902c33e9..0e39410968 100644
>> --- a/xen/arch/x86/guest/hyperv/Makefile
>> +++ b/xen/arch/x86/guest/hyperv/Makefile
>> @@ -1,2 +1,3 @@
>>  obj-y += hyperv.o
>>  obj-y += tlb.o
>> +obj-y += util.o
>> diff --git a/xen/arch/x86/guest/hyperv/private.h b/xen/arch/x86/guest/hyperv/private.h
>> index 78e52f74ce..311f060495 100644
>> --- a/xen/arch/x86/guest/hyperv/private.h
>> +++ b/xen/arch/x86/guest/hyperv/private.h
>> @@ -24,12 +24,21 @@
>>  
>>  #include <xen/cpumask.h>
>>  #include <xen/percpu.h>
>> +#include <xen/types.h>
>>  
>>  DECLARE_PER_CPU(void *, hv_input_page);
>>  DECLARE_PER_CPU(void *, hv_vp_assist);
>>  DECLARE_PER_CPU(uint32_t, hv_vp_index);
>>  
>> +static inline uint32_t hv_vp_index(int cpu)
> 
> unsigned int for cpu.

And also for the return type, as per my comment on patch 1.

>> --- a/xen/arch/x86/guest/hyperv/tlb.c
>> +++ b/xen/arch/x86/guest/hyperv/tlb.c
>> @@ -19,15 +19,185 @@
>>   * Copyright (c) 2020 Microsoft.
>>   */
>>  
>> +#include <xen/cpu.h>
>>  #include <xen/cpumask.h>
>>  #include <xen/errno.h>
>>  
>> +#include <asm/guest/hyperv.h>
>> +#include <asm/guest/hyperv-hcall.h>
>> +#include <asm/guest/hyperv-tlfs.h>
>> +
>>  #include "private.h"
>>  
>> +/*
>> + * It is possible to encode up to 4096 pages using the lower 12 bits
>> + * in an element of gva_list
>> + */
>> +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
>> +#define ORDER_TO_BYTES(order) ((1ul << (order)) * PAGE_SIZE)
> 
> There are already some conversion functions in xen/mm.h
> (get_order_from_{bytes/pages}), maybe you could add a
> get_bytes_from_order helper there?

I don't think a macro (or helper function) is worthwhile here - we
don't have any in the various other places that do the same. The
above should be used inline, preferably in the simpler form of
PAGE_SIZE << order.

Jan
Wei Liu Feb. 13, 2020, 12:20 p.m. UTC | #3
On Wed, Feb 12, 2020 at 06:43:47PM +0100, Roger Pau Monné wrote:
> On Wed, Feb 12, 2020 at 04:09:18PM +0000, Wei Liu wrote:
> > Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
> > of several hypercalls:
> > 
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX
> > 
> > Pick the most efficient hypercalls available.
> > 
> > Signed-off-by: Wei Liu <liuwe@microsoft.com>
> > ---
> >  xen/arch/x86/guest/hyperv/Makefile  |   1 +
> >  xen/arch/x86/guest/hyperv/private.h |   9 ++
> >  xen/arch/x86/guest/hyperv/tlb.c     | 172 +++++++++++++++++++++++++++-
> >  xen/arch/x86/guest/hyperv/util.c    |  72 ++++++++++++
> >  4 files changed, 253 insertions(+), 1 deletion(-)
> >  create mode 100644 xen/arch/x86/guest/hyperv/util.c
> > 
> > diff --git a/xen/arch/x86/guest/hyperv/Makefile b/xen/arch/x86/guest/hyperv/Makefile
> > index 18902c33e9..0e39410968 100644
> > --- a/xen/arch/x86/guest/hyperv/Makefile
> > +++ b/xen/arch/x86/guest/hyperv/Makefile
> > @@ -1,2 +1,3 @@
> >  obj-y += hyperv.o
> >  obj-y += tlb.o
> > +obj-y += util.o
> > diff --git a/xen/arch/x86/guest/hyperv/private.h b/xen/arch/x86/guest/hyperv/private.h
> > index 78e52f74ce..311f060495 100644
> > --- a/xen/arch/x86/guest/hyperv/private.h
> > +++ b/xen/arch/x86/guest/hyperv/private.h
> > @@ -24,12 +24,21 @@
> >  
> >  #include <xen/cpumask.h>
> >  #include <xen/percpu.h>
> > +#include <xen/types.h>
> >  
> >  DECLARE_PER_CPU(void *, hv_input_page);
> >  DECLARE_PER_CPU(void *, hv_vp_assist);
> >  DECLARE_PER_CPU(uint32_t, hv_vp_index);
> >  
> > +static inline uint32_t hv_vp_index(int cpu)
> 
> unsigned int for cpu.
> 
> > +{
> > +    return per_cpu(hv_vp_index, cpu);
> > +}
> > +
> >  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
> >                       unsigned int flags);
> >  
> > +/* Returns number of banks, -ev if error */
> > +int cpumask_to_vpset(struct hv_vpset *vpset, const cpumask_t *mask);
> > +
> >  #endif /* __XEN_HYPERV_PRIVIATE_H__  */
> > diff --git a/xen/arch/x86/guest/hyperv/tlb.c b/xen/arch/x86/guest/hyperv/tlb.c
> > index 48f527229e..99b789d9e9 100644
> > --- a/xen/arch/x86/guest/hyperv/tlb.c
> > +++ b/xen/arch/x86/guest/hyperv/tlb.c
> > @@ -19,15 +19,185 @@
> >   * Copyright (c) 2020 Microsoft.
> >   */
> >  
> > +#include <xen/cpu.h>
> >  #include <xen/cpumask.h>
> >  #include <xen/errno.h>
> >  
> > +#include <asm/guest/hyperv.h>
> > +#include <asm/guest/hyperv-hcall.h>
> > +#include <asm/guest/hyperv-tlfs.h>
> > +
> >  #include "private.h"
> >  
> > +/*
> > + * It is possible to encode up to 4096 pages using the lower 12 bits
> > + * in an element of gva_list
> > + */
> > +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
> > +#define ORDER_TO_BYTES(order) ((1ul << (order)) * PAGE_SIZE)
> 
> There are already some conversion functions in xen/mm.h
> (get_order_from_{bytes/pages}), maybe you could add a
> get_bytes_from_order helper there?
> 
> > +
> > +static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
> > +                                  unsigned int order)
> > +{
> > +    unsigned long start = (unsigned long)va;
> > +    unsigned long end = start + ORDER_TO_BYTES(order) - 1;
> > +    unsigned int n = 0;
> > +
> > +    do {
> > +        unsigned long remain = end > start ? end - start : 0;
> 
> I don't think you can get here with end == start?
> 
> As that's the condition of the loop, and order 0 is going to set
> end = start + 4096 - 1.

Correct. This can be simplified as remain = end - start .

> 
> > +
> > +        gva_list[n] = start & PAGE_MASK;
> > +
> > +        /*
> > +         * Use lower 12 bits to encode the number of additional pages
> > +         * to flush
> > +         */
> > +        if ( remain >= HV_TLB_FLUSH_UNIT )
> > +        {
> > +            gva_list[n] |= ~PAGE_MASK;
> > +            start += HV_TLB_FLUSH_UNIT;
> > +        }
> > +        else if ( remain )
> > +        {
> > +            gva_list[n] |= (remain - 1) >> PAGE_SHIFT;
> > +            start = end;
> > +        }
> > +
> > +        n++;
> > +    } while ( start < end );
> > +
> > +    return n;
> > +}
> > +
> > +static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
> > +                             unsigned int flags)
> > +{
> > +    struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
> > +    int nr_banks;
> > +    unsigned int max_gvas;
> > +    unsigned int order = flags & FLUSH_ORDER_MASK;
> > +    uint64_t ret;
> > +
> > +    ASSERT(flush);
> > +    ASSERT(!local_irq_is_enabled());
> 
> Can you turn this into an if condition with ASSERT_UNREACHABLE and
> return ~0ULL? (as I think that signals an error).
> 

There is no need for that. This function will always be internal to
Hyper-V in the foreseeable future. If it is ever called with IRQ enabled
something is wrong with the code.

> > +
> > +    if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
> > +        return ~0ULL;
> > +
> > +    flush->address_space = 0;
> > +    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> > +    if ( !(flags & FLUSH_TLB_GLOBAL) )
> > +        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> > +
> > +    flush->hv_vp_set.valid_bank_mask = 0;
> > +    flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
> > +
> > +    nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
> > +    if ( nr_banks < 0 )
> > +        return ~0ULL;
> > +
> > +    max_gvas =
> > +        (PAGE_SIZE - sizeof(*flush) - nr_banks *
> > +         sizeof(flush->hv_vp_set.bank_contents[0])) /
> > +        sizeof(uint64_t);       /* gva is represented as uint64_t */
> > +
> > +    /*
> > +     * Flush the entire address space if va is NULL or if there is not
> > +     * enough space for gva_list.
> > +     */
> > +    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
> > +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 0,
> > +                                  nr_banks, virt_to_maddr(flush), 0);
> > +    else
> > +    {
> > +        uint64_t *gva_list = (uint64_t *)flush + sizeof(*flush) + nr_banks;
> 
> Don't you need nr_banks * sizeof(flush->hv_vp_set.bank_contents) in
> order to calculate the position of the gva_list?
> 

The pointer arithmetic is done on uint64_t pointers so it already takes
into account sizeof(bank_contents[0]).

> > +        unsigned int gvas = fill_gva_list(gva_list, va, order);
> > +
> > +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
> > +                                  gvas, nr_banks, virt_to_maddr(flush), 0);
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> >  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
> >                       unsigned int flags)
> >  {
> > -    return -EOPNOTSUPP;
> > +    unsigned long irq_flags;
> > +    struct hv_tlb_flush *flush = this_cpu(hv_input_page);
> > +    uint64_t ret;
> > +    unsigned int order = flags & FLUSH_ORDER_MASK;
> > +    unsigned int max_gvas;
> > +
> > +    ASSERT(flush);
> > +    ASSERT(!cpumask_empty(mask));
> > +
> > +    local_irq_save(irq_flags);
> > +
> > +    flush->address_space = 0;
> > +    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> > +    flush->processor_mask = 0;
> > +    if ( !(flags & FLUSH_TLB_GLOBAL) )
> > +        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> > +
> > +    if ( cpumask_equal(mask, &cpu_online_map) )
> > +        flush->flags |= HV_FLUSH_ALL_PROCESSORS;
> > +    else
> > +    {
> > +        int cpu;
> 
> unsigned int.
> 

I picked int here and above because all the cpumask functions return
int. I don't mind changing it to unsigned int -- it makes no practical
difference.

> > +
> > +        /*
> > +         * Normally VP indices are in ascending order and match Xen's
> > +         * idea of CPU ids. Check the last index to see if VP index is
> > +         * >= 64. If so, we can skip setting up parameters for
> > +         * non-applicable hypercalls without looking further.
> > +         */
> > +        if ( hv_vp_index(cpumask_last(mask)) >= 64 )
> > +            goto do_ex_hypercall;
> > +
> > +        for_each_cpu ( cpu, mask )
> > +        {
> > +            uint32_t vpid = hv_vp_index(cpu);
> > +
> > +            if ( vpid > ms_hyperv.max_vp_index )
> > +            {
> > +                local_irq_restore(irq_flags);
> > +                return -ENXIO;
> > +            }
> > +
> > +            if ( vpid >= 64 )
> > +                goto do_ex_hypercall;
> > +
> > +            __set_bit(vpid, &flush->processor_mask);
> > +        }
> > +    }
> > +
> > +    max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]);
> > +
> > +    /*
> > +     * Flush the entire address space if va is NULL or if there is not
> > +     * enough space for gva_list.
> > +     */
> > +    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
> > +        ret = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> > +                              virt_to_maddr(flush), 0);
> > +    else
> > +    {
> > +        unsigned int gvas = fill_gva_list(flush->gva_list, va, order);
> > +
> > +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, gvas, 0,
> > +                                  virt_to_maddr(flush), 0);
> > +    }
> > +
> > +    goto done;
> > +
> > + do_ex_hypercall:
> > +    ret = flush_tlb_ex(mask, va, flags);
> > +
> > + done:
> > +    local_irq_restore(irq_flags);
> > +
> > +    return ret & HV_HYPERCALL_RESULT_MASK;
> 
> Will this return an error code that uses the same space as Xen's errno
> values?
> 

No, it won't. It returns Hyper-V's status code (0 still means success).

I didn't think that was a big deal because non-zero values meant errors.
And the upper layer didn't care about the exact error values (yet).

> >  }
> >  
> >  /*
> > diff --git a/xen/arch/x86/guest/hyperv/util.c b/xen/arch/x86/guest/hyperv/util.c
> > new file mode 100644
> > index 0000000000..9d0b5f4a46
> > --- /dev/null
> > +++ b/xen/arch/x86/guest/hyperv/util.c
> > @@ -0,0 +1,72 @@
> > +/******************************************************************************
> > + * arch/x86/guest/hyperv/util.c
> > + *
> > + * Hyper-V utility functions
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; If not, see <http://www.gnu.org/licenses/>.
> > + *
> > + * Copyright (c) 2020 Microsoft.
> > + */
> > +
> > +#include <xen/cpu.h>
> > +#include <xen/cpumask.h>
> > +#include <xen/errno.h>
> > +
> > +#include <asm/guest/hyperv.h>
> > +#include <asm/guest/hyperv-tlfs.h>
> > +
> > +#include "private.h"
> > +
> > +int cpumask_to_vpset(struct hv_vpset *vpset,
> > +                     const cpumask_t *mask)
> > +{
> > +    int nr = 1, cpu, vcpu_bank, vcpu_offset;
> > +    int max_banks = ms_hyperv.max_vp_index / 64;
> 
> I think nr whats to be int (to match the function return type), but
> the rest should be unsigned ints, specially because they are used as
> array indexes.
> 

OK.

> > +
> > +    /* Up to 64 banks can be represented by valid_bank_mask */
> > +    if ( max_banks >= 64 )
> > +        return -1;
> 
> E2BIG or some such?
> 

Right. That's better than -1.

> > +
> > +    /* Clear all banks to avoid flushing unwanted CPUs */
> > +    for ( vcpu_bank = 0; vcpu_bank <= max_banks; vcpu_bank++ )
> > +        vpset->bank_contents[vcpu_bank] = 0;
> > +
> > +    vpset->valid_bank_mask = 0;
> > +
> > +    for_each_cpu ( cpu, mask )
> > +    {
> > +        int vcpu = hv_vp_index(cpu);
> 
> unsigned int or uint32_t (which is the tyupe that hv_vp_index
> returns).
> 
> Thanks, Roger.
Wei Liu Feb. 13, 2020, 12:25 p.m. UTC | #4
On Thu, Feb 13, 2020 at 10:49:39AM +0100, Jan Beulich wrote:
> >> diff --git a/xen/arch/x86/guest/hyperv/Makefile b/xen/arch/x86/guest/hyperv/Makefile
> >> index 18902c33e9..0e39410968 100644
> >> --- a/xen/arch/x86/guest/hyperv/Makefile
> >> +++ b/xen/arch/x86/guest/hyperv/Makefile
> >> @@ -1,2 +1,3 @@
> >>  obj-y += hyperv.o
> >>  obj-y += tlb.o
> >> +obj-y += util.o
> >> diff --git a/xen/arch/x86/guest/hyperv/private.h b/xen/arch/x86/guest/hyperv/private.h
> >> index 78e52f74ce..311f060495 100644
> >> --- a/xen/arch/x86/guest/hyperv/private.h
> >> +++ b/xen/arch/x86/guest/hyperv/private.h
> >> @@ -24,12 +24,21 @@
> >>  
> >>  #include <xen/cpumask.h>
> >>  #include <xen/percpu.h>
> >> +#include <xen/types.h>
> >>  
> >>  DECLARE_PER_CPU(void *, hv_input_page);
> >>  DECLARE_PER_CPU(void *, hv_vp_assist);
> >>  DECLARE_PER_CPU(uint32_t, hv_vp_index);
> >>  
> >> +static inline uint32_t hv_vp_index(int cpu)
> > 
> > unsigned int for cpu.
> 
> And also for the return type, as per my comment on patch 1.

Ack.
Roger Pau Monné Feb. 13, 2020, 12:41 p.m. UTC | #5
On Thu, Feb 13, 2020 at 12:20:33PM +0000, Wei Liu wrote:
> On Wed, Feb 12, 2020 at 06:43:47PM +0100, Roger Pau Monné wrote:
> > On Wed, Feb 12, 2020 at 04:09:18PM +0000, Wei Liu wrote:
> > > +static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
> > > +                             unsigned int flags)
> > > +{
> > > +    struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
> > > +    int nr_banks;
> > > +    unsigned int max_gvas;
> > > +    unsigned int order = flags & FLUSH_ORDER_MASK;
> > > +    uint64_t ret;
> > > +
> > > +    ASSERT(flush);
> > > +    ASSERT(!local_irq_is_enabled());
> > 
> > Can you turn this into an if condition with ASSERT_UNREACHABLE and
> > return ~0ULL? (as I think that signals an error).
> > 
> 
> There is no need for that. This function will always be internal to
> Hyper-V in the foreseeable future. If it is ever called with IRQ enabled
> something is wrong with the code.

But if it ever manages to be called violating one of those conditions
things will go badly I assume?

It would be better to stay on the safe side and simply return an error
when the conditions are not met, and assert in the debug build.

> 
> > > +
> > > +    if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
> > > +        return ~0ULL;
> > > +
> > > +    flush->address_space = 0;
> > > +    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> > > +    if ( !(flags & FLUSH_TLB_GLOBAL) )
> > > +        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> > > +
> > > +    flush->hv_vp_set.valid_bank_mask = 0;
> > > +    flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
> > > +
> > > +    nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
> > > +    if ( nr_banks < 0 )
> > > +        return ~0ULL;
> > > +
> > > +    max_gvas =
> > > +        (PAGE_SIZE - sizeof(*flush) - nr_banks *
> > > +         sizeof(flush->hv_vp_set.bank_contents[0])) /
> > > +        sizeof(uint64_t);       /* gva is represented as uint64_t */
> > > +
> > > +    /*
> > > +     * Flush the entire address space if va is NULL or if there is not
> > > +     * enough space for gva_list.
> > > +     */
> > > +    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
> > > +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 0,
> > > +                                  nr_banks, virt_to_maddr(flush), 0);
> > > +    else
> > > +    {
> > > +        uint64_t *gva_list = (uint64_t *)flush + sizeof(*flush) + nr_banks;
> > 
> > Don't you need nr_banks * sizeof(flush->hv_vp_set.bank_contents) in
> > order to calculate the position of the gva_list?
> > 
> 
> The pointer arithmetic is done on uint64_t pointers so it already takes
> into account sizeof(bank_contents[0]).

Oh, then the sizeof(*flush) should be divided by sizeof(uint64_t)?

> > > +        unsigned int gvas = fill_gva_list(gva_list, va, order);
> > > +
> > > +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
> > > +                                  gvas, nr_banks, virt_to_maddr(flush), 0);
> > > +    }
> > > +
> > > +    return ret;
> > > +}
> > > +
> > >  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
> > >                       unsigned int flags)
> > >  {
> > > -    return -EOPNOTSUPP;
> > > +    unsigned long irq_flags;
> > > +    struct hv_tlb_flush *flush = this_cpu(hv_input_page);
> > > +    uint64_t ret;
> > > +    unsigned int order = flags & FLUSH_ORDER_MASK;
> > > +    unsigned int max_gvas;
> > > +
> > > +    ASSERT(flush);
> > > +    ASSERT(!cpumask_empty(mask));
> > > +
> > > +    local_irq_save(irq_flags);
> > > +
> > > +    flush->address_space = 0;
> > > +    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> > > +    flush->processor_mask = 0;
> > > +    if ( !(flags & FLUSH_TLB_GLOBAL) )
> > > +        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> > > +
> > > +    if ( cpumask_equal(mask, &cpu_online_map) )
> > > +        flush->flags |= HV_FLUSH_ALL_PROCESSORS;
> > > +    else
> > > +    {
> > > +        int cpu;
> > 
> > unsigned int.
> > 
> 
> I picked int here and above because all the cpumask functions return
> int. I don't mind changing it to unsigned int -- it makes no practical
> difference.

Those should likely return unsigned ints also, as I don't think
cpumask can return errors. I prefer unsigned int, since negative cpu
values make no sense.

> > > +
> > > +        /*
> > > +         * Normally VP indices are in ascending order and match Xen's
> > > +         * idea of CPU ids. Check the last index to see if VP index is
> > > +         * >= 64. If so, we can skip setting up parameters for
> > > +         * non-applicable hypercalls without looking further.
> > > +         */
> > > +        if ( hv_vp_index(cpumask_last(mask)) >= 64 )
> > > +            goto do_ex_hypercall;
> > > +
> > > +        for_each_cpu ( cpu, mask )
> > > +        {
> > > +            uint32_t vpid = hv_vp_index(cpu);
> > > +
> > > +            if ( vpid > ms_hyperv.max_vp_index )
> > > +            {
> > > +                local_irq_restore(irq_flags);
> > > +                return -ENXIO;
> > > +            }
> > > +
> > > +            if ( vpid >= 64 )
> > > +                goto do_ex_hypercall;
> > > +
> > > +            __set_bit(vpid, &flush->processor_mask);
> > > +        }
> > > +    }
> > > +
> > > +    max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]);
> > > +
> > > +    /*
> > > +     * Flush the entire address space if va is NULL or if there is not
> > > +     * enough space for gva_list.
> > > +     */
> > > +    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
> > > +        ret = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> > > +                              virt_to_maddr(flush), 0);
> > > +    else
> > > +    {
> > > +        unsigned int gvas = fill_gva_list(flush->gva_list, va, order);
> > > +
> > > +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, gvas, 0,
> > > +                                  virt_to_maddr(flush), 0);
> > > +    }
> > > +
> > > +    goto done;
> > > +
> > > + do_ex_hypercall:
> > > +    ret = flush_tlb_ex(mask, va, flags);
> > > +
> > > + done:
> > > +    local_irq_restore(irq_flags);
> > > +
> > > +    return ret & HV_HYPERCALL_RESULT_MASK;
> > 
> > Will this return an error code that uses the same space as Xen's errno
> > values?
> > 
> 
> No, it won't. It returns Hyper-V's status code (0 still means success).
> 
> I didn't think that was a big deal because non-zero values meant errors.
> And the upper layer didn't care about the exact error values (yet).

Hm, I would rather have this return an error value in the errno.h
range. ie:

return ret & HV_HYPERCALL_RESULT_MASK ? -EINVAL : 0;

Or something along this lines, but long term you will need some kind
of mapping between HyperV and Xen error codes IMO.

Thanks, Roger.
Wei Liu Feb. 14, 2020, 10:47 a.m. UTC | #6
On Thu, Feb 13, 2020 at 01:41:27PM +0100, Roger Pau Monné wrote:
> On Thu, Feb 13, 2020 at 12:20:33PM +0000, Wei Liu wrote:
> > On Wed, Feb 12, 2020 at 06:43:47PM +0100, Roger Pau Monné wrote:
> > > On Wed, Feb 12, 2020 at 04:09:18PM +0000, Wei Liu wrote:
> > > > +static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
> > > > +                             unsigned int flags)
> > > > +{
> > > > +    struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
> > > > +    int nr_banks;
> > > > +    unsigned int max_gvas;
> > > > +    unsigned int order = flags & FLUSH_ORDER_MASK;
> > > > +    uint64_t ret;
> > > > +
> > > > +    ASSERT(flush);
> > > > +    ASSERT(!local_irq_is_enabled());
> > > 
> > > Can you turn this into an if condition with ASSERT_UNREACHABLE and
> > > return ~0ULL? (as I think that signals an error).
> > > 
> > 
> > There is no need for that. This function will always be internal to
> > Hyper-V in the foreseeable future. If it is ever called with IRQ enabled
> > something is wrong with the code.
> 
> But if it ever manages to be called violating one of those conditions
> things will go badly I assume?
> 
> It would be better to stay on the safe side and simply return an error
> when the conditions are not met, and assert in the debug build.

OK.

> 
> > 
> > > > +
> > > > +    if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
> > > > +        return ~0ULL;
> > > > +
> > > > +    flush->address_space = 0;
> > > > +    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> > > > +    if ( !(flags & FLUSH_TLB_GLOBAL) )
> > > > +        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> > > > +
> > > > +    flush->hv_vp_set.valid_bank_mask = 0;
> > > > +    flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
> > > > +
> > > > +    nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
> > > > +    if ( nr_banks < 0 )
> > > > +        return ~0ULL;
> > > > +
> > > > +    max_gvas =
> > > > +        (PAGE_SIZE - sizeof(*flush) - nr_banks *
> > > > +         sizeof(flush->hv_vp_set.bank_contents[0])) /
> > > > +        sizeof(uint64_t);       /* gva is represented as uint64_t */
> > > > +
> > > > +    /*
> > > > +     * Flush the entire address space if va is NULL or if there is not
> > > > +     * enough space for gva_list.
> > > > +     */
> > > > +    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
> > > > +        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 0,
> > > > +                                  nr_banks, virt_to_maddr(flush), 0);
> > > > +    else
> > > > +    {
> > > > +        uint64_t *gva_list = (uint64_t *)flush + sizeof(*flush) + nr_banks;
> > > 
> > > Don't you need nr_banks * sizeof(flush->hv_vp_set.bank_contents) in
> > > order to calculate the position of the gva_list?
> > > 
> > 
> > The pointer arithmetic is done on uint64_t pointers so it already takes
> > into account sizeof(bank_contents[0]).
> 
> Oh, then the sizeof(*flush) should be divided by sizeof(uint64_t)?
> 

Yes. I think so. Thanks for catching this.

[...]
> > > > + do_ex_hypercall:
> > > > +    ret = flush_tlb_ex(mask, va, flags);
> > > > +
> > > > + done:
> > > > +    local_irq_restore(irq_flags);
> > > > +
> > > > +    return ret & HV_HYPERCALL_RESULT_MASK;
> > > 
> > > Will this return an error code that uses the same space as Xen's errno
> > > values?
> > > 
> > 
> > No, it won't. It returns Hyper-V's status code (0 still means success).
> > 
> > I didn't think that was a big deal because non-zero values meant errors.
> > And the upper layer didn't care about the exact error values (yet).
> 
> Hm, I would rather have this return an error value in the errno.h
> range. ie:
> 
> return ret & HV_HYPERCALL_RESULT_MASK ? -EINVAL : 0;
> 

Sure this can be done. I would use ENXIO rather than EINVAL though.

> Or something along this lines, but long term you will need some kind
> of mapping between HyperV and Xen error codes IMO.
> 

Yes. When we need more sophisticated handling of error codes.

Wei.

> Thanks, Roger.
diff mbox series

Patch

diff --git a/xen/arch/x86/guest/hyperv/Makefile b/xen/arch/x86/guest/hyperv/Makefile
index 18902c33e9..0e39410968 100644
--- a/xen/arch/x86/guest/hyperv/Makefile
+++ b/xen/arch/x86/guest/hyperv/Makefile
@@ -1,2 +1,3 @@ 
 obj-y += hyperv.o
 obj-y += tlb.o
+obj-y += util.o
diff --git a/xen/arch/x86/guest/hyperv/private.h b/xen/arch/x86/guest/hyperv/private.h
index 78e52f74ce..311f060495 100644
--- a/xen/arch/x86/guest/hyperv/private.h
+++ b/xen/arch/x86/guest/hyperv/private.h
@@ -24,12 +24,21 @@ 
 
 #include <xen/cpumask.h>
 #include <xen/percpu.h>
+#include <xen/types.h>
 
 DECLARE_PER_CPU(void *, hv_input_page);
 DECLARE_PER_CPU(void *, hv_vp_assist);
 DECLARE_PER_CPU(uint32_t, hv_vp_index);
 
+static inline uint32_t hv_vp_index(int cpu)
+{
+    return per_cpu(hv_vp_index, cpu);
+}
+
 int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
                      unsigned int flags);
 
+/* Returns number of banks, -ev if error */
+int cpumask_to_vpset(struct hv_vpset *vpset, const cpumask_t *mask);
+
 #endif /* __XEN_HYPERV_PRIVIATE_H__  */
diff --git a/xen/arch/x86/guest/hyperv/tlb.c b/xen/arch/x86/guest/hyperv/tlb.c
index 48f527229e..99b789d9e9 100644
--- a/xen/arch/x86/guest/hyperv/tlb.c
+++ b/xen/arch/x86/guest/hyperv/tlb.c
@@ -19,15 +19,185 @@ 
  * Copyright (c) 2020 Microsoft.
  */
 
+#include <xen/cpu.h>
 #include <xen/cpumask.h>
 #include <xen/errno.h>
 
+#include <asm/guest/hyperv.h>
+#include <asm/guest/hyperv-hcall.h>
+#include <asm/guest/hyperv-tlfs.h>
+
 #include "private.h"
 
+/*
+ * It is possible to encode up to 4096 pages using the lower 12 bits
+ * in an element of gva_list
+ */
+#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
+#define ORDER_TO_BYTES(order) ((1ul << (order)) * PAGE_SIZE)
+
+static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
+                                  unsigned int order)
+{
+    unsigned long start = (unsigned long)va;
+    unsigned long end = start + ORDER_TO_BYTES(order) - 1;
+    unsigned int n = 0;
+
+    do {
+        unsigned long remain = end > start ? end - start : 0;
+
+        gva_list[n] = start & PAGE_MASK;
+
+        /*
+         * Use lower 12 bits to encode the number of additional pages
+         * to flush
+         */
+        if ( remain >= HV_TLB_FLUSH_UNIT )
+        {
+            gva_list[n] |= ~PAGE_MASK;
+            start += HV_TLB_FLUSH_UNIT;
+        }
+        else if ( remain )
+        {
+            gva_list[n] |= (remain - 1) >> PAGE_SHIFT;
+            start = end;
+        }
+
+        n++;
+    } while ( start < end );
+
+    return n;
+}
+
+static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
+                             unsigned int flags)
+{
+    struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
+    int nr_banks;
+    unsigned int max_gvas;
+    unsigned int order = flags & FLUSH_ORDER_MASK;
+    uint64_t ret;
+
+    ASSERT(flush);
+    ASSERT(!local_irq_is_enabled());
+
+    if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
+        return ~0ULL;
+
+    flush->address_space = 0;
+    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+    if ( !(flags & FLUSH_TLB_GLOBAL) )
+        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
+
+    flush->hv_vp_set.valid_bank_mask = 0;
+    flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+
+    nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
+    if ( nr_banks < 0 )
+        return ~0ULL;
+
+    max_gvas =
+        (PAGE_SIZE - sizeof(*flush) - nr_banks *
+         sizeof(flush->hv_vp_set.bank_contents[0])) /
+        sizeof(uint64_t);       /* gva is represented as uint64_t */
+
+    /*
+     * Flush the entire address space if va is NULL or if there is not
+     * enough space for gva_list.
+     */
+    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
+        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 0,
+                                  nr_banks, virt_to_maddr(flush), 0);
+    else
+    {
+        uint64_t *gva_list = (uint64_t *)flush + sizeof(*flush) + nr_banks;
+        unsigned int gvas = fill_gva_list(gva_list, va, order);
+
+        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
+                                  gvas, nr_banks, virt_to_maddr(flush), 0);
+    }
+
+    return ret;
+}
+
 int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
                      unsigned int flags)
 {
-    return -EOPNOTSUPP;
+    unsigned long irq_flags;
+    struct hv_tlb_flush *flush = this_cpu(hv_input_page);
+    uint64_t ret;
+    unsigned int order = flags & FLUSH_ORDER_MASK;
+    unsigned int max_gvas;
+
+    ASSERT(flush);
+    ASSERT(!cpumask_empty(mask));
+
+    local_irq_save(irq_flags);
+
+    flush->address_space = 0;
+    flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+    flush->processor_mask = 0;
+    if ( !(flags & FLUSH_TLB_GLOBAL) )
+        flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
+
+    if ( cpumask_equal(mask, &cpu_online_map) )
+        flush->flags |= HV_FLUSH_ALL_PROCESSORS;
+    else
+    {
+        int cpu;
+
+        /*
+         * Normally VP indices are in ascending order and match Xen's
+         * idea of CPU ids. Check the last index to see if VP index is
+         * >= 64. If so, we can skip setting up parameters for
+         * non-applicable hypercalls without looking further.
+         */
+        if ( hv_vp_index(cpumask_last(mask)) >= 64 )
+            goto do_ex_hypercall;
+
+        for_each_cpu ( cpu, mask )
+        {
+            uint32_t vpid = hv_vp_index(cpu);
+
+            if ( vpid > ms_hyperv.max_vp_index )
+            {
+                local_irq_restore(irq_flags);
+                return -ENXIO;
+            }
+
+            if ( vpid >= 64 )
+                goto do_ex_hypercall;
+
+            __set_bit(vpid, &flush->processor_mask);
+        }
+    }
+
+    max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]);
+
+    /*
+     * Flush the entire address space if va is NULL or if there is not
+     * enough space for gva_list.
+     */
+    if ( !va || (ORDER_TO_BYTES(order) / HV_TLB_FLUSH_UNIT) > max_gvas )
+        ret = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
+                              virt_to_maddr(flush), 0);
+    else
+    {
+        unsigned int gvas = fill_gva_list(flush->gva_list, va, order);
+
+        ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, gvas, 0,
+                                  virt_to_maddr(flush), 0);
+    }
+
+    goto done;
+
+ do_ex_hypercall:
+    ret = flush_tlb_ex(mask, va, flags);
+
+ done:
+    local_irq_restore(irq_flags);
+
+    return ret & HV_HYPERCALL_RESULT_MASK;
 }
 
 /*
diff --git a/xen/arch/x86/guest/hyperv/util.c b/xen/arch/x86/guest/hyperv/util.c
new file mode 100644
index 0000000000..9d0b5f4a46
--- /dev/null
+++ b/xen/arch/x86/guest/hyperv/util.c
@@ -0,0 +1,72 @@ 
+/******************************************************************************
+ * arch/x86/guest/hyperv/util.c
+ *
+ * Hyper-V utility functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (c) 2020 Microsoft.
+ */
+
+#include <xen/cpu.h>
+#include <xen/cpumask.h>
+#include <xen/errno.h>
+
+#include <asm/guest/hyperv.h>
+#include <asm/guest/hyperv-tlfs.h>
+
+#include "private.h"
+
+int cpumask_to_vpset(struct hv_vpset *vpset,
+                     const cpumask_t *mask)
+{
+    int nr = 1, cpu, vcpu_bank, vcpu_offset;
+    int max_banks = ms_hyperv.max_vp_index / 64;
+
+    /* Up to 64 banks can be represented by valid_bank_mask */
+    if ( max_banks >= 64 )
+        return -1;
+
+    /* Clear all banks to avoid flushing unwanted CPUs */
+    for ( vcpu_bank = 0; vcpu_bank <= max_banks; vcpu_bank++ )
+        vpset->bank_contents[vcpu_bank] = 0;
+
+    vpset->valid_bank_mask = 0;
+
+    for_each_cpu ( cpu, mask )
+    {
+        int vcpu = hv_vp_index(cpu);
+
+        vcpu_bank = vcpu / 64;
+        vcpu_offset = vcpu % 64;
+
+        __set_bit(vcpu_offset, &vpset->bank_contents[vcpu_bank]);
+        __set_bit(vcpu_bank, &vpset->valid_bank_mask);
+
+        if ( vcpu_bank >= nr )
+            nr = vcpu_bank + 1;
+    }
+
+    return nr;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */