
[V2,2/3] xen/arm: Add handling of extended regions for Dom0

Message ID 1631297924-8658-3-git-send-email-olekstysh@gmail.com (mailing list archive)
State Superseded
Series Add handling of extended regions (safe ranges) on Arm (Was "xen/memory: Introduce a hypercall to provide unallocated space")

Commit Message

Oleksandr Tyshchenko Sept. 10, 2021, 6:18 p.m. UTC
From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>

The extended region (safe range) is a region of guest physical
address space which is unused and could be safely used to create
grant/foreign mappings instead of wasting real RAM pages from
the domain memory for establishing these mappings.

The extended regions are chosen at domain creation time and
advertised to the domain via the "reg" property under the hypervisor
node in the guest device-tree. As region 0 is reserved for the grant
table space (always present), the indexes for extended regions are
1...N. If the extended regions could not be allocated for some reason,
Xen doesn't fail and behaves as usual, only inserting region 0.

Please note the following limitations:
- The extended region feature is only supported for 64-bit domains.
- The ACPI case is not covered.

***

As Dom0 is a direct mapped domain on Arm (i.e. MFN == GFN),
the algorithm to choose extended regions for it is different
from the algorithm for a non-direct mapped DomU. What is more,
the extended regions need to be chosen differently depending on
whether the IOMMU is enabled or not.

Provide RAM not assigned to Dom0 if the IOMMU is disabled, or
memory holes found in the host device-tree otherwise. Make sure
that the extended regions are 2MB-aligned and located within the
maximum possible addressable physical memory range. The maximum
number of extended regions is 128.

Suggested-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
---
Changes since RFC:
   - update patch description
   - drop unneeded "extended-region" DT property
---

 xen/arch/arm/domain_build.c | 226 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 224 insertions(+), 2 deletions(-)

Comments

Stefano Stabellini Sept. 14, 2021, 12:55 a.m. UTC | #1
On Fri, 10 Sep 2021, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> 
> The extended region (safe range) is a region of guest physical
> address space which is unused and could be safely used to create
> grant/foreign mappings instead of wasting real RAM pages from
> the domain memory for establishing these mappings.
> 
> The extended regions are chosen at the domain creation time and
> advertised to it via "reg" property under hypervisor node in
> the guest device-tree. As region 0 is reserved for grant table
> space (always present), the indexes for extended regions are 1...N.
> If extended regions could not be allocated for some reason,
> Xen doesn't fail and behaves as usual, so only inserts region 0.
> 
> Please note the following limitations:
> - The extended region feature is only supported for 64-bit domain.
> - The ACPI case is not covered.
> 
> ***
> 
> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
> the algorithm to choose extended regions for it is different
> in comparison with the algorithm for non-direct mapped DomU.
> What is more, that extended regions should be chosen differently
> whether IOMMU is enabled or not.
> 
> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
> holes found in host device-tree if otherwise. Make sure that
> extended regions are 2MB-aligned and located within maximum possible
> addressable physical memory range. The maximum number of extended
> regions is 128.
> 
> Suggested-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> ---
> Changes since RFC:
>    - update patch description
>    - drop uneeded "extended-region" DT property
> ---
> 
>  xen/arch/arm/domain_build.c | 226 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 224 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index 206038d..070ec27 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct domain *d,
>      return res;
>  }
>  
> +static int __init add_ext_regions(unsigned long s, unsigned long e, void *data)
> +{
> +    struct meminfo *ext_regions = data;
> +    paddr_t start, size;
> +
> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
> +        return 0;
> +
> +    /* Both start and size of the extended region should be 2MB aligned */
> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
> +    if ( start > e )
> +        return 0;
> +
> +    size = (e - start + 1) & ~(SZ_2M - 1);
> +    if ( !size )
> +        return 0;

Can't you align size as well?

  size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);


> +    ext_regions->bank[ext_regions->nr_banks].start = start;
> +    ext_regions->bank[ext_regions->nr_banks].size = size;
> +    ext_regions->nr_banks ++;
                            ^ no space

> +    return 0;
> +}
> +
> +/*
> + * The extended regions will be prevalidated by the memory hotplug path
> + * in Linux which requires for any added address range to be within maximum
> + * possible addressable physical memory range for which the linear mapping
> + * could be created.
> + * For 48-bit VA space size the maximum addressable range are:
> + * 0x40000000 - 0x80003fffffff

Please don't make Linux-specific comments in Xen code for interfaces
that are supposed to be OS-agnostic.


> + */
> +#define EXT_REGION_START   0x40000000ULL
> +#define EXT_REGION_END     0x80003fffffffULL
> +
> +static int __init find_unallocated_memory(const struct kernel_info *kinfo,
> +                                          struct meminfo *ext_regions)
> +{
> +    const struct meminfo *assign_mem = &kinfo->mem;
> +    struct rangeset *unalloc_mem;
> +    paddr_t start, end;
> +    unsigned int i;
> +    int res;
> +
> +    dt_dprintk("Find unallocated memory for extended regions\n");
> +
> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> +    if ( !unalloc_mem )
> +        return -ENOMEM;
> +
> +    /* Start with all available RAM */
> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> +    {
> +        start = bootinfo.mem.bank[i].start;
> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;

Is the -1 needed? Isn't it going to screw up the size calculation later?


> +        res = rangeset_add_range(unalloc_mem, start, end);
> +        if ( res )
> +        {
> +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> +                   start, end);
> +            goto out;
> +        }
> +    }
> +
> +    /* Remove RAM assigned to Dom0 */
> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> +    {
> +        start = assign_mem->bank[i].start;
> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
> +        res = rangeset_remove_range(unalloc_mem, start, end);
> +        if ( res )
> +        {
> +            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +                   start, end);
> +            goto out;
> +        }
> +    }
> +
> +    /* Remove reserved-memory regions */
> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> +    {
> +        start = bootinfo.reserved_mem.bank[i].start;
> +        end = bootinfo.reserved_mem.bank[i].start +
> +            bootinfo.reserved_mem.bank[i].size - 1;
> +        res = rangeset_remove_range(unalloc_mem, start, end);
> +        if ( res )
> +        {
> +            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +                   start, end);
> +            goto out;
> +        }
> +    }
> +
> +    /* Remove grant table region */
> +    start = kinfo->gnttab_start;
> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> +    res = rangeset_remove_range(unalloc_mem, start, end);
> +    if ( res )
> +    {
> +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +               start, end);
> +        goto out;
> +    }
> +
> +    start = EXT_REGION_START;
> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> +    res = rangeset_report_ranges(unalloc_mem, start, end,
> +                                 add_ext_regions, ext_regions);
> +    if ( res )
> +        ext_regions->nr_banks = 0;
> +    else if ( !ext_regions->nr_banks )
> +        res = -ENOENT;
> +
> +out:
> +    rangeset_destroy(unalloc_mem);
> +
> +    return res;
> +}
> +
> +static int __init find_memory_holes(const struct kernel_info *kinfo,
> +                                    struct meminfo *ext_regions)
> +{
> +    struct dt_device_node *np;
> +    struct rangeset *mem_holes;
> +    paddr_t start, end;
> +    unsigned int i;
> +    int res;
> +
> +    dt_dprintk("Find memory holes for extended regions\n");
> +
> +    mem_holes = rangeset_new(NULL, NULL, 0);
> +    if ( !mem_holes )
> +        return -ENOMEM;
> +
> +    /* Start with maximum possible addressable physical memory range */
> +    start = EXT_REGION_START;
> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> +    res = rangeset_add_range(mem_holes, start, end);
> +    if ( res )
> +    {
> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> +               start, end);
> +        goto out;
> +    }
> +
> +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
> +    dt_for_each_device_node( dt_host, np )

Don't you need something like device_tree_for_each_node ?
dt_for_each_device_node won't go down any deeper in the tree?

Alternatively, maybe we could simply record the highest possible address
of any memory/device/anything as we scan the device tree with
handle_node. Then we can use that as the starting point here. So that we
don't need to scan the device tree twice, and also we don't need my
suggestion below to remove 1GB-aligned 1GB-multiple regions.


> +    {
> +        unsigned int naddr;
> +        u64 addr, size;
> +
> +        naddr = dt_number_of_address(np);
> +
> +        for ( i = 0; i < naddr; i++ )
> +        {
> +            res = dt_device_get_address(np, i, &addr, &size);
> +            if ( res )
> +            {
> +                printk(XENLOG_ERR "Unable to retrieve address %u for %s\n",
> +                       i, dt_node_full_name(np));

It might be possible for a device not to have a range if it doesn't have
any MMIO regions, right? For instance, certain ARM timer nodes. I would
not print any errors and continue.


> +                goto out;
> +            }
> +
> +            start = addr & PAGE_MASK;
> +            end = PAGE_ALIGN(addr + size) - 1;
> +            res = rangeset_remove_range(mem_holes, start, end);
> +            if ( res )
> +            {
> +                printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +                       start, end);
> +                goto out;
> +            }
> +        }
> +    }

As is, it will result in a myriad of small ranges which is not useful
and slow to parse. I suggest simplifying it by removing a larger region
than strictly necessary. For instance, you could remove a 1GB-aligned
and 1GB-multiple region for each range. That way, you are going to get
a few large free ranges instead of many small ones which we don't need.
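
For illustration only, a standalone sketch of the rounding arithmetic
(SZ_1G and the example range are made-up values, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

#define SZ_1G 0x40000000ULL

int main(void)
{
    /* Example "reg" range taken from a device node (made-up values). */
    uint64_t addr = 0xe6150000ULL, size = 0x1000ULL;

    /* Round the range out to a 1GB-aligned, 1GB-multiple region. */
    uint64_t start = addr & ~(SZ_1G - 1);
    uint64_t end   = ((addr + size + SZ_1G - 1) & ~(SZ_1G - 1)) - 1;

    /* Prints 0xc0000000 -> 0xffffffff, i.e. the whole 1GB block
     * containing the device is removed in one go. */
    printf("remove %#llx -> %#llx\n",
           (unsigned long long)start, (unsigned long long)end);
    return 0;
}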


> +    start = EXT_REGION_START;
> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> +    res = rangeset_report_ranges(mem_holes, start, end,
> +                                 add_ext_regions,  ext_regions);
> +    if ( res )
> +        ext_regions->nr_banks = 0;
> +    else if ( !ext_regions->nr_banks )
> +        res = -ENOENT;
> +
> +out:
> +    rangeset_destroy(mem_holes);
> +
> +    return res;
> +}
> +
>  static int __init make_hypervisor_node(struct domain *d,
>                                         const struct kernel_info *kinfo,
>                                         int addrcells, int sizecells)
> @@ -731,11 +921,13 @@ static int __init make_hypervisor_node(struct domain *d,
>      const char compat[] =
>          "xen,xen-"__stringify(XEN_VERSION)"."__stringify(XEN_SUBVERSION)"\0"
>          "xen,xen";
> -    __be32 reg[4];
> +    __be32 reg[(NR_MEM_BANKS + 1) * 4];

If you are xzalloc'ing struct meminfo then you might as well xzalloc reg
too. Or keep both on the stack with a lower NR_MEM_BANKS.


>      gic_interrupt_t intr;
>      __be32 *cells;
>      int res;
>      void *fdt = kinfo->fdt;
> +    struct meminfo *ext_regions;
> +    unsigned int i;
>  
>      dt_dprintk("Create hypervisor node\n");
>  
> @@ -757,12 +949,42 @@ static int __init make_hypervisor_node(struct domain *d,
>      if ( res )
>          return res;
>  
> +    ext_regions = xzalloc(struct meminfo);
> +    if ( !ext_regions )
> +        return -ENOMEM;
> +
> +    if ( is_32bit_domain(d) )
> +        printk(XENLOG_WARNING "The extended region is only supported for 64-bit guest\n");

This is going to add an unconditional warning to all 32bit boots. I
would skip it entirely or only keep it as XENLOG_DEBUG.


> +    else
> +    {
> +        if ( !is_iommu_enabled(d) )
> +            res = find_unallocated_memory(kinfo, ext_regions);
> +        else
> +            res = find_memory_holes(kinfo, ext_regions);
> +
> +        if ( res )
> +            printk(XENLOG_WARNING "Failed to allocate extended regions\n");
> +    }
> +
>      /* reg 0 is grant table space */
>      cells = &reg[0];
>      dt_child_set_range(&cells, addrcells, sizecells,
>                         kinfo->gnttab_start, kinfo->gnttab_size);
> +    /* reg 1...N are extended regions */
> +    for ( i = 0; i < ext_regions->nr_banks; i++ )
> +    {
> +        u64 start = ext_regions->bank[i].start;
> +        u64 size = ext_regions->bank[i].size;
> +
> +        dt_dprintk("Extended region %d: %#"PRIx64"->%#"PRIx64"\n",
> +                   i, start, start + size);
> +
> +        dt_child_set_range(&cells, addrcells, sizecells, start, size);
> +    }
> +    xfree(ext_regions);
> +
>      res = fdt_property(fdt, "reg", reg,
> -                       dt_cells_to_size(addrcells + sizecells));
> +                       dt_cells_to_size(addrcells + sizecells) * (i + 1));
>      if ( res )
>          return res;
>  
> -- 
> 2.7.4
>
Oleksandr Tyshchenko Sept. 15, 2021, 7:10 p.m. UTC | #2
On 14.09.21 03:55, Stefano Stabellini wrote:

Hi Stefano

> On Fri, 10 Sep 2021, Oleksandr Tyshchenko wrote:
>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>
>> The extended region (safe range) is a region of guest physical
>> address space which is unused and could be safely used to create
>> grant/foreign mappings instead of wasting real RAM pages from
>> the domain memory for establishing these mappings.
>>
>> The extended regions are chosen at the domain creation time and
>> advertised to it via "reg" property under hypervisor node in
>> the guest device-tree. As region 0 is reserved for grant table
>> space (always present), the indexes for extended regions are 1...N.
>> If extended regions could not be allocated for some reason,
>> Xen doesn't fail and behaves as usual, so only inserts region 0.
>>
>> Please note the following limitations:
>> - The extended region feature is only supported for 64-bit domain.
>> - The ACPI case is not covered.
>>
>> ***
>>
>> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
>> the algorithm to choose extended regions for it is different
>> in comparison with the algorithm for non-direct mapped DomU.
>> What is more, that extended regions should be chosen differently
>> whether IOMMU is enabled or not.
>>
>> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
>> holes found in host device-tree if otherwise. Make sure that
>> extended regions are 2MB-aligned and located within maximum possible
>> addressable physical memory range. The maximum number of extended
>> regions is 128.
>>
>> Suggested-by: Julien Grall <jgrall@amazon.com>
>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>> ---
>> Changes since RFC:
>>     - update patch description
>>     - drop uneeded "extended-region" DT property
>> ---
>>
>>   xen/arch/arm/domain_build.c | 226 +++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 224 insertions(+), 2 deletions(-)
>>
>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>> index 206038d..070ec27 100644
>> --- a/xen/arch/arm/domain_build.c
>> +++ b/xen/arch/arm/domain_build.c
>> @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct domain *d,
>>       return res;
>>   }
>>   
>> +static int __init add_ext_regions(unsigned long s, unsigned long e, void *data)
>> +{
>> +    struct meminfo *ext_regions = data;
>> +    paddr_t start, size;
>> +
>> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
>> +        return 0;
>> +
>> +    /* Both start and size of the extended region should be 2MB aligned */
>> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
>> +    if ( start > e )
>> +        return 0;
>> +
>> +    size = (e - start + 1) & ~(SZ_2M - 1);
>> +    if ( !size )
>> +        return 0;
> Can't you align size as well?
>
>    size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);

I am sorry, I don't entirely get what you really meant here. We get both 
start and size 2MB-aligned by the calculations above
(when calculating an alignment, we need to make sure that "start_passed 
<= start_aligned && size_aligned <= size_passed").
If I add the proposed line afterwards, I will reduce the already
aligned size by 2MB.
If I replace the size calculation with the following, I will get a
reduced size even when the passed region is initially 2MB-aligned and
doesn't need to be adjusted:
size = e - s + 1;
size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
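
To double-check that arithmetic outside of Xen, here is a standalone
sketch of the calculation done in add_ext_regions() (the input range is
just an example):

#include <stdint.h>
#include <stdio.h>

#define SZ_2M 0x200000ULL

int main(void)
{
    /* Example inclusive range reported by the rangeset. */
    uint64_t s = 0x57000000ULL, e = 0x5fffffffULL;

    /* Round the start up and the resulting size down to 2MB. */
    uint64_t start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
    uint64_t size  = (e - start + 1) & ~(SZ_2M - 1);

    /* Prints start 0x57000000 size 0x9000000: both are 2MB-aligned,
     * start >= s and start + size - 1 <= e. */
    printf("start %#llx size %#llx\n",
           (unsigned long long)start, (unsigned long long)size);
    return 0;
}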

>
>> +    ext_regions->bank[ext_regions->nr_banks].start = start;
>> +    ext_regions->bank[ext_regions->nr_banks].size = size;
>> +    ext_regions->nr_banks ++;
>                              ^ no space

ok


>
>> +    return 0;
>> +}
>> +
>> +/*
>> + * The extended regions will be prevalidated by the memory hotplug path
>> + * in Linux which requires for any added address range to be within maximum
>> + * possible addressable physical memory range for which the linear mapping
>> + * could be created.
>> + * For 48-bit VA space size the maximum addressable range are:
>> + * 0x40000000 - 0x80003fffffff
> Please don't make Linux-specific comments in Xen code for interfaces
> that are supposed to be OS-agnostic.

You are right. I just wanted to describe where these magic numbers come 
from.
Someone might question why, for example, "0 ... max_gpaddr" can't be 
used. I will move the Linux-specific comment to the commit message to
keep some justification for these numbers.


>> + */
>> +#define EXT_REGION_START   0x40000000ULL
>> +#define EXT_REGION_END     0x80003fffffffULL
>> +
>> +static int __init find_unallocated_memory(const struct kernel_info *kinfo,
>> +                                          struct meminfo *ext_regions)
>> +{
>> +    const struct meminfo *assign_mem = &kinfo->mem;
>> +    struct rangeset *unalloc_mem;
>> +    paddr_t start, end;
>> +    unsigned int i;
>> +    int res;
>> +
>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>> +
>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>> +    if ( !unalloc_mem )
>> +        return -ENOMEM;
>> +
>> +    /* Start with all available RAM */
>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>> +    {
>> +        start = bootinfo.mem.bank[i].start;
>> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
> Is the -1 needed? Isn't it going to screw up the size calculation later?
I thought it was needed. The calculation seems correct.
For example, in my setup when the IOMMU is disabled for Dom0 ("RAM
unallocated to Dom0"):

--- All available RAM and reserved memory found in DT:

(XEN) Initrd 0000000084000040-0000000085effc48
(XEN) RAM: 0000000048000000 - 00000000bfffffff <--- RAM bank 0
(XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1
(XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2
(XEN) RAM: 0000000700000000 - 000000077fffffff <--- RAM bank 3
(XEN)
(XEN) MODULE[0]: 0000000078080000 - 00000000781d74c8 Xen
(XEN) MODULE[1]: 0000000057fe7000 - 0000000057ffd080 Device Tree
(XEN) MODULE[2]: 0000000084000040 - 0000000085effc48 Ramdisk
(XEN) MODULE[3]: 000000008a000000 - 000000008c000000 Kernel
(XEN) MODULE[4]: 000000008c000000 - 000000008c010000 XSM
(XEN)  RESVD[0]: 0000000084000040 - 0000000085effc48
(XEN)  RESVD[1]: 0000000054000000 - 0000000056ffffff  <--- Reserved memory

--- Dom0 RAM:

(XEN) Allocating 1:1 mappings totalling 256MB for dom0:
(XEN) BANK[0] 0x00000060000000-0x00000070000000 (256MB)

--- Dom0 grant table range:

(XEN) Grant table range: 0x00000078080000-0x000000780c0000

--- Calculated extended regions printed in make_hypervisor_node():

printk("Extended region %d: %#"PRIx64"->%#"PRIx64"\n", i, start, start + 
size);

(XEN) Extended region 0: 0x48000000->0x54000000
(XEN) Extended region 1: 0x57000000->0x60000000
(XEN) Extended region 2: 0x70000000->0x78000000
(XEN) Extended region 3: 0x78200000->0xc0000000
(XEN) Extended region 4: 0x500000000->0x580000000
(XEN) Extended region 5: 0x600000000->0x680000000
(XEN) Extended region 6: 0x700000000->0x780000000

--- Resulted hypervisor node in Dom0 DT:

hypervisor {
         interrupts = <0x01 0x00 0xf08>;
         interrupt-parent = <0x19>;
         compatible = "xen,xen-4.16\0xen,xen";
         reg = <0x00 0x78080000 0x00 0x40000
                0x00 0x48000000 0x00 0xc000000
                0x00 0x57000000 0x00 0x9000000
                0x00 0x70000000 0x00 0x8000000
                0x00 0x78200000 0x00 0x47e00000
                0x05 0x00 0x00 0x80000000
                0x06 0x00 0x00 0x80000000
                0x07 0x00 0x00 0x80000000>;
};

Where index 0 corresponds to the grant table region and indexes 1...N 
correspond to the extended regions.


>> +        res = rangeset_add_range(unalloc_mem, start, end);
>> +        if ( res )
>> +        {
>> +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>> +                   start, end);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* Remove RAM assigned to Dom0 */
>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>> +    {
>> +        start = assign_mem->bank[i].start;
>> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>> +        if ( res )
>> +        {
>> +            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>> +                   start, end);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* Remove reserved-memory regions */
>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>> +    {
>> +        start = bootinfo.reserved_mem.bank[i].start;
>> +        end = bootinfo.reserved_mem.bank[i].start +
>> +            bootinfo.reserved_mem.bank[i].size - 1;
>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>> +        if ( res )
>> +        {
>> +            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>> +                   start, end);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* Remove grant table region */
>> +    start = kinfo->gnttab_start;
>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>> +    if ( res )
>> +    {
>> +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>> +               start, end);
>> +        goto out;
>> +    }
>> +
>> +    start = EXT_REGION_START;
>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>> +                                 add_ext_regions, ext_regions);
>> +    if ( res )
>> +        ext_regions->nr_banks = 0;
>> +    else if ( !ext_regions->nr_banks )
>> +        res = -ENOENT;
>> +
>> +out:
>> +    rangeset_destroy(unalloc_mem);
>> +
>> +    return res;
>> +}
>> +
>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>> +                                    struct meminfo *ext_regions)
>> +{
>> +    struct dt_device_node *np;
>> +    struct rangeset *mem_holes;
>> +    paddr_t start, end;
>> +    unsigned int i;
>> +    int res;
>> +
>> +    dt_dprintk("Find memory holes for extended regions\n");
>> +
>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>> +    if ( !mem_holes )
>> +        return -ENOMEM;
>> +
>> +    /* Start with maximum possible addressable physical memory range */
>> +    start = EXT_REGION_START;
>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>> +    res = rangeset_add_range(mem_holes, start, end);
>> +    if ( res )
>> +    {
>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>> +               start, end);
>> +        goto out;
>> +    }
>> +
>> +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
>> +    dt_for_each_device_node( dt_host, np )
> Don't you need something like device_tree_for_each_node ?
> dt_for_each_device_node won't go down any deeper in the tree?

Thank you for pointing this out, I will investigate and update the patch.


>
> Alternatively, maybe we could simply record the highest possible address
> of any memory/device/anything as we scan the device tree with
> handle_node. Then we can use that as the starting point here.
I also don't like the idea of scanning the DT twice, but I failed to
find an effective way to avoid that.
Yes, we can record the highest possible address, but I am afraid I
didn't entirely get the suggestion. Is it to provide a single region
starting from the highest possible address + 1 and up to the
EXT_REGION_END, suitably aligned? Could you please clarify?


> So that we
> don't need to scan the device tree twice, and also we don't need my
> suggestion below to remove 1GB-aligned 1GB-multiple regions.
I provided some thoughts regarding this below.


>
>
>> +    {
>> +        unsigned int naddr;
>> +        u64 addr, size;
>> +
>> +        naddr = dt_number_of_address(np);
>> +
>> +        for ( i = 0; i < naddr; i++ )
>> +        {
>> +            res = dt_device_get_address(np, i, &addr, &size);
>> +            if ( res )
>> +            {
>> +                printk(XENLOG_ERR "Unable to retrieve address %u for %s\n",
>> +                       i, dt_node_full_name(np));
> It might be possible for a device not to have a range if it doesn't have
> any MMIO regions, right? For instance, certain ARM timer nodes. I would
> not print any errors and continue.
I thought that if a device didn't have a range, then this loop wouldn't
be executed at all, as dt_number_of_address would return 0.
I will double check.


>
>
>> +                goto out;
>> +            }
>> +
>> +            start = addr & PAGE_MASK;
>> +            end = PAGE_ALIGN(addr + size) - 1;
>> +            res = rangeset_remove_range(mem_holes, start, end);
>> +            if ( res )
>> +            {
>> +                printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>> +                       start, end);
>> +                goto out;
>> +            }
>> +        }
>> +    }
> As is, it will result in a myriad of small ranges which is unuseful and
> slow to parse. I suggest to simplify it by removing a larger region than
> strictly necessary. For instance, you could remove a 1GB-aligned and
> 1GB-multiple region for each range. That way, you are going to get fewer
> large free ranges instance of many small ones which we don't need.

I agree with you that a lot of small ranges increase the bookkeeping in 
Dom0 and there is also a theoretical (?) possibility
that small ranges occupy all space we provide for extended regions 
(NR_MEM_BANKS)...
But let's consider my setup as an example again, this time with the
IOMMU enabled for Dom0 ("holes found in DT").

--- The RAM configuration is the same:

(XEN) RAM: 0000000048000000 - 00000000bfffffff <--- RAM bank 0
(XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1
(XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2
(XEN) RAM: 0000000700000000 - 000000077fffffff <--- RAM bank 3

--- There are a lot of various platform devices with a "reg" property
described in the DT. I will not post all IO ranges here; suffice it to
say that almost all of them are mapped in 0xE0000000-0xFFFFFFFF.

--- As we only pick up ranges with size >= 2MB, the calculated extended 
regions are (based on 40-bit IPA):

(XEN) Extended region 0: 0x40000000->0x47e00000
(XEN) Extended region 1: 0xc0000000->0xe6000000
(XEN) Extended region 2: 0xe7000000->0xe7200000
(XEN) Extended region 3: 0xe7400000->0xe7600000
(XEN) Extended region 4: 0xe7800000->0xec000000
(XEN) Extended region 5: 0xec200000->0xec400000
(XEN) Extended region 6: 0xec800000->0xee000000
(XEN) Extended region 7: 0xee600000->0xee800000
(XEN) Extended region 8: 0xeea00000->0xf1000000
(XEN) Extended region 9: 0xf1200000->0xfd000000
(XEN) Extended region 10: 0xfd200000->0xfd800000
(XEN) Extended region 11: 0xfda00000->0xfe000000
(XEN) Extended region 12: 0xfe200000->0xfe600000
(XEN) Extended region 13: 0xfec00000->0xff800000
(XEN) Extended region 14: 0x100000000->0x500000000
(XEN) Extended region 15: 0x580000000->0x600000000
(XEN) Extended region 16: 0x680000000->0x700000000
(XEN) Extended region 17: 0x780000000->0x10000000000

So, if I *correctly* understood your idea about removing a 1GB-aligned,
1GB-multiple region for each range, we would get the following:

(XEN) Extended region 0: 0x100000000->0x500000000
(XEN) Extended region 1: 0x580000000->0x600000000
(XEN) Extended region 2: 0x680000000->0x700000000
(XEN) Extended region 3: 0x780000000->0x10000000000

As you can see there are no extended regions below 4GB at all. I assume
it would be good to provide them for the 1:1 mapped Dom0 (for 32-bit
DMA devices?).
What also worries me is that the IPA size could be 36 or even 32 bits,
so I am afraid we might even fail to find extended regions above 4GB.


I think, if 2MB is considered too small to bother with, we should
probably go with something in between (16MB, 32MB, 64MB).
For example, if we only take into account ranges with size >= 16MB:

(XEN) Extended region 0: 0x40000000->0x47e00000
(XEN) Extended region 1: 0xc0000000->0xe6000000
(XEN) Extended region 2: 0xe7800000->0xec000000
(XEN) Extended region 3: 0xec800000->0xee000000
(XEN) Extended region 4: 0xeea00000->0xf1000000
(XEN) Extended region 5: 0xf1200000->0xfd000000
(XEN) Extended region 6: 0x100000000->0x500000000
(XEN) Extended region 7: 0x580000000->0x600000000
(XEN) Extended region 8: 0x680000000->0x700000000
(XEN) Extended region 9: 0x780000000->0x10000000000

Any thoughts?
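
For illustration only, a standalone sketch of such a size filter
(MIN_EXT_REGION_SIZE is a made-up name, not something present in the
patch):

#include <stdint.h>
#include <stdio.h>

#define SZ_2M               0x200000ULL
/* Made-up threshold, only for illustration. */
#define MIN_EXT_REGION_SIZE (16 * 1024 * 1024ULL)

/* Mimics the check that could be added to add_ext_regions(). */
static int keep_region(uint64_t s, uint64_t e)
{
    uint64_t start = (s + SZ_2M - 1) & ~(SZ_2M - 1);

    if ( start > e )
        return 0;

    return ((e - start + 1) & ~(SZ_2M - 1)) >= MIN_EXT_REGION_SIZE;
}

int main(void)
{
    /* Two of the holes from the log above (inclusive ranges). */
    printf("%d\n", keep_region(0xe7000000ULL, 0xe71fffffULL)); /* 2MB   -> 0 */
    printf("%d\n", keep_region(0xc0000000ULL, 0xe5ffffffULL)); /* 608MB -> 1 */
    return 0;
}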


>
>> +    start = EXT_REGION_START;
>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>> +    res = rangeset_report_ranges(mem_holes, start, end,
>> +                                 add_ext_regions,  ext_regions);
>> +    if ( res )
>> +        ext_regions->nr_banks = 0;
>> +    else if ( !ext_regions->nr_banks )
>> +        res = -ENOENT;
>> +
>> +out:
>> +    rangeset_destroy(mem_holes);
>> +
>> +    return res;
>> +}
>> +
>>   static int __init make_hypervisor_node(struct domain *d,
>>                                          const struct kernel_info *kinfo,
>>                                          int addrcells, int sizecells)
>> @@ -731,11 +921,13 @@ static int __init make_hypervisor_node(struct domain *d,
>>       const char compat[] =
>>           "xen,xen-"__stringify(XEN_VERSION)"."__stringify(XEN_SUBVERSION)"\0"
>>           "xen,xen";
>> -    __be32 reg[4];
>> +    __be32 reg[(NR_MEM_BANKS + 1) * 4];
> If you are xzalloc'ing struct meminfo then you might as well xzalloc reg
> too. Or keep both on the stack with a lower NR_MEM_BANKS.
sounds reasonable, I will probably xzalloc reg as well.

>
>
>>       gic_interrupt_t intr;
>>       __be32 *cells;
>>       int res;
>>       void *fdt = kinfo->fdt;
>> +    struct meminfo *ext_regions;
>> +    unsigned int i;
>>   
>>       dt_dprintk("Create hypervisor node\n");
>>   
>> @@ -757,12 +949,42 @@ static int __init make_hypervisor_node(struct domain *d,
>>       if ( res )
>>           return res;
>>   
>> +    ext_regions = xzalloc(struct meminfo);
>> +    if ( !ext_regions )
>> +        return -ENOMEM;
>> +
>> +    if ( is_32bit_domain(d) )
>> +        printk(XENLOG_WARNING "The extended region is only supported for 64-bit guest\n");
> This is going to add an unconditional warning to all 32bit boots. I
> would skip it entirely or only keep it as XENLOG_DEBUG.

agree, I will probably convert to XENLOG_DEBUG.


>
>
>> +    else
>> +    {
>> +        if ( !is_iommu_enabled(d) )
>> +            res = find_unallocated_memory(kinfo, ext_regions);
>> +        else
>> +            res = find_memory_holes(kinfo, ext_regions);
>> +
>> +        if ( res )
>> +            printk(XENLOG_WARNING "Failed to allocate extended regions\n");
>> +    }
>> +
>>       /* reg 0 is grant table space */
>>       cells = &reg[0];
>>       dt_child_set_range(&cells, addrcells, sizecells,
>>                          kinfo->gnttab_start, kinfo->gnttab_size);
>> +    /* reg 1...N are extended regions */
>> +    for ( i = 0; i < ext_regions->nr_banks; i++ )
>> +    {
>> +        u64 start = ext_regions->bank[i].start;
>> +        u64 size = ext_regions->bank[i].size;
>> +
>> +        dt_dprintk("Extended region %d: %#"PRIx64"->%#"PRIx64"\n",
>> +                   i, start, start + size);
>> +
>> +        dt_child_set_range(&cells, addrcells, sizecells, start, size);
>> +    }
>> +    xfree(ext_regions);
>> +
>>       res = fdt_property(fdt, "reg", reg,
>> -                       dt_cells_to_size(addrcells + sizecells));
>> +                       dt_cells_to_size(addrcells + sizecells) * (i + 1));
>>       if ( res )
>>           return res;
>>   
>> -- 
>> 2.7.4
>>

Thank you.
Stefano Stabellini Sept. 15, 2021, 9:21 p.m. UTC | #3
On Wed, 15 Sep 2021, Oleksandr wrote:
> > On Fri, 10 Sep 2021, Oleksandr Tyshchenko wrote:
> > > From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > 
> > > The extended region (safe range) is a region of guest physical
> > > address space which is unused and could be safely used to create
> > > grant/foreign mappings instead of wasting real RAM pages from
> > > the domain memory for establishing these mappings.
> > > 
> > > The extended regions are chosen at the domain creation time and
> > > advertised to it via "reg" property under hypervisor node in
> > > the guest device-tree. As region 0 is reserved for grant table
> > > space (always present), the indexes for extended regions are 1...N.
> > > If extended regions could not be allocated for some reason,
> > > Xen doesn't fail and behaves as usual, so only inserts region 0.
> > > 
> > > Please note the following limitations:
> > > - The extended region feature is only supported for 64-bit domain.
> > > - The ACPI case is not covered.
> > > 
> > > ***
> > > 
> > > As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
> > > the algorithm to choose extended regions for it is different
> > > in comparison with the algorithm for non-direct mapped DomU.
> > > What is more, that extended regions should be chosen differently
> > > whether IOMMU is enabled or not.
> > > 
> > > Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
> > > holes found in host device-tree if otherwise. Make sure that
> > > extended regions are 2MB-aligned and located within maximum possible
> > > addressable physical memory range. The maximum number of extended
> > > regions is 128.
> > > 
> > > Suggested-by: Julien Grall <jgrall@amazon.com>
> > > Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > ---
> > > Changes since RFC:
> > >     - update patch description
> > >     - drop uneeded "extended-region" DT property
> > > ---
> > > 
> > >   xen/arch/arm/domain_build.c | 226
> > > +++++++++++++++++++++++++++++++++++++++++++-
> > >   1 file changed, 224 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> > > index 206038d..070ec27 100644
> > > --- a/xen/arch/arm/domain_build.c
> > > +++ b/xen/arch/arm/domain_build.c
> > > @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct
> > > domain *d,
> > >       return res;
> > >   }
> > >   +static int __init add_ext_regions(unsigned long s, unsigned long e,
> > > void *data)
> > > +{
> > > +    struct meminfo *ext_regions = data;
> > > +    paddr_t start, size;
> > > +
> > > +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
> > > +        return 0;
> > > +
> > > +    /* Both start and size of the extended region should be 2MB aligned
> > > */
> > > +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
> > > +    if ( start > e )
> > > +        return 0;
> > > +
> > > +    size = (e - start + 1) & ~(SZ_2M - 1);
> > > +    if ( !size )
> > > +        return 0;
> > Can't you align size as well?
> > 
> >    size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
> 
> I am sorry, I don't entirely get what you really meant here. We get both start
> and size 2MB-aligned by the calculations above
> (when calculating an alignment, we need to make sure that "start_passed <=
> start_aligned && size_aligned <= size_passed").
> If I add the proposing string after, I will reduce the already aligned size by
> 2MB.
> If I replace the size calculation with the following, I will get the reduced
> size even if the passed region is initially 2MB-aligned, so doesn't need to be
> adjusted.
> size = e - s + 1;
> size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);

Sorry I misread your original code, I think it was working as intended
except for the "+1". I think it should be:

  size = (e - start) & ~(SZ_2M - 1);



> > > + */
> > > +#define EXT_REGION_START   0x40000000ULL
> > > +#define EXT_REGION_END     0x80003fffffffULL
> > > +
> > > +static int __init find_unallocated_memory(const struct kernel_info
> > > *kinfo,
> > > +                                          struct meminfo *ext_regions)
> > > +{
> > > +    const struct meminfo *assign_mem = &kinfo->mem;
> > > +    struct rangeset *unalloc_mem;
> > > +    paddr_t start, end;
> > > +    unsigned int i;
> > > +    int res;
> > > +
> > > +    dt_dprintk("Find unallocated memory for extended regions\n");
> > > +
> > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > +    if ( !unalloc_mem )
> > > +        return -ENOMEM;
> > > +
> > > +    /* Start with all available RAM */
> > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > +    {
> > > +        start = bootinfo.mem.bank[i].start;
> > > +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
> > Is the -1 needed? Isn't it going to screw up the size calculation later?
> I thought, it was needed. The calculation seems correct.

I think that normally for an example MMIO region:

start = 0x48000000
size  = 0x40000000
end   = 0x88000000

So end = start + size and points to the first address out of the range.
In other words, 0x88000000 doesn't actually belong to the MMIO region in
the example.

But here you are passing addresses to rangeset_add_range and other
rangeset functions and I think rangeset takes *inclusive* addresses as
input. So you need to pass start and end-1 because end-1 is the last
address of the MMIO region.

In fact you can see for instance in map_range_to_domain:

        res = iomem_permit_access(d, paddr_to_pfn(addr),
                paddr_to_pfn(PAGE_ALIGN(addr + len - 1)));

Where iomem_permit_access is based on rangeset. So for clarity, I would
do:

start = assign_mem->bank[i].start;
end = assign_mem->bank[i].start + assign_mem->bank[i].size;
res = rangeset_remove_range(unalloc_mem, start, end - 1);

So that we don't get confused on the meaning of "end" which everywhere
else means the first address not in range.
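
For clarity, a standalone sketch of the two conventions side by side
(the bank values are taken from the log earlier in the thread):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* A RAM bank as found in bootinfo: start + size, end is exclusive. */
    uint64_t start = 0x48000000ULL, size = 0x78000000ULL;
    uint64_t end   = start + size;      /* first address NOT in the bank */

    /* rangeset_*() take inclusive bounds, so pass end - 1. */
    printf("bank:     %#llx -> %#llx (exclusive end)\n",
           (unsigned long long)start, (unsigned long long)end);
    printf("rangeset: %#llx -> %#llx (inclusive end)\n",
           (unsigned long long)start, (unsigned long long)(end - 1));
    return 0;
}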


> > > +        res = rangeset_add_range(unalloc_mem, start, end);
> > > +        if ( res )
> > > +        {
> > > +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > +                   start, end);
> > > +            goto out;
> > > +        }
> > > +    }
> > > +
> > > +    /* Remove RAM assigned to Dom0 */
> > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > +    {
> > > +        start = assign_mem->bank[i].start;
> > > +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
> > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > +        if ( res )
> > > +        {
> > > +            printk(XENLOG_ERR "Failed to remove:
> > > %#"PRIx64"->%#"PRIx64"\n",
> > > +                   start, end);
> > > +            goto out;
> > > +        }
> > > +    }
> > > +
> > > +    /* Remove reserved-memory regions */
> > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> > > +    {
> > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > +        if ( res )
> > > +        {
> > > +            printk(XENLOG_ERR "Failed to remove:
> > > %#"PRIx64"->%#"PRIx64"\n",
> > > +                   start, end);
> > > +            goto out;
> > > +        }
> > > +    }
> > > +
> > > +    /* Remove grant table region */
> > > +    start = kinfo->gnttab_start;
> > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > +    if ( res )
> > > +    {
> > > +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> > > +               start, end);
> > > +        goto out;
> > > +    }
> > > +
> > > +    start = EXT_REGION_START;
> > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > +                                 add_ext_regions, ext_regions);
> > > +    if ( res )
> > > +        ext_regions->nr_banks = 0;
> > > +    else if ( !ext_regions->nr_banks )
> > > +        res = -ENOENT;
> > > +
> > > +out:
> > > +    rangeset_destroy(unalloc_mem);
> > > +
> > > +    return res;
> > > +}
> > > +
> > > +static int __init find_memory_holes(const struct kernel_info *kinfo,
> > > +                                    struct meminfo *ext_regions)
> > > +{
> > > +    struct dt_device_node *np;
> > > +    struct rangeset *mem_holes;
> > > +    paddr_t start, end;
> > > +    unsigned int i;
> > > +    int res;
> > > +
> > > +    dt_dprintk("Find memory holes for extended regions\n");
> > > +
> > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > +    if ( !mem_holes )
> > > +        return -ENOMEM;
> > > +
> > > +    /* Start with maximum possible addressable physical memory range */
> > > +    start = EXT_REGION_START;
> > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > +    res = rangeset_add_range(mem_holes, start, end);
> > > +    if ( res )
> > > +    {
> > > +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > +               start, end);
> > > +        goto out;
> > > +    }
> > > +
> > > +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
> > > +    dt_for_each_device_node( dt_host, np )
> > Don't you need something like device_tree_for_each_node ?
> > dt_for_each_device_node won't go down any deeper in the tree?
> 
> Thank you for pointing this out, I will investigate and update the patch.
> 
> 
> > 
> > Alternatively, maybe we could simply record the highest possible address
> > of any memory/device/anything as we scan the device tree with
> > handle_node. Then we can use that as the starting point here.
> I also don't like the idea to scan the DT much, but I failed to find an
> effective solution how to avoid that.
> Yes, we can record the highest possible address, but I am afraid, I didn't
> entirely get a suggestion. Is the suggestion to provide a single region
> starting from highest possible address + 1 and up to the EXT_REGION_END
> suitably aligned? Could you please clarify?

Yes, that is what I was suggesting as a possible alternative: start from
the highest possible address in DT + 1 and up to the EXT_REGION_END
suitably aligned. But that wouldn't solve the <4GB issue.
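
A minimal standalone sketch of that alternative, assuming a
hypothetical dt_highest_addr recorded while scanning the DT and a
40-bit IPA size:

#include <stdint.h>
#include <stdio.h>

#define SZ_2M            0x200000ULL
#define EXT_REGION_END   0x80003fffffffULL

int main(void)
{
    /* Hypothetical highest address seen while scanning the host DT. */
    uint64_t dt_highest_addr = 0x77fffffffULL;
    unsigned int p2m_ipa_bits = 40;             /* example IPA size */

    uint64_t start = (dt_highest_addr + 1 + SZ_2M - 1) & ~(SZ_2M - 1);
    uint64_t end = (1ULL << p2m_ipa_bits) - 1;

    if ( end > EXT_REGION_END )
        end = EXT_REGION_END;

    /* Prints 0x780000000 -> 0xffffffffff for this example. */
    if ( start <= end )
        printf("single extended region: %#llx -> %#llx\n",
               (unsigned long long)start, (unsigned long long)end);
    return 0;
}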

 
> > > +                goto out;
> > > +            }
> > > +
> > > +            start = addr & PAGE_MASK;
> > > +            end = PAGE_ALIGN(addr + size) - 1;
> > > +            res = rangeset_remove_range(mem_holes, start, end);
> > > +            if ( res )
> > > +            {
> > > +                printk(XENLOG_ERR "Failed to remove:
> > > %#"PRIx64"->%#"PRIx64"\n",
> > > +                       start, end);
> > > +                goto out;
> > > +            }
> > > +        }
> > > +    }
> > As is, it will result in a myriad of small ranges which is unuseful and
> > slow to parse. I suggest to simplify it by removing a larger region than
> > strictly necessary. For instance, you could remove a 1GB-aligned and
> > 1GB-multiple region for each range. That way, you are going to get fewer
> > large free ranges instance of many small ones which we don't need.
> 
> I agree with you that a lot of small ranges increase the bookkeeping in Dom0
> and there is also a theoretical (?) possibility
> that small ranges occupy all space we provide for extended regions
> (NR_MEM_BANKS)...
> But, let's consider my setup as an example again, but when the IOMMU is
> enabled for Dom0 ("holes found in DT").
> 
> --- The RAM configuration is the same:
> 
> (XEN) RAM: 0000000048000000 - 00000000bfffffff <--- RAM bank 0
> (XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1
> (XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2
> (XEN) RAM: 0000000700000000 - 000000077fffffff <--- RAM bank 3
> 
> --- There are a lot of various platform devices with reg property described in
> DT, I will probably not post all IO ranges here, just say that mostly all of
> them to be mapped at 0xE0000000-0xFFFFFFFF.
> 
> --- As we only pick up ranges with size >= 2MB, the calculated extended
> regions are (based on 40-bit IPA):
> 
> (XEN) Extended region 0: 0x40000000->0x47e00000
> (XEN) Extended region 1: 0xc0000000->0xe6000000
> (XEN) Extended region 2: 0xe7000000->0xe7200000
> (XEN) Extended region 3: 0xe7400000->0xe7600000
> (XEN) Extended region 4: 0xe7800000->0xec000000
> (XEN) Extended region 5: 0xec200000->0xec400000
> (XEN) Extended region 6: 0xec800000->0xee000000
> (XEN) Extended region 7: 0xee600000->0xee800000
> (XEN) Extended region 8: 0xeea00000->0xf1000000
> (XEN) Extended region 9: 0xf1200000->0xfd000000
> (XEN) Extended region 10: 0xfd200000->0xfd800000
> (XEN) Extended region 11: 0xfda00000->0xfe000000
> (XEN) Extended region 12: 0xfe200000->0xfe600000
> (XEN) Extended region 13: 0xfec00000->0xff800000
> (XEN) Extended region 14: 0x100000000->0x500000000
> (XEN) Extended region 15: 0x580000000->0x600000000
> (XEN) Extended region 16: 0x680000000->0x700000000
> (XEN) Extended region 17: 0x780000000->0x10000000000
> 
> So, if I *correctly* understood your idea about removing 1GB-aligned
> 1GB-multiple region for each range we would get the following:
> 
> (XEN) Extended region 0: 0x100000000->0x500000000
> (XEN) Extended region 1: 0x580000000->0x600000000
> (XEN) Extended region 2: 0x680000000->0x700000000
> (XEN) Extended region 3: 0x780000000->0x10000000000
> 
> As you can see there are no extended regions below 4GB at all. I assume, it
> would be good to provide them for 1:1 mapped Dom0 (for 32-bit DMA devices?)
> What else worries me is that IPA size could be 36 or even 32. So, I am afraid,
> we might even fail to find extended regions above 4GB.
> 
> 
> I think, if 2MB is considered small enough to bother with, probably we should
> go with something in between (16MB, 32MB, 64MB).
> For example, we can take into the account ranges with size >= 16MB:
> 
> (XEN) Extended region 0: 0x40000000->0x47e00000
> (XEN) Extended region 1: 0xc0000000->0xe6000000
> (XEN) Extended region 2: 0xe7800000->0xec000000
> (XEN) Extended region 3: 0xec800000->0xee000000
> (XEN) Extended region 4: 0xeea00000->0xf1000000
> (XEN) Extended region 5: 0xf1200000->0xfd000000
> (XEN) Extended region 6: 0x100000000->0x500000000
> (XEN) Extended region 7: 0x580000000->0x600000000
> (XEN) Extended region 8: 0x680000000->0x700000000
> (XEN) Extended region 9: 0x780000000->0x10000000000
> 
> Any thoughts?

Yeah maybe an intermediate value would be best. I'd go with 64MB.
Oleksandr Tyshchenko Sept. 16, 2021, 8:57 p.m. UTC | #4
On 16.09.21 00:21, Stefano Stabellini wrote:

Hi Stefano

> On Wed, 15 Sep 2021, Oleksandr wrote:
>>> On Fri, 10 Sep 2021, Oleksandr Tyshchenko wrote:
>>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>
>>>> The extended region (safe range) is a region of guest physical
>>>> address space which is unused and could be safely used to create
>>>> grant/foreign mappings instead of wasting real RAM pages from
>>>> the domain memory for establishing these mappings.
>>>>
>>>> The extended regions are chosen at the domain creation time and
>>>> advertised to it via "reg" property under hypervisor node in
>>>> the guest device-tree. As region 0 is reserved for grant table
>>>> space (always present), the indexes for extended regions are 1...N.
>>>> If extended regions could not be allocated for some reason,
>>>> Xen doesn't fail and behaves as usual, so only inserts region 0.
>>>>
>>>> Please note the following limitations:
>>>> - The extended region feature is only supported for 64-bit domain.
>>>> - The ACPI case is not covered.
>>>>
>>>> ***
>>>>
>>>> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
>>>> the algorithm to choose extended regions for it is different
>>>> in comparison with the algorithm for non-direct mapped DomU.
>>>> What is more, that extended regions should be chosen differently
>>>> whether IOMMU is enabled or not.
>>>>
>>>> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
>>>> holes found in host device-tree if otherwise. Make sure that
>>>> extended regions are 2MB-aligned and located within maximum possible
>>>> addressable physical memory range. The maximum number of extended
>>>> regions is 128.
>>>>
>>>> Suggested-by: Julien Grall <jgrall@amazon.com>
>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>> ---
>>>> Changes since RFC:
>>>>      - update patch description
>>>>      - drop uneeded "extended-region" DT property
>>>> ---
>>>>
>>>>    xen/arch/arm/domain_build.c | 226
>>>> +++++++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 224 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>>>> index 206038d..070ec27 100644
>>>> --- a/xen/arch/arm/domain_build.c
>>>> +++ b/xen/arch/arm/domain_build.c
>>>> @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct
>>>> domain *d,
>>>>        return res;
>>>>    }
>>>>    +static int __init add_ext_regions(unsigned long s, unsigned long e,
>>>> void *data)
>>>> +{
>>>> +    struct meminfo *ext_regions = data;
>>>> +    paddr_t start, size;
>>>> +
>>>> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
>>>> +        return 0;
>>>> +
>>>> +    /* Both start and size of the extended region should be 2MB aligned
>>>> */
>>>> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
>>>> +    if ( start > e )
>>>> +        return 0;
>>>> +
>>>> +    size = (e - start + 1) & ~(SZ_2M - 1);
>>>> +    if ( !size )
>>>> +        return 0;
>>> Can't you align size as well?
>>>
>>>     size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
>> I am sorry, I don't entirely get what you really meant here. We get both start
>> and size 2MB-aligned by the calculations above
>> (when calculating an alignment, we need to make sure that "start_passed <=
>> start_aligned && size_aligned <= size_passed").
>> If I add the proposing string after, I will reduce the already aligned size by
>> 2MB.
>> If I replace the size calculation with the following, I will get the reduced
>> size even if the passed region is initially 2MB-aligned, so doesn't need to be
>> adjusted.
>> size = e - s + 1;
>> size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
> Sorry I misread your original code, I think it was working as intended
> except for the "+1". I think it should be:
>
>    size = (e - start) & ~(SZ_2M - 1);
But why without "+1"? Isn't "e" here the *last address* of the passed
range? Without "+1" the calculation isn't entirely correct: the last
valid 2MB chunk is missed.

[snip]
(XEN) Extended region 14: 0x580000000->0x5ffe00000
(XEN) Extended region 15: 0x680000000->0x6ffe00000
(XEN) Extended region 16: 0x780000000->0xffffe00000

But should get:

[snip]
(XEN) Extended region 15: 0x580000000->0x600000000
(XEN) Extended region 16: 0x680000000->0x700000000
(XEN) Extended region 17: 0x780000000->0x10000000000

Let's consider how a hole between (for example) RAM bank 1 and bank 2 is 
calculated:
(XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1 with size 
0x80000000
(XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2 with size 
0x80000000
So the hole size should also be 0x80000000.
If we pass these RAM banks to rangeset_remove_range() one by one:
1. s = 0x500000000 e = 0x57FFFFFFF
2. s = 0x600000000 e = 0x67FFFFFFF
we get s = 0x580000000 e = 0x5FFFFFFFF in add_ext_regions(), where "e"
is the last address of the hole (not the first address out of the hole),
so I think that for a proper size calculation we need to add 1 to
"e - s". Or have I really missed something?


>
>>>> + */
>>>> +#define EXT_REGION_START   0x40000000ULL
>>>> +#define EXT_REGION_END     0x80003fffffffULL
>>>> +
>>>> +static int __init find_unallocated_memory(const struct kernel_info
>>>> *kinfo,
>>>> +                                          struct meminfo *ext_regions)
>>>> +{
>>>> +    const struct meminfo *assign_mem = &kinfo->mem;
>>>> +    struct rangeset *unalloc_mem;
>>>> +    paddr_t start, end;
>>>> +    unsigned int i;
>>>> +    int res;
>>>> +
>>>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>>>> +
>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>> +    if ( !unalloc_mem )
>>>> +        return -ENOMEM;
>>>> +
>>>> +    /* Start with all available RAM */
>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>> +    {
>>>> +        start = bootinfo.mem.bank[i].start;
>>>> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
>>> Is the -1 needed? Isn't it going to screw up the size calculation later?
>> I thought, it was needed. The calculation seems correct.
> I think that normally for an example MMIO region:
>
> start = 0x48000000
> size  = 0x40000000
> end   = 0x88000000
>
> So end = start + size and points to the first address out of the range.
> In other words, 0x88000000 doesn't actually belong to the MMIO region in
> the example.
>
> But here you are passing addresses to rangeset_add_range and other
> rangeset functions and I think rangeset takes *inclusive* addresses as
> input. So you need to pass start and end-1 because end-1 is the last
> address of the MMIO region.
>
> In fact you can see for instance in map_range_to_domain:
>
>          res = iomem_permit_access(d, paddr_to_pfn(addr),
>                  paddr_to_pfn(PAGE_ALIGN(addr + len - 1)));
>
> Where iomem_permit_access is based on rangeset. So for clarity, I would
> do:
>
> start = assign_mem->bank[i].start;
> end = assign_mem->bank[i].start + assign_mem->bank[i].size;
> res = rangeset_remove_range(unalloc_mem, start, end - 1);
>
> So that we don't get confused on the meaning of "end" which everywhere
> else means the first address not in range.

I got your point, I will update the code, as it is much cleaner that way.


>>>> +        res = rangeset_add_range(unalloc_mem, start, end);
>>>> +        if ( res )
>>>> +        {
>>>> +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>> +                   start, end);
>>>> +            goto out;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Remove RAM assigned to Dom0 */
>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>> +    {
>>>> +        start = assign_mem->bank[i].start;
>>>> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>> +        if ( res )
>>>> +        {
>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>> +                   start, end);
>>>> +            goto out;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Remove reserved-memory regions */
>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>>>> +    {
>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>> +        if ( res )
>>>> +        {
>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>> +                   start, end);
>>>> +            goto out;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Remove grant table region */
>>>> +    start = kinfo->gnttab_start;
>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>> +    if ( res )
>>>> +    {
>>>> +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>>>> +               start, end);
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    start = EXT_REGION_START;
>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>> +                                 add_ext_regions, ext_regions);
>>>> +    if ( res )
>>>> +        ext_regions->nr_banks = 0;
>>>> +    else if ( !ext_regions->nr_banks )
>>>> +        res = -ENOENT;
>>>> +
>>>> +out:
>>>> +    rangeset_destroy(unalloc_mem);
>>>> +
>>>> +    return res;
>>>> +}
>>>> +
>>>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>>>> +                                    struct meminfo *ext_regions)
>>>> +{
>>>> +    struct dt_device_node *np;
>>>> +    struct rangeset *mem_holes;
>>>> +    paddr_t start, end;
>>>> +    unsigned int i;
>>>> +    int res;
>>>> +
>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>> +
>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>> +    if ( !mem_holes )
>>>> +        return -ENOMEM;
>>>> +
>>>> +    /* Start with maximum possible addressable physical memory range */
>>>> +    start = EXT_REGION_START;
>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>> +    if ( res )
>>>> +    {
>>>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>> +               start, end);
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
>>>> +    dt_for_each_device_node( dt_host, np )
>>> Don't you need something like device_tree_for_each_node ?
>>> dt_for_each_device_node won't go down any deeper in the tree?
>> Thank you for pointing this out, I will investigate and update the patch.
>>
>>
>>> Alternatively, maybe we could simply record the highest possible address
>>> of any memory/device/anything as we scan the device tree with
>>> handle_node. Then we can use that as the starting point here.
>> I also don't like the idea to scan the DT much, but I failed to find an
>> effective solution how to avoid that.
>> Yes, we can record the highest possible address, but I am afraid, I didn't
>> entirely get a suggestion. Is the suggestion to provide a single region
>> starting from highest possible address + 1 and up to the EXT_REGION_END
>> suitably aligned? Could you please clarify?
> Yes, that is what I was suggesting as a possible alternative: start from
> the highest possible address in DT + 1 and up to the EXT_REGION_END
> suitably aligned. But that wouldn't solve the <4GB issue.
>
>>>> +                goto out;
>>>> +            }
>>>> +
>>>> +            start = addr & PAGE_MASK;
>>>> +            end = PAGE_ALIGN(addr + size) - 1;
>>>> +            res = rangeset_remove_range(mem_holes, start, end);
>>>> +            if ( res )
>>>> +            {
>>>> +                printk(XENLOG_ERR "Failed to remove:
>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>> +                       start, end);
>>>> +                goto out;
>>>> +            }
>>>> +        }
>>>> +    }
>>> As is, it will result in a myriad of small ranges which is unuseful and
>>> slow to parse. I suggest to simplify it by removing a larger region than
>>> strictly necessary. For instance, you could remove a 1GB-aligned and
>>> 1GB-multiple region for each range. That way, you are going to get fewer
>>> large free ranges instance of many small ones which we don't need.
>> I agree with you that a lot of small ranges increase the bookkeeping in Dom0
>> and there is also a theoretical (?) possibility
>> that small ranges occupy all space we provide for extended regions
>> (NR_MEM_BANKS)...
>> But, let's consider my setup as an example again, but when the IOMMU is
>> enabled for Dom0 ("holes found in DT").
>>
>> --- The RAM configuration is the same:
>>
>> (XEN) RAM: 0000000048000000 - 00000000bfffffff <--- RAM bank 0
>> (XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1
>> (XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2
>> (XEN) RAM: 0000000700000000 - 000000077fffffff <--- RAM bank 3
>>
>> --- There are a lot of various platform devices with reg property described in
>> DT, I will probably not post all IO ranges here, just say that mostly all of
>> them to be mapped at 0xE0000000-0xFFFFFFFF.
>>
>> --- As we only pick up ranges with size >= 2MB, the calculated extended
>> regions are (based on 40-bit IPA):
>>
>> (XEN) Extended region 0: 0x40000000->0x47e00000
>> (XEN) Extended region 1: 0xc0000000->0xe6000000
>> (XEN) Extended region 2: 0xe7000000->0xe7200000
>> (XEN) Extended region 3: 0xe7400000->0xe7600000
>> (XEN) Extended region 4: 0xe7800000->0xec000000
>> (XEN) Extended region 5: 0xec200000->0xec400000
>> (XEN) Extended region 6: 0xec800000->0xee000000
>> (XEN) Extended region 7: 0xee600000->0xee800000
>> (XEN) Extended region 8: 0xeea00000->0xf1000000
>> (XEN) Extended region 9: 0xf1200000->0xfd000000
>> (XEN) Extended region 10: 0xfd200000->0xfd800000
>> (XEN) Extended region 11: 0xfda00000->0xfe000000
>> (XEN) Extended region 12: 0xfe200000->0xfe600000
>> (XEN) Extended region 13: 0xfec00000->0xff800000
>> (XEN) Extended region 14: 0x100000000->0x500000000
>> (XEN) Extended region 15: 0x580000000->0x600000000
>> (XEN) Extended region 16: 0x680000000->0x700000000
>> (XEN) Extended region 17: 0x780000000->0x10000000000
>>
>> So, if I *correctly* understood your idea about removing 1GB-aligned
>> 1GB-multiple region for each range we would get the following:
>>
>> (XEN) Extended region 0: 0x100000000->0x500000000
>> (XEN) Extended region 1: 0x580000000->0x600000000
>> (XEN) Extended region 2: 0x680000000->0x700000000
>> (XEN) Extended region 3: 0x780000000->0x10000000000
>>
>> As you can see there are no extended regions below 4GB at all. I assume, it
>> would be good to provide them for 1:1 mapped Dom0 (for 32-bit DMA devices?)
>> What else worries me is that IPA size could be 36 or even 32. So, I am afraid,
>> we might even fail to find extended regions above 4GB.
>>
>>
>> I think, if 2MB is considered small enough to bother with, probably we should
>> go with something in between (16MB, 32MB, 64MB).
>> For example, we can take into the account ranges with size >= 16MB:
>>
>> (XEN) Extended region 0: 0x40000000->0x47e00000
>> (XEN) Extended region 1: 0xc0000000->0xe6000000
>> (XEN) Extended region 2: 0xe7800000->0xec000000
>> (XEN) Extended region 3: 0xec800000->0xee000000
>> (XEN) Extended region 4: 0xeea00000->0xf1000000
>> (XEN) Extended region 5: 0xf1200000->0xfd000000
>> (XEN) Extended region 6: 0x100000000->0x500000000
>> (XEN) Extended region 7: 0x580000000->0x600000000
>> (XEN) Extended region 8: 0x680000000->0x700000000
>> (XEN) Extended region 9: 0x780000000->0x10000000000
>>
>> Any thoughts?
> Yeah maybe an intermediate value would be best. I'd go with 64MB.

I completely agree.

So this is what I got on my setup with that value:

1. IOMMU is enabled for Dom0:

(XEN) Extended region 0: 0x40000000->0x47e00000
(XEN) Extended region 1: 0xc0000000->0xe6000000
(XEN) Extended region 2: 0xe7800000->0xec000000
(XEN) Extended region 3: 0xf1200000->0xfd000000
(XEN) Extended region 4: 0x100000000->0x500000000
(XEN) Extended region 5: 0x580000000->0x600000000
(XEN) Extended region 6: 0x680000000->0x700000000
(XEN) Extended region 7: 0x780000000->0x10000000000

2. IOMMU is disabled for Dom0:

(XEN) Extended region 0: 0x48000000->0x54000000
(XEN) Extended region 1: 0x57000000->0x60000000
(XEN) Extended region 2: 0x70000000->0x78000000
(XEN) Extended region 3: 0x78200000->0xc0000000
(XEN) Extended region 4: 0x500000000->0x580000000
(XEN) Extended region 5: 0x600000000->0x680000000
(XEN) Extended region 6: 0x700000000->0x780000000

Which is not bad.
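
For illustration, the 64MB filtering behind the numbers above could be a
minimal check in add_ext_regions() along these lines (only a sketch of
the idea, assuming the MB() helper; the actual change may differ):

static int __init add_ext_regions(unsigned long s, unsigned long e, void *data)
{
    struct meminfo *ext_regions = data;
    paddr_t start, size;

    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
        return 0;

    /* Both start and size of the extended region should be 2MB aligned */
    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
    if ( start > e )
        return 0;

    /* "e" is the last address of the range, hence the "+ 1" */
    size = (e - start + 1) & ~(SZ_2M - 1);

    /* Skip regions smaller than the 64MB threshold discussed above */
    if ( size < MB(64) )
        return 0;

    ext_regions->bank[ext_regions->nr_banks].start = start;
    ext_regions->bank[ext_regions->nr_banks].size = size;
    ext_regions->nr_banks++;

    return 0;
}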

Thank you.
Stefano Stabellini Sept. 16, 2021, 9:30 p.m. UTC | #5
On Thu, 16 Sep 2021, Oleksandr wrote:
> > On Wed, 15 Sep 2021, Oleksandr wrote:
> > > > On Fri, 10 Sep 2021, Oleksandr Tyshchenko wrote:
> > > > > From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > > > 
> > > > > The extended region (safe range) is a region of guest physical
> > > > > address space which is unused and could be safely used to create
> > > > > grant/foreign mappings instead of wasting real RAM pages from
> > > > > the domain memory for establishing these mappings.
> > > > > 
> > > > > The extended regions are chosen at the domain creation time and
> > > > > advertised to it via "reg" property under hypervisor node in
> > > > > the guest device-tree. As region 0 is reserved for grant table
> > > > > space (always present), the indexes for extended regions are 1...N.
> > > > > If extended regions could not be allocated for some reason,
> > > > > Xen doesn't fail and behaves as usual, so only inserts region 0.
> > > > > 
> > > > > Please note the following limitations:
> > > > > - The extended region feature is only supported for 64-bit domain.
> > > > > - The ACPI case is not covered.
> > > > > 
> > > > > ***
> > > > > 
> > > > > As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
> > > > > the algorithm to choose extended regions for it is different
> > > > > in comparison with the algorithm for non-direct mapped DomU.
> > > > > What is more, that extended regions should be chosen differently
> > > > > whether IOMMU is enabled or not.
> > > > > 
> > > > > Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
> > > > > holes found in host device-tree if otherwise. Make sure that
> > > > > extended regions are 2MB-aligned and located within maximum possible
> > > > > addressable physical memory range. The maximum number of extended
> > > > > regions is 128.
> > > > > 
> > > > > Suggested-by: Julien Grall <jgrall@amazon.com>
> > > > > Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > > > ---
> > > > > Changes since RFC:
> > > > >      - update patch description
> > > > >      - drop uneeded "extended-region" DT property
> > > > > ---
> > > > > 
> > > > >    xen/arch/arm/domain_build.c | 226
> > > > > +++++++++++++++++++++++++++++++++++++++++++-
> > > > >    1 file changed, 224 insertions(+), 2 deletions(-)
> > > > > 
> > > > > diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> > > > > index 206038d..070ec27 100644
> > > > > --- a/xen/arch/arm/domain_build.c
> > > > > +++ b/xen/arch/arm/domain_build.c
> > > > > @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct
> > > > > domain *d,
> > > > >        return res;
> > > > >    }
> > > > >    +static int __init add_ext_regions(unsigned long s, unsigned long
> > > > > e,
> > > > > void *data)
> > > > > +{
> > > > > +    struct meminfo *ext_regions = data;
> > > > > +    paddr_t start, size;
> > > > > +
> > > > > +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
> > > > > +        return 0;
> > > > > +
> > > > > +    /* Both start and size of the extended region should be 2MB
> > > > > aligned
> > > > > */
> > > > > +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
> > > > > +    if ( start > e )
> > > > > +        return 0;
> > > > > +
> > > > > +    size = (e - start + 1) & ~(SZ_2M - 1);
> > > > > +    if ( !size )
> > > > > +        return 0;
> > > > Can't you align size as well?
> > > > 
> > > >     size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
> > > I am sorry, I don't entirely get what you really meant here. We get both
> > > start
> > > and size 2MB-aligned by the calculations above
> > > (when calculating an alignment, we need to make sure that "start_passed <=
> > > start_aligned && size_aligned <= size_passed").
> > > If I add the proposing string after, I will reduce the already aligned
> > > size by
> > > 2MB.
> > > If I replace the size calculation with the following, I will get the
> > > reduced
> > > size even if the passed region is initially 2MB-aligned, so doesn't need
> > > to be
> > > adjusted.
> > > size = e - s + 1;
> > > size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
> > Sorry I misread your original code, I think it was working as intended
> > except for the "+1". I think it should be:
> > 
> >    size = (e - start) & ~(SZ_2M - 1);
> But why without "+1"? Isn't "e" here the *last address* of passed range?
> Without "+1" I get non entirely correct calculations, last valid 2MB is
> missed.

You are right: the "+1" should not be needed if this was "end",
following the normal definition of end. However, add_ext_regions is
called by rangeset_report_ranges, so end here is not actually "end", it
is "end-1".

For clarity, I would ask you to rewrite it like this:

/* 
 * e is actually "end-1" because it is called by rangeset functions
 * which are inclusive of the last address.
 */
e += 1;
size = (e - start) & ~(SZ_2M - 1);


> [snip]
> (XEN) Extended region 14: 0x580000000->0x5ffe00000
> (XEN) Extended region 15: 0x680000000->0x6ffe00000
> (XEN) Extended region 16: 0x780000000->0xffffe00000
> 
> But should get:
> 
> [snip]
> (XEN) Extended region 15: 0x580000000->0x600000000
> (XEN) Extended region 16: 0x680000000->0x700000000
> (XEN) Extended region 17: 0x780000000->0x10000000000
> 
> Let's consider how a hole between (for example) RAM bank 1 and bank 2 is
> calculated:
> (XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1 with size
> 0x80000000
> (XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2 with size
> 0x80000000
> So the hole size should also be 0x80000000.
> If we pass these RAM banks to rangeset_remove_range() one by one:
> 1: s = 0x500000000 e = 0x57FFFFFFF
> 2. s = 0x600000000 e = 0x67FFFFFFF
> we get s = 0x580000000 e = 0x5FFFFFFFF in add_ext_regions(), where "e" is the
> last address of the hole (not the first address out of the hole), so I think,
> that for proper size calculation we need to add 1 to "e - s". Or I really
> missed something?
> 
> 
> > 
> > > > > + */
> > > > > +#define EXT_REGION_START   0x40000000ULL
> > > > > +#define EXT_REGION_END     0x80003fffffffULL
> > > > > +
> > > > > +static int __init find_unallocated_memory(const struct kernel_info
> > > > > *kinfo,
> > > > > +                                          struct meminfo
> > > > > *ext_regions)
> > > > > +{
> > > > > +    const struct meminfo *assign_mem = &kinfo->mem;
> > > > > +    struct rangeset *unalloc_mem;
> > > > > +    paddr_t start, end;
> > > > > +    unsigned int i;
> > > > > +    int res;
> > > > > +
> > > > > +    dt_dprintk("Find unallocated memory for extended regions\n");
> > > > > +
> > > > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > > > +    if ( !unalloc_mem )
> > > > > +        return -ENOMEM;
> > > > > +
> > > > > +    /* Start with all available RAM */
> > > > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > > > +    {
> > > > > +        start = bootinfo.mem.bank[i].start;
> > > > > +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size
> > > > > - 1;
> > > > Is the -1 needed? Isn't it going to screw up the size calculation later?
> > > I thought, it was needed. The calculation seems correct.
> > I think that normally for an example MMIO region:
> > 
> > start = 0x48000000
> > size  = 0x40000000
> > end   = 0x88000000
> > 
> > So end = start + size and points to the first address out of the range.
> > In other words, 0x88000000 doesn't actually belong to the MMIO region in
> > the example.
> > 
> > But here you are passing addresses to rangeset_add_range and other
> > rangeset functions and I think rangeset takes *inclusive* addresses as
> > input. So you need to pass start and end-1 because end-1 is the last
> > address of the MMIO region.
> > 
> > In fact you can see for instance in map_range_to_domain:
> > 
> >          res = iomem_permit_access(d, paddr_to_pfn(addr),
> >                  paddr_to_pfn(PAGE_ALIGN(addr + len - 1)));
> > 
> > Where iomem_permit_access is based on rangeset. So for clarity, I would
> > do:
> > 
> > start = assign_mem->bank[i].start;
> > end = assign_mem->bank[i].start + assign_mem->bank[i].size;
> > res = rangeset_remove_range(unalloc_mem, start, end - 1);
> > 
> > So that we don't get confused on the meaning of "end" which everywhere
> > else means the first address not in range.
> 
> I got your point, I will update the code if it much cleaner.
> 
> 
> > > > > +        res = rangeset_add_range(unalloc_mem, start, end);
> > > > > +        if ( res )
> > > > > +        {
> > > > > +            printk(XENLOG_ERR "Failed to add:
> > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > +                   start, end);
> > > > > +            goto out;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    /* Remove RAM assigned to Dom0 */
> > > > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > > > +    {
> > > > > +        start = assign_mem->bank[i].start;
> > > > > +        end = assign_mem->bank[i].start + assign_mem->bank[i].size -
> > > > > 1;
> > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > +        if ( res )
> > > > > +        {
> > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > +                   start, end);
> > > > > +            goto out;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    /* Remove reserved-memory regions */
> > > > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> > > > > +    {
> > > > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > +        if ( res )
> > > > > +        {
> > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > +                   start, end);
> > > > > +            goto out;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    /* Remove grant table region */
> > > > > +    start = kinfo->gnttab_start;
> > > > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > +    if ( res )
> > > > > +    {
> > > > > +        printk(XENLOG_ERR "Failed to remove:
> > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > +               start, end);
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    start = EXT_REGION_START;
> > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > > > +                                 add_ext_regions, ext_regions);
> > > > > +    if ( res )
> > > > > +        ext_regions->nr_banks = 0;
> > > > > +    else if ( !ext_regions->nr_banks )
> > > > > +        res = -ENOENT;
> > > > > +
> > > > > +out:
> > > > > +    rangeset_destroy(unalloc_mem);
> > > > > +
> > > > > +    return res;
> > > > > +}
> > > > > +
> > > > > +static int __init find_memory_holes(const struct kernel_info *kinfo,
> > > > > +                                    struct meminfo *ext_regions)
> > > > > +{
> > > > > +    struct dt_device_node *np;
> > > > > +    struct rangeset *mem_holes;
> > > > > +    paddr_t start, end;
> > > > > +    unsigned int i;
> > > > > +    int res;
> > > > > +
> > > > > +    dt_dprintk("Find memory holes for extended regions\n");
> > > > > +
> > > > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > > > +    if ( !mem_holes )
> > > > > +        return -ENOMEM;
> > > > > +
> > > > > +    /* Start with maximum possible addressable physical memory range
> > > > > */
> > > > > +    start = EXT_REGION_START;
> > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > +    res = rangeset_add_range(mem_holes, start, end);
> > > > > +    if ( res )
> > > > > +    {
> > > > > +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > > > +               start, end);
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    /* Remove all regions described by "reg" property (MMIO, RAM,
> > > > > etc) */
> > > > > +    dt_for_each_device_node( dt_host, np )
> > > > Don't you need something like device_tree_for_each_node ?
> > > > dt_for_each_device_node won't go down any deeper in the tree?
> > > Thank you for pointing this out, I will investigate and update the patch.
> > > 
> > > 
> > > > Alternatively, maybe we could simply record the highest possible address
> > > > of any memory/device/anything as we scan the device tree with
> > > > handle_node. Then we can use that as the starting point here.
> > > I also don't like the idea to scan the DT much, but I failed to find an
> > > effective solution how to avoid that.
> > > Yes, we can record the highest possible address, but I am afraid, I didn't
> > > entirely get a suggestion. Is the suggestion to provide a single region
> > > starting from highest possible address + 1 and up to the EXT_REGION_END
> > > suitably aligned? Could you please clarify?
> > Yes, that is what I was suggesting as a possible alternative: start from
> > the highest possible address in DT + 1 and up to the EXT_REGION_END
> > suitably aligned. But that wouldn't solve the <4GB issue.
> > 
> > > > > +                goto out;
> > > > > +            }
> > > > > +
> > > > > +            start = addr & PAGE_MASK;
> > > > > +            end = PAGE_ALIGN(addr + size) - 1;
> > > > > +            res = rangeset_remove_range(mem_holes, start, end);
> > > > > +            if ( res )
> > > > > +            {
> > > > > +                printk(XENLOG_ERR "Failed to remove:
> > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > +                       start, end);
> > > > > +                goto out;
> > > > > +            }
> > > > > +        }
> > > > > +    }
> > > > As is, it will result in a myriad of small ranges which is unuseful and
> > > > slow to parse. I suggest to simplify it by removing a larger region than
> > > > strictly necessary. For instance, you could remove a 1GB-aligned and
> > > > 1GB-multiple region for each range. That way, you are going to get fewer
> > > > large free ranges instance of many small ones which we don't need.
> > > I agree with you that a lot of small ranges increase the bookkeeping in
> > > Dom0
> > > and there is also a theoretical (?) possibility
> > > that small ranges occupy all space we provide for extended regions
> > > (NR_MEM_BANKS)...
> > > But, let's consider my setup as an example again, but when the IOMMU is
> > > enabled for Dom0 ("holes found in DT").
> > > 
> > > --- The RAM configuration is the same:
> > > 
> > > (XEN) RAM: 0000000048000000 - 00000000bfffffff <--- RAM bank 0
> > > (XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1
> > > (XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2
> > > (XEN) RAM: 0000000700000000 - 000000077fffffff <--- RAM bank 3
> > > 
> > > --- There are a lot of various platform devices with reg property
> > > described in
> > > DT, I will probably not post all IO ranges here, just say that mostly all
> > > of
> > > them to be mapped at 0xE0000000-0xFFFFFFFF.
> > > 
> > > --- As we only pick up ranges with size >= 2MB, the calculated extended
> > > regions are (based on 40-bit IPA):
> > > 
> > > (XEN) Extended region 0: 0x40000000->0x47e00000
> > > (XEN) Extended region 1: 0xc0000000->0xe6000000
> > > (XEN) Extended region 2: 0xe7000000->0xe7200000
> > > (XEN) Extended region 3: 0xe7400000->0xe7600000
> > > (XEN) Extended region 4: 0xe7800000->0xec000000
> > > (XEN) Extended region 5: 0xec200000->0xec400000
> > > (XEN) Extended region 6: 0xec800000->0xee000000
> > > (XEN) Extended region 7: 0xee600000->0xee800000
> > > (XEN) Extended region 8: 0xeea00000->0xf1000000
> > > (XEN) Extended region 9: 0xf1200000->0xfd000000
> > > (XEN) Extended region 10: 0xfd200000->0xfd800000
> > > (XEN) Extended region 11: 0xfda00000->0xfe000000
> > > (XEN) Extended region 12: 0xfe200000->0xfe600000
> > > (XEN) Extended region 13: 0xfec00000->0xff800000
> > > (XEN) Extended region 14: 0x100000000->0x500000000
> > > (XEN) Extended region 15: 0x580000000->0x600000000
> > > (XEN) Extended region 16: 0x680000000->0x700000000
> > > (XEN) Extended region 17: 0x780000000->0x10000000000
> > > 
> > > So, if I *correctly* understood your idea about removing 1GB-aligned
> > > 1GB-multiple region for each range we would get the following:
> > > 
> > > (XEN) Extended region 0: 0x100000000->0x500000000
> > > (XEN) Extended region 1: 0x580000000->0x600000000
> > > (XEN) Extended region 2: 0x680000000->0x700000000
> > > (XEN) Extended region 3: 0x780000000->0x10000000000
> > > 
> > > As you can see there are no extended regions below 4GB at all. I assume,
> > > it
> > > would be good to provide them for 1:1 mapped Dom0 (for 32-bit DMA
> > > devices?)
> > > What else worries me is that IPA size could be 36 or even 32. So, I am
> > > afraid,
> > > we might even fail to find extended regions above 4GB.
> > > 
> > > 
> > > I think, if 2MB is considered small enough to bother with, probably we
> > > should
> > > go with something in between (16MB, 32MB, 64MB).
> > > For example, we can take into the account ranges with size >= 16MB:
> > > 
> > > (XEN) Extended region 0: 0x40000000->0x47e00000
> > > (XEN) Extended region 1: 0xc0000000->0xe6000000
> > > (XEN) Extended region 2: 0xe7800000->0xec000000
> > > (XEN) Extended region 3: 0xec800000->0xee000000
> > > (XEN) Extended region 4: 0xeea00000->0xf1000000
> > > (XEN) Extended region 5: 0xf1200000->0xfd000000
> > > (XEN) Extended region 6: 0x100000000->0x500000000
> > > (XEN) Extended region 7: 0x580000000->0x600000000
> > > (XEN) Extended region 8: 0x680000000->0x700000000
> > > (XEN) Extended region 9: 0x780000000->0x10000000000
> > > 
> > > Any thoughts?
> > Yeah maybe an intermediate value would be best. I'd go with 64MB.
> 
> I completely agree.
> 
> So what I got on my setup with that value.
> 
> 1. IOMMU is enabled for Dom0:
> 
> (XEN) Extended region 0: 0x40000000->0x47e00000
> (XEN) Extended region 1: 0xc0000000->0xe6000000
> (XEN) Extended region 2: 0xe7800000->0xec000000
> (XEN) Extended region 3: 0xf1200000->0xfd000000
> (XEN) Extended region 4: 0x100000000->0x500000000
> (XEN) Extended region 5: 0x580000000->0x600000000
> (XEN) Extended region 6: 0x680000000->0x700000000
> (XEN) Extended region 7: 0x780000000->0x10000000000
> 
> 2. IOMMU is disabled for Dom0:
> 
> (XEN) Extended region 0: 0x48000000->0x54000000
> (XEN) Extended region 1: 0x57000000->0x60000000
> (XEN) Extended region 2: 0x70000000->0x78000000
> (XEN) Extended region 3: 0x78200000->0xc0000000
> (XEN) Extended region 4: 0x500000000->0x580000000
> (XEN) Extended region 5: 0x600000000->0x680000000
> (XEN) Extended region 6: 0x700000000->0x780000000
> 
> Which is not bad.

Yeah I think that's good.
Oleksandr Tyshchenko Sept. 17, 2021, 7:28 a.m. UTC | #6
On 17.09.21 00:30, Stefano Stabellini wrote:

Hi Stefano

> On Thu, 16 Sep 2021, Oleksandr wrote:
>>> On Wed, 15 Sep 2021, Oleksandr wrote:
>>>>> On Fri, 10 Sep 2021, Oleksandr Tyshchenko wrote:
>>>>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>>>
>>>>>> The extended region (safe range) is a region of guest physical
>>>>>> address space which is unused and could be safely used to create
>>>>>> grant/foreign mappings instead of wasting real RAM pages from
>>>>>> the domain memory for establishing these mappings.
>>>>>>
>>>>>> The extended regions are chosen at the domain creation time and
>>>>>> advertised to it via "reg" property under hypervisor node in
>>>>>> the guest device-tree. As region 0 is reserved for grant table
>>>>>> space (always present), the indexes for extended regions are 1...N.
>>>>>> If extended regions could not be allocated for some reason,
>>>>>> Xen doesn't fail and behaves as usual, so only inserts region 0.
>>>>>>
>>>>>> Please note the following limitations:
>>>>>> - The extended region feature is only supported for 64-bit domain.
>>>>>> - The ACPI case is not covered.
>>>>>>
>>>>>> ***
>>>>>>
>>>>>> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
>>>>>> the algorithm to choose extended regions for it is different
>>>>>> in comparison with the algorithm for non-direct mapped DomU.
>>>>>> What is more, that extended regions should be chosen differently
>>>>>> whether IOMMU is enabled or not.
>>>>>>
>>>>>> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
>>>>>> holes found in host device-tree if otherwise. Make sure that
>>>>>> extended regions are 2MB-aligned and located within maximum possible
>>>>>> addressable physical memory range. The maximum number of extended
>>>>>> regions is 128.
>>>>>>
>>>>>> Suggested-by: Julien Grall <jgrall@amazon.com>
>>>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>>> ---
>>>>>> Changes since RFC:
>>>>>>       - update patch description
>>>>>>       - drop uneeded "extended-region" DT property
>>>>>> ---
>>>>>>
>>>>>>     xen/arch/arm/domain_build.c | 226
>>>>>> +++++++++++++++++++++++++++++++++++++++++++-
>>>>>>     1 file changed, 224 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>>>>>> index 206038d..070ec27 100644
>>>>>> --- a/xen/arch/arm/domain_build.c
>>>>>> +++ b/xen/arch/arm/domain_build.c
>>>>>> @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct
>>>>>> domain *d,
>>>>>>         return res;
>>>>>>     }
>>>>>>     +static int __init add_ext_regions(unsigned long s, unsigned long
>>>>>> e,
>>>>>> void *data)
>>>>>> +{
>>>>>> +    struct meminfo *ext_regions = data;
>>>>>> +    paddr_t start, size;
>>>>>> +
>>>>>> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    /* Both start and size of the extended region should be 2MB
>>>>>> aligned
>>>>>> */
>>>>>> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
>>>>>> +    if ( start > e )
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    size = (e - start + 1) & ~(SZ_2M - 1);
>>>>>> +    if ( !size )
>>>>>> +        return 0;
>>>>> Can't you align size as well?
>>>>>
>>>>>      size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
>>>> I am sorry, I don't entirely get what you really meant here. We get both
>>>> start
>>>> and size 2MB-aligned by the calculations above
>>>> (when calculating an alignment, we need to make sure that "start_passed <=
>>>> start_aligned && size_aligned <= size_passed").
>>>> If I add the proposing string after, I will reduce the already aligned
>>>> size by
>>>> 2MB.
>>>> If I replace the size calculation with the following, I will get the
>>>> reduced
>>>> size even if the passed region is initially 2MB-aligned, so doesn't need
>>>> to be
>>>> adjusted.
>>>> size = e - s + 1;
>>>> size = (size - (SZ_2M - 1)) & ~(SZ_2M - 1);
>>> Sorry I misread your original code, I think it was working as intended
>>> except for the "+1". I think it should be:
>>>
>>>     size = (e - start) & ~(SZ_2M - 1);
>> But why without "+1"? Isn't "e" here the *last address* of passed range?
>> Without "+1" I get non entirely correct calculations, last valid 2MB is
>> missed.
> You are right: the "+1" should not be needed if this was "end",
> following the normal definition of end. However, add_ext_regions is
> called by rangeset_report_ranges, so end here is not actually "end", it
> is "end-1".

Yes.


>
> For clarity, I would ask you to rewrite it like this:
>
> /*
>   * e is actually "end-1" because it is called by rangeset functions
>   * which are inclusive of the last address.
>   */
> e += 1;
> size = (e - start) & ~(SZ_2M - 1);

Ack, will do.


>
>
>> [snip]
>> (XEN) Extended region 14: 0x580000000->0x5ffe00000
>> (XEN) Extended region 15: 0x680000000->0x6ffe00000
>> (XEN) Extended region 16: 0x780000000->0xffffe00000
>>
>> But should get:
>>
>> [snip]
>> (XEN) Extended region 15: 0x580000000->0x600000000
>> (XEN) Extended region 16: 0x680000000->0x700000000
>> (XEN) Extended region 17: 0x780000000->0x10000000000
>>
>> Let's consider how a hole between (for example) RAM bank 1 and bank 2 is
>> calculated:
>> (XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1 with size
>> 0x80000000
>> (XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2 with size
>> 0x80000000
>> So the hole size should also be 0x80000000.
>> If we pass these RAM banks to rangeset_remove_range() one by one:
>> 1: s = 0x500000000 e = 0x57FFFFFFF
>> 2. s = 0x600000000 e = 0x67FFFFFFF
>> we get s = 0x580000000 e = 0x5FFFFFFFF in add_ext_regions(), where "e" is the
>> last address of the hole (not the first address out of the hole), so I think,
>> that for proper size calculation we need to add 1 to "e - s". Or I really
>> missed something?
>>
>>
>>>>>> + */
>>>>>> +#define EXT_REGION_START   0x40000000ULL
>>>>>> +#define EXT_REGION_END     0x80003fffffffULL
>>>>>> +
>>>>>> +static int __init find_unallocated_memory(const struct kernel_info
>>>>>> *kinfo,
>>>>>> +                                          struct meminfo
>>>>>> *ext_regions)
>>>>>> +{
>>>>>> +    const struct meminfo *assign_mem = &kinfo->mem;
>>>>>> +    struct rangeset *unalloc_mem;
>>>>>> +    paddr_t start, end;
>>>>>> +    unsigned int i;
>>>>>> +    int res;
>>>>>> +
>>>>>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>>>>>> +
>>>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>>>> +    if ( !unalloc_mem )
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    /* Start with all available RAM */
>>>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>>>> +    {
>>>>>> +        start = bootinfo.mem.bank[i].start;
>>>>>> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size
>>>>>> - 1;
>>>>> Is the -1 needed? Isn't it going to screw up the size calculation later?
>>>> I thought, it was needed. The calculation seems correct.
>>> I think that normally for an example MMIO region:
>>>
>>> start = 0x48000000
>>> size  = 0x40000000
>>> end   = 0x88000000
>>>
>>> So end = start + size and points to the first address out of the range.
>>> In other words, 0x88000000 doesn't actually belong to the MMIO region in
>>> the example.
>>>
>>> But here you are passing addresses to rangeset_add_range and other
>>> rangeset functions and I think rangeset takes *inclusive* addresses as
>>> input. So you need to pass start and end-1 because end-1 is the last
>>> address of the MMIO region.
>>>
>>> In fact you can see for instance in map_range_to_domain:
>>>
>>>           res = iomem_permit_access(d, paddr_to_pfn(addr),
>>>                   paddr_to_pfn(PAGE_ALIGN(addr + len - 1)));
>>>
>>> Where iomem_permit_access is based on rangeset. So for clarity, I would
>>> do:
>>>
>>> start = assign_mem->bank[i].start;
>>> end = assign_mem->bank[i].start + assign_mem->bank[i].size;
>>> res = rangeset_remove_range(unalloc_mem, start, end - 1);
>>>
>>> So that we don't get confused on the meaning of "end" which everywhere
>>> else means the first address not in range.
>> I got your point, I will update the code if it much cleaner.
>>
>>
>>>>>> +        res = rangeset_add_range(unalloc_mem, start, end);
>>>>>> +        if ( res )
>>>>>> +        {
>>>>>> +            printk(XENLOG_ERR "Failed to add:
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +                   start, end);
>>>>>> +            goto out;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove RAM assigned to Dom0 */
>>>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>>>> +    {
>>>>>> +        start = assign_mem->bank[i].start;
>>>>>> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size -
>>>>>> 1;
>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>> +        if ( res )
>>>>>> +        {
>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +                   start, end);
>>>>>> +            goto out;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove reserved-memory regions */
>>>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>>>>>> +    {
>>>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>> +        if ( res )
>>>>>> +        {
>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +                   start, end);
>>>>>> +            goto out;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove grant table region */
>>>>>> +    start = kinfo->gnttab_start;
>>>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>> +    if ( res )
>>>>>> +    {
>>>>>> +        printk(XENLOG_ERR "Failed to remove:
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +               start, end);
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    start = EXT_REGION_START;
>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>>>> +                                 add_ext_regions, ext_regions);
>>>>>> +    if ( res )
>>>>>> +        ext_regions->nr_banks = 0;
>>>>>> +    else if ( !ext_regions->nr_banks )
>>>>>> +        res = -ENOENT;
>>>>>> +
>>>>>> +out:
>>>>>> +    rangeset_destroy(unalloc_mem);
>>>>>> +
>>>>>> +    return res;
>>>>>> +}
>>>>>> +
>>>>>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>>>>>> +                                    struct meminfo *ext_regions)
>>>>>> +{
>>>>>> +    struct dt_device_node *np;
>>>>>> +    struct rangeset *mem_holes;
>>>>>> +    paddr_t start, end;
>>>>>> +    unsigned int i;
>>>>>> +    int res;
>>>>>> +
>>>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>>>> +
>>>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>>>> +    if ( !mem_holes )
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    /* Start with maximum possible addressable physical memory range
>>>>>> */
>>>>>> +    start = EXT_REGION_START;
>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>>>> +    if ( res )
>>>>>> +    {
>>>>>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +               start, end);
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove all regions described by "reg" property (MMIO, RAM,
>>>>>> etc) */
>>>>>> +    dt_for_each_device_node( dt_host, np )
>>>>> Don't you need something like device_tree_for_each_node ?
>>>>> dt_for_each_device_node won't go down any deeper in the tree?
>>>> Thank you for pointing this out, I will investigate and update the patch.
>>>>
>>>>
>>>>> Alternatively, maybe we could simply record the highest possible address
>>>>> of any memory/device/anything as we scan the device tree with
>>>>> handle_node. Then we can use that as the starting point here.
>>>> I also don't like the idea to scan the DT much, but I failed to find an
>>>> effective solution how to avoid that.
>>>> Yes, we can record the highest possible address, but I am afraid, I didn't
>>>> entirely get a suggestion. Is the suggestion to provide a single region
>>>> starting from highest possible address + 1 and up to the EXT_REGION_END
>>>> suitably aligned? Could you please clarify?
>>> Yes, that is what I was suggesting as a possible alternative: start from
>>> the highest possible address in DT + 1 and up to the EXT_REGION_END
>>> suitably aligned. But that wouldn't solve the <4GB issue.
>>>
>>>>>> +                goto out;
>>>>>> +            }
>>>>>> +
>>>>>> +            start = addr & PAGE_MASK;
>>>>>> +            end = PAGE_ALIGN(addr + size) - 1;
>>>>>> +            res = rangeset_remove_range(mem_holes, start, end);
>>>>>> +            if ( res )
>>>>>> +            {
>>>>>> +                printk(XENLOG_ERR "Failed to remove:
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +                       start, end);
>>>>>> +                goto out;
>>>>>> +            }
>>>>>> +        }
>>>>>> +    }
>>>>> As is, it will result in a myriad of small ranges which is unuseful and
>>>>> slow to parse. I suggest to simplify it by removing a larger region than
>>>>> strictly necessary. For instance, you could remove a 1GB-aligned and
>>>>> 1GB-multiple region for each range. That way, you are going to get fewer
>>>>> large free ranges instance of many small ones which we don't need.
>>>> I agree with you that a lot of small ranges increase the bookkeeping in
>>>> Dom0
>>>> and there is also a theoretical (?) possibility
>>>> that small ranges occupy all space we provide for extended regions
>>>> (NR_MEM_BANKS)...
>>>> But, let's consider my setup as an example again, but when the IOMMU is
>>>> enabled for Dom0 ("holes found in DT").
>>>>
>>>> --- The RAM configuration is the same:
>>>>
>>>> (XEN) RAM: 0000000048000000 - 00000000bfffffff <--- RAM bank 0
>>>> (XEN) RAM: 0000000500000000 - 000000057fffffff <--- RAM bank 1
>>>> (XEN) RAM: 0000000600000000 - 000000067fffffff <--- RAM bank 2
>>>> (XEN) RAM: 0000000700000000 - 000000077fffffff <--- RAM bank 3
>>>>
>>>> --- There are a lot of various platform devices with reg property
>>>> described in
>>>> DT, I will probably not post all IO ranges here, just say that mostly all
>>>> of
>>>> them to be mapped at 0xE0000000-0xFFFFFFFF.
>>>>
>>>> --- As we only pick up ranges with size >= 2MB, the calculated extended
>>>> regions are (based on 40-bit IPA):
>>>>
>>>> (XEN) Extended region 0: 0x40000000->0x47e00000
>>>> (XEN) Extended region 1: 0xc0000000->0xe6000000
>>>> (XEN) Extended region 2: 0xe7000000->0xe7200000
>>>> (XEN) Extended region 3: 0xe7400000->0xe7600000
>>>> (XEN) Extended region 4: 0xe7800000->0xec000000
>>>> (XEN) Extended region 5: 0xec200000->0xec400000
>>>> (XEN) Extended region 6: 0xec800000->0xee000000
>>>> (XEN) Extended region 7: 0xee600000->0xee800000
>>>> (XEN) Extended region 8: 0xeea00000->0xf1000000
>>>> (XEN) Extended region 9: 0xf1200000->0xfd000000
>>>> (XEN) Extended region 10: 0xfd200000->0xfd800000
>>>> (XEN) Extended region 11: 0xfda00000->0xfe000000
>>>> (XEN) Extended region 12: 0xfe200000->0xfe600000
>>>> (XEN) Extended region 13: 0xfec00000->0xff800000
>>>> (XEN) Extended region 14: 0x100000000->0x500000000
>>>> (XEN) Extended region 15: 0x580000000->0x600000000
>>>> (XEN) Extended region 16: 0x680000000->0x700000000
>>>> (XEN) Extended region 17: 0x780000000->0x10000000000
>>>>
>>>> So, if I *correctly* understood your idea about removing 1GB-aligned
>>>> 1GB-multiple region for each range we would get the following:
>>>>
>>>> (XEN) Extended region 0: 0x100000000->0x500000000
>>>> (XEN) Extended region 1: 0x580000000->0x600000000
>>>> (XEN) Extended region 2: 0x680000000->0x700000000
>>>> (XEN) Extended region 3: 0x780000000->0x10000000000
>>>>
>>>> As you can see there are no extended regions below 4GB at all. I assume,
>>>> it
>>>> would be good to provide them for 1:1 mapped Dom0 (for 32-bit DMA
>>>> devices?)
>>>> What else worries me is that IPA size could be 36 or even 32. So, I am
>>>> afraid,
>>>> we might even fail to find extended regions above 4GB.
>>>>
>>>>
>>>> I think, if 2MB is considered small enough to bother with, probably we
>>>> should
>>>> go with something in between (16MB, 32MB, 64MB).
>>>> For example, we can take into the account ranges with size >= 16MB:
>>>>
>>>> (XEN) Extended region 0: 0x40000000->0x47e00000
>>>> (XEN) Extended region 1: 0xc0000000->0xe6000000
>>>> (XEN) Extended region 2: 0xe7800000->0xec000000
>>>> (XEN) Extended region 3: 0xec800000->0xee000000
>>>> (XEN) Extended region 4: 0xeea00000->0xf1000000
>>>> (XEN) Extended region 5: 0xf1200000->0xfd000000
>>>> (XEN) Extended region 6: 0x100000000->0x500000000
>>>> (XEN) Extended region 7: 0x580000000->0x600000000
>>>> (XEN) Extended region 8: 0x680000000->0x700000000
>>>> (XEN) Extended region 9: 0x780000000->0x10000000000
>>>>
>>>> Any thoughts?
>>> Yeah maybe an intermediate value would be best. I'd go with 64MB.
>> I completely agree.
>>
>> So what I got on my setup with that value.
>>
>> 1. IOMMU is enabled for Dom0:
>>
>> (XEN) Extended region 0: 0x40000000->0x47e00000
>> (XEN) Extended region 1: 0xc0000000->0xe6000000
>> (XEN) Extended region 2: 0xe7800000->0xec000000
>> (XEN) Extended region 3: 0xf1200000->0xfd000000
>> (XEN) Extended region 4: 0x100000000->0x500000000
>> (XEN) Extended region 5: 0x580000000->0x600000000
>> (XEN) Extended region 6: 0x680000000->0x700000000
>> (XEN) Extended region 7: 0x780000000->0x10000000000
>>
>> 2. IOMMU is disabled for Dom0:
>>
>> (XEN) Extended region 0: 0x48000000->0x54000000
>> (XEN) Extended region 1: 0x57000000->0x60000000
>> (XEN) Extended region 2: 0x70000000->0x78000000
>> (XEN) Extended region 3: 0x78200000->0xc0000000
>> (XEN) Extended region 4: 0x500000000->0x580000000
>> (XEN) Extended region 5: 0x600000000->0x680000000
>> (XEN) Extended region 6: 0x700000000->0x780000000
>>
>> Which is not bad.
> Yeah I think that's good.
Oleksandr Tyshchenko Sept. 17, 2021, 2:08 p.m. UTC | #7
On 15.09.21 22:10, Oleksandr wrote:

Hi Stefano.

[snip]
>
>>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>>> +                                    struct meminfo *ext_regions)
>>> +{
>>> +    struct dt_device_node *np;
>>> +    struct rangeset *mem_holes;
>>> +    paddr_t start, end;
>>> +    unsigned int i;
>>> +    int res;
>>> +
>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>> +
>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>> +    if ( !mem_holes )
>>> +        return -ENOMEM;
>>> +
>>> +    /* Start with maximum possible addressable physical memory 
>>> range */
>>> +    start = EXT_REGION_START;
>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>> +    res = rangeset_add_range(mem_holes, start, end);
>>> +    if ( res )
>>> +    {
>>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>> +               start, end);
>>> +        goto out;
>>> +    }
>>> +
>>> +    /* Remove all regions described by "reg" property (MMIO, RAM, 
>>> etc) */
>>> +    dt_for_each_device_node( dt_host, np )
>> Don't you need something like device_tree_for_each_node ?
>> dt_for_each_device_node won't go down any deeper in the tree?
>
> Thank you for pointing this out, I will investigate and update the patch.

I have checked: dt_for_each_device_node( dt_host, np ) iterates over all
nodes, so nothing will be skipped.

As an example for this node:

         hdmi@fead0000 {
             compatible = "renesas,r8a7795-hdmi", "renesas,rcar-gen3-hdmi";
             reg = <0x0 0xfead0000 0x0 0x10000>;
             interrupts = <0x0 0x185 0x4>;
             clocks = <0xc 0x1 0x2d9 0xc 0x0 0x28>;
             clock-names = "iahb", "isfr";
             power-domains = <0x9 0x20>;
             resets = <0xc 0x2d9>;
             status = "okay";
             iommus = <0x50 0xc>;
             xen,passthrough;

             ports {
                 #address-cells = <0x1>;
                 #size-cells = <0x0>;

                 port@0 {
                     reg = <0x0>;

                     endpoint {
                         remote-endpoint = <0xb1>;
                         phandle = <0xc1>;
                     };
                 };

                 port@1 {
                     reg = <0x1>;

                     endpoint {
                         remote-endpoint = <0xb2>;
                         phandle = <0xd1>;
                     };
                 };

                 port@2 {
                     reg = <0x2>;

                     endpoint {
                         remote-endpoint = <0x6f>;
                         phandle = <0x6e>;
                     };
                 };
             };
         };


(XEN) process /soc/hdmi@fead0000
(XEN) ---number_of_address = 1
(XEN) -------0: 0xfead0000->0xfeae0000
(XEN) process /soc/hdmi@fead0000/ports
(XEN) ---number_of_address = 0
(XEN) process /soc/hdmi@fead0000/ports/port@0
(XEN) ---number_of_address = 0
(XEN) process /soc/hdmi@fead0000/ports/port@0/endpoint
(XEN) ---number_of_address = 0
(XEN) process /soc/hdmi@fead0000/ports/port@1
(XEN) ---number_of_address = 0
(XEN) process /soc/hdmi@fead0000/ports/port@1/endpoint
(XEN) ---number_of_address = 0
(XEN) process /soc/hdmi@fead0000/ports/port@2
(XEN) ---number_of_address = 0
(XEN) process /soc/hdmi@fead0000/ports/port@2/endpoint
(XEN) ---number_of_address = 0
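
For reference, output of that shape can be produced with a small debug
loop like the sketch below (it only relies on the dt_* helpers already
used in the patch; the actual debug code may have looked different):

static void __init dump_dt_reg_addresses(void)
{
    struct dt_device_node *np;

    dt_for_each_device_node( dt_host, np )
    {
        unsigned int naddr = dt_number_of_address(np);
        unsigned int i;
        u64 addr, size;

        printk("process %s\n", dt_node_full_name(np));
        printk("---number_of_address = %u\n", naddr);

        for ( i = 0; i < naddr; i++ )
        {
            if ( !dt_device_get_address(np, i, &addr, &size) )
                printk("-------%u: %#"PRIx64"->%#"PRIx64"\n",
                       i, addr, addr + size);
        }
    }
}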




[snip]
Julien Grall Sept. 17, 2021, 3:48 p.m. UTC | #8
Hi Oleksandr,

On 10/09/2021 23:18, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> 
> The extended region (safe range) is a region of guest physical
> address space which is unused and could be safely used to create
> grant/foreign mappings instead of wasting real RAM pages from
> the domain memory for establishing these mappings.
> 
> The extended regions are chosen at the domain creation time and
> advertised to it via "reg" property under hypervisor node in
> the guest device-tree. As region 0 is reserved for grant table
> space (always present), the indexes for extended regions are 1...N.
> If extended regions could not be allocated for some reason,
> Xen doesn't fail and behaves as usual, so only inserts region 0.
> 
> Please note the following limitations:
> - The extended region feature is only supported for 64-bit domain.
> - The ACPI case is not covered.

I understand that ACPI is not covered because we would need to create a
new binding. But I am not sure I understand why a 32-bit domain is not
supported. Can you explain it?

> 
> ***
> 
> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
> the algorithm to choose extended regions for it is different
> in comparison with the algorithm for non-direct mapped DomU.
> What is more, that extended regions should be chosen differently
> whether IOMMU is enabled or not.
> 
> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
> holes found in host device-tree if otherwise. 

For the case when the IOMMU is disabled, this will only work if dom0 
cannot allocate memory outside of the original range. This is currently 
the case... but I think this should be spelled out in at least the 
commit message.

> Make sure that
> extended regions are 2MB-aligned and located within maximum possible
> addressable physical memory range. The maximum number of extended
> regions is 128.

Please explain how this limit was chosen.

> 
> Suggested-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> ---
> Changes since RFC:
>     - update patch description
>     - drop uneeded "extended-region" DT property
> ---
> 
>   xen/arch/arm/domain_build.c | 226 +++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 224 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index 206038d..070ec27 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct domain *d,
>       return res;
>   }
>   
> +static int __init add_ext_regions(unsigned long s, unsigned long e, void *data)
> +{
> +    struct meminfo *ext_regions = data;
> +    paddr_t start, size;
> +
> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
> +        return 0;
> +
> +    /* Both start and size of the extended region should be 2MB aligned */
> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
> +    if ( start > e )
> +        return 0;
> +
> +    size = (e - start + 1) & ~(SZ_2M - 1);
> +    if ( !size )
> +        return 0;
> +
> +    ext_regions->bank[ext_regions->nr_banks].start = start;
> +    ext_regions->bank[ext_regions->nr_banks].size = size;
> +    ext_regions->nr_banks ++;
> +
> +    return 0;
> +}
> +
> +/*
> + * The extended regions will be prevalidated by the memory hotplug path
> + * in Linux which requires for any added address range to be within maximum
> + * possible addressable physical memory range for which the linear mapping
> + * could be created.
> + * For 48-bit VA space size the maximum addressable range are:

When I read "maximum", I understand an upper limit. But below, you are 
providing a range. So should you drop "maximum"?

Also, this is tailored to Linux using 48-bit VA. How about other limits?

> + * 0x40000000 - 0x80003fffffff
> + */
> +#define EXT_REGION_START   0x40000000ULL

I am probably missing something here... There are platforms out there
with memory starting at 0 (IIRC ZynqMP is one example). So wouldn't this
potentially rule out the extended region on such platforms?

> +#define EXT_REGION_END     0x80003fffffffULL
> +
> +static int __init find_unallocated_memory(const struct kernel_info *kinfo,
> +                                          struct meminfo *ext_regions)
> +{
> +    const struct meminfo *assign_mem = &kinfo->mem;
> +    struct rangeset *unalloc_mem;
> +    paddr_t start, end;
> +    unsigned int i;
> +    int res;

We technically already know which ranges of memory are unused. This is
pretty much any region in the freelist of the page allocator. So how
about walking the freelist instead?

The advantage is that we don't need to worry about modifying the function
when adding a new memory type.

One disadvantage is that this will not cover *all* the unused memory the
way this code does. But I think this is an acceptable downside.
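
For illustration only, that could take a shape along these lines. Note
that for_each_unallocated_region() is a hypothetical helper here (the
page allocator would need to provide such an iterator over its free
lists), so this is just a sketch of the idea:

/* Callback invoked for every frame range still sitting in the heap */
static int __init add_unallocated_cb(unsigned long s_pfn,
                                     unsigned long e_pfn, void *data)
{
    struct rangeset *unalloc_mem = data;

    /* s_pfn/e_pfn form an inclusive frame range */
    return rangeset_add_range(unalloc_mem, pfn_to_paddr(s_pfn),
                              pfn_to_paddr(e_pfn + 1) - 1);
}

/* In find_unallocated_memory(), instead of "all RAM minus assigned RAM": */
res = for_each_unallocated_region(add_unallocated_cb, unalloc_mem);
if ( res )
    goto out;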

> +
> +    dt_dprintk("Find unallocated memory for extended regions\n");
> +
> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> +    if ( !unalloc_mem )
> +        return -ENOMEM;
> +
> +    /* Start with all available RAM */
> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> +    {
> +        start = bootinfo.mem.bank[i].start;
> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
> +        res = rangeset_add_range(unalloc_mem, start, end);
> +        if ( res )
> +        {
> +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> +                   start, end);
> +            goto out;
> +        }
> +    }
> +
> +    /* Remove RAM assigned to Dom0 */
> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> +    {
> +        start = assign_mem->bank[i].start;
> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
> +        res = rangeset_remove_range(unalloc_mem, start, end);
> +        if ( res )
> +        {
> +            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +                   start, end);
> +            goto out;
> +        }
> +    }
> +
> +    /* Remove reserved-memory regions */
> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> +    {
> +        start = bootinfo.reserved_mem.bank[i].start;
> +        end = bootinfo.reserved_mem.bank[i].start +
> +            bootinfo.reserved_mem.bank[i].size - 1;
> +        res = rangeset_remove_range(unalloc_mem, start, end);
> +        if ( res )
> +        {
> +            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +                   start, end);
> +            goto out;
> +        }
> +    }
> +
> +    /* Remove grant table region */
> +    start = kinfo->gnttab_start;
> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> +    res = rangeset_remove_range(unalloc_mem, start, end);
> +    if ( res )
> +    {
> +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +               start, end);
> +        goto out;
> +    }
> +
> +    start = EXT_REGION_START;
> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> +    res = rangeset_report_ranges(unalloc_mem, start, end,
> +                                 add_ext_regions, ext_regions);
> +    if ( res )
> +        ext_regions->nr_banks = 0;
> +    else if ( !ext_regions->nr_banks )
> +        res = -ENOENT;
> +
> +out:
> +    rangeset_destroy(unalloc_mem);
> +
> +    return res;
> +}
> +
> +static int __init find_memory_holes(const struct kernel_info *kinfo,
> +                                    struct meminfo *ext_regions)
> +{
> +    struct dt_device_node *np;
> +    struct rangeset *mem_holes;
> +    paddr_t start, end;
> +    unsigned int i;
> +    int res;
> +
> +    dt_dprintk("Find memory holes for extended regions\n");
> +
> +    mem_holes = rangeset_new(NULL, NULL, 0);
> +    if ( !mem_holes )
> +        return -ENOMEM;
> +
> +    /* Start with maximum possible addressable physical memory range */
> +    start = EXT_REGION_START;
> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> +    res = rangeset_add_range(mem_holes, start, end);
> +    if ( res )
> +    {
> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> +               start, end);
> +        goto out;
> +    }
> +
> +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */

Well... The loop below is not going to handle all the regions described 
in the property "reg". Instead, it will cover a subset of "reg" where 
the memory is addressable.

You will also need to cover "ranges" that will describe the BARs for the 
PCI devices.

> +    dt_for_each_device_node( dt_host, np )
> +    {
> +        unsigned int naddr;
> +        u64 addr, size;
> +
> +        naddr = dt_number_of_address(np);
> +
> +        for ( i = 0; i < naddr; i++ )
> +        {
> +            res = dt_device_get_address(np, i, &addr, &size);
> +            if ( res )
> +            {
> +                printk(XENLOG_ERR "Unable to retrieve address %u for %s\n",
> +                       i, dt_node_full_name(np));
> +                goto out;
> +            }
> +
> +            start = addr & PAGE_MASK;
> +            end = PAGE_ALIGN(addr + size) - 1;
> +            res = rangeset_remove_range(mem_holes, start, end);
> +            if ( res )
> +            {
> +                printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> +                       start, end);
> +                goto out;
> +            }
> +        }
> +    }
> +
> +    start = EXT_REGION_START;
> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> +    res = rangeset_report_ranges(mem_holes, start, end,
> +                                 add_ext_regions,  ext_regions);
> +    if ( res )
> +        ext_regions->nr_banks = 0;
> +    else if ( !ext_regions->nr_banks )
> +        res = -ENOENT;
> +
> +out:
> +    rangeset_destroy(mem_holes);
> +
> +    return res;
> +}
> +
>   static int __init make_hypervisor_node(struct domain *d,
>                                          const struct kernel_info *kinfo,
>                                          int addrcells, int sizecells)
> @@ -731,11 +921,13 @@ static int __init make_hypervisor_node(struct domain *d,
>       const char compat[] =
>           "xen,xen-"__stringify(XEN_VERSION)"."__stringify(XEN_SUBVERSION)"\0"
>           "xen,xen";
> -    __be32 reg[4];
> +    __be32 reg[(NR_MEM_BANKS + 1) * 4];

This is a fairly large allocation on the stack. Could we move to a 
dynamic allocation?
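
For illustration only, a rough sketch (not part of the patch) of what the
dynamic allocation could look like, assuming the extended regions are
computed before the "reg" property is built:

    __be32 *reg, *cells;
    unsigned int nr_regions;

    /* ext_regions has been filled in, so the exact size is known here */
    nr_regions = ext_regions->nr_banks + 1;   /* + region 0 (grant table) */
    reg = xzalloc_array(__be32, nr_regions * (addrcells + sizecells));
    if ( !reg )
        return -ENOMEM;

    cells = reg;
    /* ... fill in region 0 and the extended regions as in the patch ... */

    res = fdt_property(fdt, "reg", reg,
                       dt_cells_to_size(addrcells + sizecells) * nr_regions);
    xfree(reg);

Error handling is elided; the point is only that the exact length is known
once ext_regions has been populated, so nothing large needs to live on the
stack.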

>       gic_interrupt_t intr;
>       __be32 *cells;
>       int res;
>       void *fdt = kinfo->fdt;
> +    struct meminfo *ext_regions;
> +    unsigned int i;
>   
>       dt_dprintk("Create hypervisor node\n");
>   
> @@ -757,12 +949,42 @@ static int __init make_hypervisor_node(struct domain *d,
>       if ( res )
>           return res;
>   
> +    ext_regions = xzalloc(struct meminfo);
> +    if ( !ext_regions )
> +        return -ENOMEM;
> +
> +    if ( is_32bit_domain(d) )
> +        printk(XENLOG_WARNING "The extended region is only supported for 64-bit guest\n");
> +    else
> +    {
> +        if ( !is_iommu_enabled(d) )
> +            res = find_unallocated_memory(kinfo, ext_regions);
> +        else
> +            res = find_memory_holes(kinfo, ext_regions);
> +
> +        if ( res )
> +            printk(XENLOG_WARNING "Failed to allocate extended regions\n");
> +    }
> +
>       /* reg 0 is grant table space */
>       cells = &reg[0];
>       dt_child_set_range(&cells, addrcells, sizecells,
>                          kinfo->gnttab_start, kinfo->gnttab_size);
> +    /* reg 1...N are extended regions */
> +    for ( i = 0; i < ext_regions->nr_banks; i++ )
> +    {
> +        u64 start = ext_regions->bank[i].start;
> +        u64 size = ext_regions->bank[i].size;
> +
> +        dt_dprintk("Extended region %d: %#"PRIx64"->%#"PRIx64"\n",
> +                   i, start, start + size);
> +
> +        dt_child_set_range(&cells, addrcells, sizecells, start, size);
> +    }
> +    xfree(ext_regions);
> +
>       res = fdt_property(fdt, "reg", reg,
> -                       dt_cells_to_size(addrcells + sizecells));
> +                       dt_cells_to_size(addrcells + sizecells) * (i + 1));
>       if ( res )
>           return res;
>   
> 

Cheers,
Julien Grall Sept. 17, 2021, 3:52 p.m. UTC | #9
On 16/09/2021 00:10, Oleksandr wrote:
>>> + * The extended regions will be prevalidated by the memory hotplug path
>>> + * in Linux which requires for any added address range to be within 
>>> maximum
>>> + * possible addressable physical memory range for which the linear 
>>> mapping
>>> + * could be created.
>>> + * For 48-bit VA space size the maximum addressable range are:
>>> + * 0x40000000 - 0x80003fffffff
>> Please don't make Linux-specific comments in Xen code for interfaces
>> that are supposed to be OS-agnostic.
> 
> You are right. I just wanted to describe where these magic numbers come 
> from.
> Someone might question why, for example, "0 ... max_gpaddr" can't be 
> used. I will move
> that Linux-specific comments to the commit message to keep some 
> justification of these numbers.

Please keep some rationale in the code. It is a lot easier to understand 
the code without having to play the git blame game.

Cheers,
Oleksandr Tyshchenko Sept. 17, 2021, 7:51 p.m. UTC | #10
On 17.09.21 18:48, Julien Grall wrote:
> Hi Oleksandr,

Hi Julien


>
> On 10/09/2021 23:18, Oleksandr Tyshchenko wrote:
>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>
>> The extended region (safe range) is a region of guest physical
>> address space which is unused and could be safely used to create
>> grant/foreign mappings instead of wasting real RAM pages from
>> the domain memory for establishing these mappings.
>>
>> The extended regions are chosen at the domain creation time and
>> advertised to it via "reg" property under hypervisor node in
>> the guest device-tree. As region 0 is reserved for grant table
>> space (always present), the indexes for extended regions are 1...N.
>> If extended regions could not be allocated for some reason,
>> Xen doesn't fail and behaves as usual, so only inserts region 0.
>>
>> Please note the following limitations:
>> - The extended region feature is only supported for 64-bit domain.
>> - The ACPI case is not covered.
>
> I understand the ACPI is not covered because we would need to create a 
> new binding. But I am not sure to understand why 32-bit domain is not 
> supported. Can you explain it?

32-bit domains are not supported in order to simplify things from the 
beginning. It is a little bit difficult to get everything working from 
the start. As I understand from the discussion at [1], we can afford that 
simplification. However, I should have mentioned that 32-bit domains are 
not supported "for now".

>
>>
>> ***
>>
>> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
>> the algorithm to choose extended regions for it is different
>> in comparison with the algorithm for non-direct mapped DomU.
>> What is more, that extended regions should be chosen differently
>> whether IOMMU is enabled or not.
>>
>> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
>> holes found in host device-tree if otherwise. 
>
> For the case when the IOMMU is disabled, this will only work if dom0 
> cannot allocate memory outside of the original range. This is 
> currently the case... but I think this should be spelled out in at 
> least the commit message.

Agree, will update commit description.


>
>
>> Make sure that
>> extended regions are 2MB-aligned and located within maximum possible
>> addressable physical memory range. The maximum number of extended
>> regions is 128.
>
> Please explain how this limit was chosen.
Well, I decided not to introduce a new data structure, etc. to represent 
extended regions, but to reuse the existing struct meminfo
used for memory/reserved-memory which, as I thought, fitted perfectly. 
So that limit comes from NR_MEM_BANKS, which is 128.


>
>>
>> Suggested-by: Julien Grall <jgrall@amazon.com>
>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>> ---
>> Changes since RFC:
>>     - update patch description
>>     - drop uneeded "extended-region" DT property
>> ---
>>
>>   xen/arch/arm/domain_build.c | 226 
>> +++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 224 insertions(+), 2 deletions(-)
>>
>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>> index 206038d..070ec27 100644
>> --- a/xen/arch/arm/domain_build.c
>> +++ b/xen/arch/arm/domain_build.c
>> @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct 
>> domain *d,
>>       return res;
>>   }
>>   +static int __init add_ext_regions(unsigned long s, unsigned long 
>> e, void *data)
>> +{
>> +    struct meminfo *ext_regions = data;
>> +    paddr_t start, size;
>> +
>> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
>> +        return 0;
>> +
>> +    /* Both start and size of the extended region should be 2MB 
>> aligned */
>> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
>> +    if ( start > e )
>> +        return 0;
>> +
>> +    size = (e - start + 1) & ~(SZ_2M - 1);
>> +    if ( !size )
>> +        return 0;
>> +
>> +    ext_regions->bank[ext_regions->nr_banks].start = start;
>> +    ext_regions->bank[ext_regions->nr_banks].size = size;
>> +    ext_regions->nr_banks ++;
>> +
>> +    return 0;
>> +}
>> +
>> +/*
>> + * The extended regions will be prevalidated by the memory hotplug path
>> + * in Linux which requires for any added address range to be within 
>> maximum
>> + * possible addressable physical memory range for which the linear 
>> mapping
>> + * could be created.
>> + * For 48-bit VA space size the maximum addressable range are:
>
> When I read "maximum", I understand an upper limit. But below, you are 
> providing a range. So should you drop "maximum"?

yes, it is a little bit confusing.


>
>
> Also, this is tailored to Linux using 48-bit VA. How about other limits?
These limits are calculated at [2]. Sorry, I haven't investigated yet 
what the values would be for the other CONFIG_ARM64_VA_BITS_XXX options. 
It also looks like some configs depend on 16K/64K pages...
I will try to investigate and provide the limits later on.
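
For reference, if the 48-bit figure indeed comes from the linear mapping
covering half of a 48-bit VA space (i.e. 2^47 bytes), then the end address
is simply the start plus that size, so the magic constant could be derived
rather than hard-coded (my assumption, not something the current patch does):

    #define EXT_REGION_START   0x40000000ULL
    /* 0x40000000 + (1ULL << 47) - 1 == 0x80003fffffffULL */
    #define EXT_REGION_END     (EXT_REGION_START + (1ULL << 47) - 1)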


>
>
>> + * 0x40000000 - 0x80003fffffff
>> + */
>> +#define EXT_REGION_START   0x40000000ULL
>
> I am probably missing something here.... There are platform out there 
> with memory starting at 0 (IIRC ZynqMP is one example). So wouldn't 
> this potentially rule out the extended region on such platform?

 From my understanding, the extended region cannot be in the 
0...0x40000000 range. If these platforms have memory above the first GB, 
I believe extended region(s) can still be allocated for them.


>
>
>> +#define EXT_REGION_END 0x80003fffffffULL
>> +
>> +static int __init find_unallocated_memory(const struct kernel_info 
>> *kinfo,
>> +                                          struct meminfo *ext_regions)
>> +{
>> +    const struct meminfo *assign_mem = &kinfo->mem;
>> +    struct rangeset *unalloc_mem;
>> +    paddr_t start, end;
>> +    unsigned int i;
>> +    int res;
>
> We technically already know which range of memory is unused. This is 
> pretty much any region in the freelist of the page allocator. So how 
> about walking the freelist instead?

OK, I will investigate the page allocator code (right now I have no 
understanding of how to do that). BTW, I have just grepped for "freelist" 
in the code, and all the page-related occurrences are in x86 code only.

>
> The advantage is we don't need to worry about modifying the function 
> when adding new memory type.
>
> One disavantage is this will not cover *all* the unused memory as this 
> is doing. But I think this is an acceptable downside.
>
>> +
>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>> +
>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>> +    if ( !unalloc_mem )
>> +        return -ENOMEM;
>> +
>> +    /* Start with all available RAM */
>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>> +    {
>> +        start = bootinfo.mem.bank[i].start;
>> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size 
>> - 1;
>> +        res = rangeset_add_range(unalloc_mem, start, end);
>> +        if ( res )
>> +        {
>> +            printk(XENLOG_ERR "Failed to add: 
>> %#"PRIx64"->%#"PRIx64"\n",
>> +                   start, end);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* Remove RAM assigned to Dom0 */
>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>> +    {
>> +        start = assign_mem->bank[i].start;
>> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>> +        if ( res )
>> +        {
>> +            printk(XENLOG_ERR "Failed to remove: 
>> %#"PRIx64"->%#"PRIx64"\n",
>> +                   start, end);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* Remove reserved-memory regions */
>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>> +    {
>> +        start = bootinfo.reserved_mem.bank[i].start;
>> +        end = bootinfo.reserved_mem.bank[i].start +
>> +            bootinfo.reserved_mem.bank[i].size - 1;
>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>> +        if ( res )
>> +        {
>> +            printk(XENLOG_ERR "Failed to remove: 
>> %#"PRIx64"->%#"PRIx64"\n",
>> +                   start, end);
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /* Remove grant table region */
>> +    start = kinfo->gnttab_start;
>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>> +    if ( res )
>> +    {
>> +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>> +               start, end);
>> +        goto out;
>> +    }
>> +
>> +    start = EXT_REGION_START;
>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>> +                                 add_ext_regions, ext_regions);
>> +    if ( res )
>> +        ext_regions->nr_banks = 0;
>> +    else if ( !ext_regions->nr_banks )
>> +        res = -ENOENT;
>> +
>> +out:
>> +    rangeset_destroy(unalloc_mem);
>> +
>> +    return res;
>> +}
>> +
>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>> +                                    struct meminfo *ext_regions)
>> +{
>> +    struct dt_device_node *np;
>> +    struct rangeset *mem_holes;
>> +    paddr_t start, end;
>> +    unsigned int i;
>> +    int res;
>> +
>> +    dt_dprintk("Find memory holes for extended regions\n");
>> +
>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>> +    if ( !mem_holes )
>> +        return -ENOMEM;
>> +
>> +    /* Start with maximum possible addressable physical memory range */
>> +    start = EXT_REGION_START;
>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>> +    res = rangeset_add_range(mem_holes, start, end);
>> +    if ( res )
>> +    {
>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>> +               start, end);
>> +        goto out;
>> +    }
>> +
>> +    /* Remove all regions described by "reg" property (MMIO, RAM, 
>> etc) */
>
> Well... The loop below is not going to handle all the regions 
> described in the property "reg". Instead, it will cover a subset of 
> "reg" where the memory is addressable.

As I understand it, we are only interested in the subset of "reg" where 
the memory is addressable.


>
>
> You will also need to cover "ranges" that will describe the BARs for 
> the PCI devices.
Good point. Could you please clarify how to recognize whether it is a 
PCI device as long as PCI support is not merged? Or should we just find 
any device nodes with a non-empty "ranges" property and retrieve the 
addresses?


>
>
>> +    dt_for_each_device_node( dt_host, np )
>> +    {
>> +        unsigned int naddr;
>> +        u64 addr, size;
>> +
>> +        naddr = dt_number_of_address(np);
>> +
>> +        for ( i = 0; i < naddr; i++ )
>> +        {
>> +            res = dt_device_get_address(np, i, &addr, &size);
>> +            if ( res )
>> +            {
>> +                printk(XENLOG_ERR "Unable to retrieve address %u for 
>> %s\n",
>> +                       i, dt_node_full_name(np));
>> +                goto out;
>> +            }
>> +
>> +            start = addr & PAGE_MASK;
>> +            end = PAGE_ALIGN(addr + size) - 1;
>> +            res = rangeset_remove_range(mem_holes, start, end);
>> +            if ( res )
>> +            {
>> +                printk(XENLOG_ERR "Failed to remove: 
>> %#"PRIx64"->%#"PRIx64"\n",
>> +                       start, end);
>> +                goto out;
>> +            }
>> +        }
>> +    }
>> +
>> +    start = EXT_REGION_START;
>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>> +    res = rangeset_report_ranges(mem_holes, start, end,
>> +                                 add_ext_regions, ext_regions);
>> +    if ( res )
>> +        ext_regions->nr_banks = 0;
>> +    else if ( !ext_regions->nr_banks )
>> +        res = -ENOENT;
>> +
>> +out:
>> +    rangeset_destroy(mem_holes);
>> +
>> +    return res;
>> +}
>> +
>>   static int __init make_hypervisor_node(struct domain *d,
>>                                          const struct kernel_info 
>> *kinfo,
>>                                          int addrcells, int sizecells)
>> @@ -731,11 +921,13 @@ static int __init make_hypervisor_node(struct 
>> domain *d,
>>       const char compat[] =
>> "xen,xen-"__stringify(XEN_VERSION)"."__stringify(XEN_SUBVERSION)"\0"
>>           "xen,xen";
>> -    __be32 reg[4];
>> +    __be32 reg[(NR_MEM_BANKS + 1) * 4];
>
> This is a fairly large allocation on the stack. Could we move to a 
> dynamic allocation?

Of course, will do.


>
>
>>       gic_interrupt_t intr;
>>       __be32 *cells;
>>       int res;
>>       void *fdt = kinfo->fdt;
>> +    struct meminfo *ext_regions;
>> +    unsigned int i;
>>         dt_dprintk("Create hypervisor node\n");
>>   @@ -757,12 +949,42 @@ static int __init make_hypervisor_node(struct 
>> domain *d,
>>       if ( res )
>>           return res;
>>   +    ext_regions = xzalloc(struct meminfo);
>> +    if ( !ext_regions )
>> +        return -ENOMEM;
>> +
>> +    if ( is_32bit_domain(d) )
>> +        printk(XENLOG_WARNING "The extended region is only supported 
>> for 64-bit guest\n");
>> +    else
>> +    {
>> +        if ( !is_iommu_enabled(d) )
>> +            res = find_unallocated_memory(kinfo, ext_regions);
>> +        else
>> +            res = find_memory_holes(kinfo, ext_regions);
>> +
>> +        if ( res )
>> +            printk(XENLOG_WARNING "Failed to allocate extended 
>> regions\n");
>> +    }
>> +
>>       /* reg 0 is grant table space */
>>       cells = &reg[0];
>>       dt_child_set_range(&cells, addrcells, sizecells,
>>                          kinfo->gnttab_start, kinfo->gnttab_size);
>> +    /* reg 1...N are extended regions */
>> +    for ( i = 0; i < ext_regions->nr_banks; i++ )
>> +    {
>> +        u64 start = ext_regions->bank[i].start;
>> +        u64 size = ext_regions->bank[i].size;
>> +
>> +        dt_dprintk("Extended region %d: %#"PRIx64"->%#"PRIx64"\n",
>> +                   i, start, start + size);
>> +
>> +        dt_child_set_range(&cells, addrcells, sizecells, start, size);
>> +    }
>> +    xfree(ext_regions);
>> +
>>       res = fdt_property(fdt, "reg", reg,
>> -                       dt_cells_to_size(addrcells + sizecells));
>> +                       dt_cells_to_size(addrcells + sizecells) * (i 
>> + 1));
>>       if ( res )
>>           return res;
>>
>
> Cheers,

[1] 
https://lore.kernel.org/xen-devel/cb1c8fd4-a4c5-c18e-c8db-f8e317d95526@xen.org/

[2] 
https://elixir.bootlin.com/linux/v5.15-rc1/source/arch/arm64/mm/mmu.c#L1448


Thank you.
Oleksandr Tyshchenko Sept. 17, 2021, 8:13 p.m. UTC | #11
On 17.09.21 18:52, Julien Grall wrote:

Hi Julien

>
>
> On 16/09/2021 00:10, Oleksandr wrote:
>>>> + * The extended regions will be prevalidated by the memory hotplug 
>>>> path
>>>> + * in Linux which requires for any added address range to be 
>>>> within maximum
>>>> + * possible addressable physical memory range for which the linear 
>>>> mapping
>>>> + * could be created.
>>>> + * For 48-bit VA space size the maximum addressable range are:
>>>> + * 0x40000000 - 0x80003fffffff
>>> Please don't make Linux-specific comments in Xen code for interfaces
>>> that are supposed to be OS-agnostic.
>>
>> You are right. I just wanted to describe where these magic numbers 
>> come from.
>> Someone might question why, for example, "0 ... max_gpaddr" can't be 
>> used. I will move
>> that Linux-specific comments to the commit message to keep some 
>> justification of these numbers.
>
> Please keep some rationale in the code. This is a lot easier to 
> understand the code without having to play the git blame game.

OK, to be honest, I failed to find a way to express OS-dependent 
constraints in an OS-agnostic way.


>
>
> Cheers,
>
Stefano Stabellini Sept. 17, 2021, 9:56 p.m. UTC | #12
On Fri, 17 Sep 2021, Oleksandr wrote:
> > > +
> > > +    dt_dprintk("Find unallocated memory for extended regions\n");
> > > +
> > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > +    if ( !unalloc_mem )
> > > +        return -ENOMEM;
> > > +
> > > +    /* Start with all available RAM */
> > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > +    {
> > > +        start = bootinfo.mem.bank[i].start;
> > > +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
> > > +        res = rangeset_add_range(unalloc_mem, start, end);
> > > +        if ( res )
> > > +        {
> > > +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > +                   start, end);
> > > +            goto out;
> > > +        }
> > > +    }
> > > +
> > > +    /* Remove RAM assigned to Dom0 */
> > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > +    {
> > > +        start = assign_mem->bank[i].start;
> > > +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
> > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > +        if ( res )
> > > +        {
> > > +            printk(XENLOG_ERR "Failed to remove:
> > > %#"PRIx64"->%#"PRIx64"\n",
> > > +                   start, end);
> > > +            goto out;
> > > +        }
> > > +    }
> > > +
> > > +    /* Remove reserved-memory regions */
> > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> > > +    {
> > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > +        if ( res )
> > > +        {
> > > +            printk(XENLOG_ERR "Failed to remove:
> > > %#"PRIx64"->%#"PRIx64"\n",
> > > +                   start, end);
> > > +            goto out;
> > > +        }
> > > +    }
> > > +
> > > +    /* Remove grant table region */
> > > +    start = kinfo->gnttab_start;
> > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > +    if ( res )
> > > +    {
> > > +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> > > +               start, end);
> > > +        goto out;
> > > +    }
> > > +
> > > +    start = EXT_REGION_START;
> > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > +                                 add_ext_regions, ext_regions);
> > > +    if ( res )
> > > +        ext_regions->nr_banks = 0;
> > > +    else if ( !ext_regions->nr_banks )
> > > +        res = -ENOENT;
> > > +
> > > +out:
> > > +    rangeset_destroy(unalloc_mem);
> > > +
> > > +    return res;
> > > +}
> > > +
> > > +static int __init find_memory_holes(const struct kernel_info *kinfo,
> > > +                                    struct meminfo *ext_regions)
> > > +{
> > > +    struct dt_device_node *np;
> > > +    struct rangeset *mem_holes;
> > > +    paddr_t start, end;
> > > +    unsigned int i;
> > > +    int res;
> > > +
> > > +    dt_dprintk("Find memory holes for extended regions\n");
> > > +
> > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > +    if ( !mem_holes )
> > > +        return -ENOMEM;
> > > +
> > > +    /* Start with maximum possible addressable physical memory range */
> > > +    start = EXT_REGION_START;
> > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > +    res = rangeset_add_range(mem_holes, start, end);
> > > +    if ( res )
> > > +    {
> > > +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > +               start, end);
> > > +        goto out;
> > > +    }
> > > +
> > > +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
> > 
> > Well... The loop below is not going to handle all the regions described in
> > the property "reg". Instead, it will cover a subset of "reg" where the
> > memory is addressable.
> 
> As I understand, we are only interested in subset of "reg" where the memory is
> addressable.
> 
> 
> > 
> > 
> > You will also need to cover "ranges" that will describe the BARs for the PCI
> > devices.
> Good point.

Yes, very good point!


> Could you please clarify how to recognize whether it is a PCI
> device as long as PCI support is not merged? Or just to find any device nodes
> with non-empty "ranges" property
> and retrieve addresses?

Normally any bus can have a ranges property with the aperture and
possible address translations, including /amba (compatible =
"simple-bus"). However, in these cases dt_device_get_address already
takes care of it, see xen/common/device_tree.c:dt_device_get_address.

The PCI bus is special for 2 reasons:
- the ranges property has a different format
- the bus is hot-pluggable

So I think the only one that we need to treat specially is PCI.

As far as I am aware PCI is the only bus (or maybe just the only bus
that we support?) where ranges means the aperture.
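
In case it helps, an untested sketch of how find_memory_holes() could
additionally punch out the PCI apertures, assuming the standard 3 child
address cells / 2 size cells layout of the PCI "ranges" property, and that
helpers like dt_device_type_is_equal(), dt_get_property() and
dt_read_number() are suitable here (np, res and mem_holes being the locals
already present in the function):

    /* Remove the apertures ("ranges") of PCI host bridges */
    dt_for_each_device_node( dt_host, np )
    {
        const __be32 *cells;
        u32 len;
        unsigned int pna, nr_entries, j;

        if ( !dt_device_type_is_equal(np, "pci") )
            continue;

        cells = dt_get_property(np, "ranges", &len);
        if ( !cells )
            continue;

        /* CPU address cells as seen by the bridge's parent */
        pna = dt_n_addr_cells(np);
        /* Each entry: 3 PCI address cells + parent address + 2 size cells */
        nr_entries = (len / sizeof(__be32)) / (3 + pna + 2);

        for ( j = 0; j < nr_entries; j++, cells += 3 + pna + 2 )
        {
            paddr_t cpu_addr = dt_read_number(cells + 3, pna);
            paddr_t size = dt_read_number(cells + 3 + pna, 2);

            res = rangeset_remove_range(mem_holes, cpu_addr & PAGE_MASK,
                                        PAGE_ALIGN(cpu_addr + size) - 1);
            if ( res )
                goto out;
        }
    }

Hot-plugged devices behind the bridge then stay within an aperture that
has already been removed from the candidate set.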
Stefano Stabellini Sept. 17, 2021, 10:37 p.m. UTC | #13
On Fri, 17 Sep 2021, Stefano Stabellini wrote:
> On Fri, 17 Sep 2021, Oleksandr wrote:
> > > > +
> > > > +    dt_dprintk("Find unallocated memory for extended regions\n");
> > > > +
> > > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > > +    if ( !unalloc_mem )
> > > > +        return -ENOMEM;
> > > > +
> > > > +    /* Start with all available RAM */
> > > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > > +    {
> > > > +        start = bootinfo.mem.bank[i].start;
> > > > +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
> > > > +        res = rangeset_add_range(unalloc_mem, start, end);
> > > > +        if ( res )
> > > > +        {
> > > > +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > > +                   start, end);
> > > > +            goto out;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    /* Remove RAM assigned to Dom0 */
> > > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > > +    {
> > > > +        start = assign_mem->bank[i].start;
> > > > +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
> > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > +        if ( res )
> > > > +        {
> > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > +                   start, end);
> > > > +            goto out;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    /* Remove reserved-memory regions */
> > > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> > > > +    {
> > > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > +        if ( res )
> > > > +        {
> > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > +                   start, end);
> > > > +            goto out;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    /* Remove grant table region */
> > > > +    start = kinfo->gnttab_start;
> > > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > > +    if ( res )
> > > > +    {
> > > > +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> > > > +               start, end);
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    start = EXT_REGION_START;
> > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > > +                                 add_ext_regions, ext_regions);
> > > > +    if ( res )
> > > > +        ext_regions->nr_banks = 0;
> > > > +    else if ( !ext_regions->nr_banks )
> > > > +        res = -ENOENT;
> > > > +
> > > > +out:
> > > > +    rangeset_destroy(unalloc_mem);
> > > > +
> > > > +    return res;
> > > > +}
> > > > +
> > > > +static int __init find_memory_holes(const struct kernel_info *kinfo,
> > > > +                                    struct meminfo *ext_regions)
> > > > +{
> > > > +    struct dt_device_node *np;
> > > > +    struct rangeset *mem_holes;
> > > > +    paddr_t start, end;
> > > > +    unsigned int i;
> > > > +    int res;
> > > > +
> > > > +    dt_dprintk("Find memory holes for extended regions\n");
> > > > +
> > > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > > +    if ( !mem_holes )
> > > > +        return -ENOMEM;
> > > > +
> > > > +    /* Start with maximum possible addressable physical memory range */
> > > > +    start = EXT_REGION_START;
> > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > +    res = rangeset_add_range(mem_holes, start, end);
> > > > +    if ( res )
> > > > +    {
> > > > +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > > +               start, end);
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
> > > 
> > > Well... The loop below is not going to handle all the regions described in
> > > the property "reg". Instead, it will cover a subset of "reg" where the
> > > memory is addressable.
> > 
> > As I understand, we are only interested in subset of "reg" where the memory is
> > addressable.
> > 
> > 
> > > 
> > > 
> > > You will also need to cover "ranges" that will describe the BARs for the PCI
> > > devices.
> > Good point.
> 
> Yes, very good point!
> 
> 
> > Could you please clarify how to recognize whether it is a PCI
> > device as long as PCI support is not merged? Or just to find any device nodes
> > with non-empty "ranges" property
> > and retrieve addresses?
> 
> Normally any bus can have a ranges property with the aperture and
> possible address translations, including /amba (compatible =
> "simple-bus"). However, in these cases dt_device_get_address already
> takes care of it, see xen/common/device_tree.c:dt_device_get_address.
> 
> The PCI bus is special for 2 reasons:
> - the ranges property has a different format
> - the bus is hot-pluggable
> 
> So I think the only one that we need to treat specially is PCI.
> 
> As far as I am aware PCI is the only bus (or maybe just the only bus
> that we support?) where ranges means the aperture.

Now that I think about this, there is another "hotpluggable" scenario we
need to think about:

[1] https://marc.info/?l=xen-devel&m=163056546214978

Xilinx devices have FPGA regions with apertures currently not described
in the device tree, where things can be programmed in the PL at runtime,
making new devices appear with new MMIO regions out of thin air.

Now let me start by saying that yes, the entire programmable region
aperture could probably be described in the device tree; however, in
reality it is not currently done in any of the device trees we use
(including the upstream device trees in linux.git).

So, we have a problem :-(


I can work toward getting the right info into the device tree, but in
reality that is going to take time, and for now the device tree doesn't
have the FPGA aperture in it. So if we accept this series as-is, it is
going to stop features like [1] from working.

If we cannot come up with any better plan, I think it would be better
to drop find_memory_holes and only rely on find_unallocated_memory even
when the IOMMU is on. One idea is that we could add, on top of the
regions found by find_unallocated_memory, any MMIO regions marked as
xen,passthrough: they are safe because they are not going to dom0 anyway.
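
Untested, but something along these lines inside find_unallocated_memory(),
assuming dt_device_for_passthrough() is the right way to spot those nodes:

    struct dt_device_node *np;  /* would need declaring in the function */

    /* Add MMIO regions marked as "xen,passthrough": they never go to dom0 */
    dt_for_each_device_node( dt_host, np )
    {
        unsigned int naddr, j;
        u64 addr, size;

        if ( !dt_device_for_passthrough(np) )
            continue;

        naddr = dt_number_of_address(np);
        for ( j = 0; j < naddr; j++ )
        {
            paddr_t s, e;

            if ( dt_device_get_address(np, j, &addr, &size) )
                continue;

            /* Shrink to whole pages so nothing outside the region is added */
            s = PAGE_ALIGN(addr);
            e = (addr + size) & PAGE_MASK;
            if ( s >= e )
                continue;

            res = rangeset_add_range(unalloc_mem, s, e - 1);
            if ( res )
                goto out;
        }
    }

add_ext_regions() would then pick these up together with the unallocated
RAM when the rangeset is reported.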

The only alternative I can think of is to have a per-board
enable/disable toggle for the extended region, but it would be very ugly.
Oleksandr Tyshchenko Sept. 18, 2021, 4:59 p.m. UTC | #14
Hi Julien.


[snip]


>>
>>
>>> +#define EXT_REGION_END 0x80003fffffffULL
>>> +
>>> +static int __init find_unallocated_memory(const struct kernel_info 
>>> *kinfo,
>>> +                                          struct meminfo *ext_regions)
>>> +{
>>> +    const struct meminfo *assign_mem = &kinfo->mem;
>>> +    struct rangeset *unalloc_mem;
>>> +    paddr_t start, end;
>>> +    unsigned int i;
>>> +    int res;
>>
>> We technically already know which range of memory is unused. This is 
>> pretty much any region in the freelist of the page allocator. So how 
>> about walking the freelist instead?
>
> ok, I will investigate the page allocator code (right now I have no 
> understanding of how to do that). BTW, I have just grepped "freelist" 
> through the code and all page context related appearances are in x86 
> code only.
>
>>
>> The advantage is we don't need to worry about modifying the function 
>> when adding new memory type.
>>
>> One disavantage is this will not cover *all* the unused memory as 
>> this is doing. But I think this is an acceptable downside.

I did some investigation and created a test patch. Although I am not 
100% sure this is exactly what you meant, I will provide the results anyway.

1. Below are the extended regions (unallocated memory, regions >= 64MB) 
calculated by my initial method (bootinfo.mem - kinfo->mem - 
bootinfo.reserved_mem - kinfo->gnttab):

(XEN) Extended region 0: 0x48000000->0x54000000
(XEN) Extended region 1: 0x57000000->0x60000000
(XEN) Extended region 2: 0x70000000->0x78000000
(XEN) Extended region 3: 0x78200000->0xc0000000
(XEN) Extended region 4: 0x500000000->0x580000000
(XEN) Extended region 5: 0x600000000->0x680000000
(XEN) Extended region 6: 0x700000000->0x780000000

2. Below are the extended regions (unallocated memory, regions >= 64MB) 
calculated by the new method (free memory in the page allocator):

(XEN) Extended region 0: 0x48000000->0x54000000
(XEN) Extended region 1: 0x58000000->0x60000000
(XEN) Extended region 2: 0x70000000->0x78000000
(XEN) Extended region 3: 0x78200000->0x84000000
(XEN) Extended region 4: 0x86000000->0x8a000000
(XEN) Extended region 5: 0x8c200000->0xc0000000
(XEN) Extended region 6: 0x500000000->0x580000000
(XEN) Extended region 7: 0x600000000->0x680000000
(XEN) Extended region 8: 0x700000000->0x765e00000

Some thoughts regarding that.

1. A few ranges below 4GB are absent from the resulting extended regions. 
I assume this is because of the modules:

(XEN) Checking for initrd in /chosen
(XEN) Initrd 0000000084000040-0000000085effc48
(XEN) RAM: 0000000048000000 - 00000000bfffffff
(XEN) RAM: 0000000500000000 - 000000057fffffff
(XEN) RAM: 0000000600000000 - 000000067fffffff
(XEN) RAM: 0000000700000000 - 000000077fffffff
(XEN)
(XEN) MODULE[0]: 0000000078080000 - 00000000781d74c8 Xen
(XEN) MODULE[1]: 0000000057fe7000 - 0000000057ffd080 Device Tree
(XEN) MODULE[2]: 0000000084000040 - 0000000085effc48 Ramdisk
(XEN) MODULE[3]: 000000008a000000 - 000000008c000000 Kernel
(XEN) MODULE[4]: 000000008c000000 - 000000008c010000 XSM
(XEN)  RESVD[0]: 0000000084000040 - 0000000085effc48
(XEN)  RESVD[1]: 0000000054000000 - 0000000056ffffff

2. Also, it is worth mentioning that a relatively large chunk (~417MB) of 
memory above 4GB is absent (to be precise, at the end of the last RAM 
bank), which I assume is used for Xen internals.
We could really use it for extended regions.
Below are the free regions in the heap (for the last RAM bank), just in case:

(XEN) heap[node=0][zone=23][order=5] 0x00000765ec0000-0x00000765ee0000
(XEN) heap[node=0][zone=23][order=6] 0x00000765e80000-0x00000765ec0000
(XEN) heap[node=0][zone=23][order=7] 0x00000765e00000-0x00000765e80000
(XEN) heap[node=0][zone=23][order=9] 0x00000765c00000-0x00000765e00000
(XEN) heap[node=0][zone=23][order=10] 0x00000765800000-0x00000765c00000
(XEN) heap[node=0][zone=23][order=11] 0x00000765000000-0x00000765800000
(XEN) heap[node=0][zone=23][order=12] 0x00000764000000-0x00000765000000
(XEN) heap[node=0][zone=23][order=14] 0x00000760000000-0x00000764000000
(XEN) heap[node=0][zone=23][order=17] 0x00000740000000-0x00000760000000
(XEN) heap[node=0][zone=23][order=18] 0x00000540000000-0x00000580000000
(XEN) heap[node=0][zone=23][order=18] 0x00000500000000-0x00000540000000
(XEN) heap[node=0][zone=23][order=18] 0x00000640000000-0x00000680000000
(XEN) heap[node=0][zone=23][order=18] 0x00000600000000-0x00000640000000
(XEN) heap[node=0][zone=23][order=18] 0x00000700000000-0x00000740000000

Yes, you already pointed out this disadvantage, so if it is an 
acceptable downside, I am absolutely OK with it.


3. Common code updates. There is a question of how to properly make a 
connection between the common allocator internals and Arm's code for 
creating the DT. I didn't come up with anything better
than creating for_each_avail_page() to invoke a callback with each page 
and its order.

**********

Below are the proposed changes on top of the initial patch. Would this be 
acceptable in general?

diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index 523eb19..1e58fc5 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -753,16 +753,33 @@ static int __init add_ext_regions(unsigned long s, unsigned long e, void *data)
      return 0;
  }

+static int __init add_unalloc_mem(struct page_info *page, unsigned int order,
+                                  void *data)
+{
+    struct rangeset *unalloc_mem = data;
+    paddr_t start, end;
+    int res;
+
+    start = page_to_maddr(page);
+    end = start + pfn_to_paddr(1UL << order);
+    res = rangeset_add_range(unalloc_mem, start, end - 1);
+    if ( res )
+    {
+        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
+               start, end);
+        return res;
+    }
+
+    return 0;
+}
+
  #define EXT_REGION_START   0x40000000ULL
  #define EXT_REGION_END     0x80003fffffffULL

-static int __init find_unallocated_memory(const struct kernel_info *kinfo,
-                                          struct meminfo *ext_regions)
+static int __init find_unallocated_memory(struct meminfo *ext_regions)
  {
-    const struct meminfo *assign_mem = &kinfo->mem;
      struct rangeset *unalloc_mem;
      paddr_t start, end;
-    unsigned int i;
      int res;

      dt_dprintk("Find unallocated memory for extended regions\n");
@@ -771,59 +788,9 @@ static int __init find_unallocated_memory(const struct kernel_info *kinfo,
      if ( !unalloc_mem )
          return -ENOMEM;

-    /* Start with all available RAM */
-    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
-    {
-        start = bootinfo.mem.bank[i].start;
-        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size;
-        res = rangeset_add_range(unalloc_mem, start, end - 1);
-        if ( res )
-        {
-            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
-                   start, end);
-            goto out;
-        }
-    }
-
-    /* Remove RAM assigned to Dom0 */
-    for ( i = 0; i < assign_mem->nr_banks; i++ )
-    {
-        start = assign_mem->bank[i].start;
-        end = assign_mem->bank[i].start + assign_mem->bank[i].size;
-        res = rangeset_remove_range(unalloc_mem, start, end - 1);
-        if ( res )
-        {
-            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
-                   start, end);
-            goto out;
-        }
-    }
-
-    /* Remove reserved-memory regions */
-    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
-    {
-        start = bootinfo.reserved_mem.bank[i].start;
-        end = bootinfo.reserved_mem.bank[i].start +
-            bootinfo.reserved_mem.bank[i].size;
-        res = rangeset_remove_range(unalloc_mem, start, end - 1);
-        if ( res )
-        {
-            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
-                   start, end);
-            goto out;
-        }
-    }
-
-    /* Remove grant table region */
-    start = kinfo->gnttab_start;
-    end = kinfo->gnttab_start + kinfo->gnttab_size;
-    res = rangeset_remove_range(unalloc_mem, start, end - 1);
+    res = for_each_avail_page(add_unalloc_mem, unalloc_mem);
      if ( res )
-    {
-        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
-               start, end);
          goto out;
-    }

      start = EXT_REGION_START;
      end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
@@ -840,8 +807,7 @@ out:
      return res;
  }

-static int __init find_memory_holes(const struct kernel_info *kinfo,
-                                    struct meminfo *ext_regions)
+static int __init find_memory_holes(struct meminfo *ext_regions)
  {
      struct dt_device_node *np;
      struct rangeset *mem_holes;
@@ -961,9 +927,9 @@ static int __init make_hypervisor_node(struct domain *d,
      else
      {
          if ( !is_iommu_enabled(d) )
-            res = find_unallocated_memory(kinfo, ext_regions);
+            res = find_unallocated_memory(ext_regions);
          else
-            res = find_memory_holes(kinfo, ext_regions);
+            res = find_memory_holes(ext_regions);

          if ( res )
              printk(XENLOG_WARNING "Failed to allocate extended regions\n");
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 8fad139..7cd1020 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1572,6 +1572,40 @@ static int reserve_heap_page(struct page_info *pg)

  }

+/* TODO heap_lock? */
+int for_each_avail_page(int (*cb)(struct page_info *, unsigned int, void *),
+                        void *data)
+{
+    unsigned int node, zone, order;
+    int ret;
+
+    for ( node = 0; node < MAX_NUMNODES; node++ )
+    {
+        if ( !avail[node] )
+            continue;
+
+        for ( zone = 0; zone < NR_ZONES; zone++ )
+        {
+            for ( order = 0; order <= MAX_ORDER; order++ )
+            {
+                struct page_info *head, *tmp;
+
+                if ( page_list_empty(&heap(node, zone, order)) )
+                    continue;
+
+                page_list_for_each_safe ( head, tmp, &heap(node, zone, order) )
+                {
+                    ret = cb(head, order, data);
+                    if ( ret )
+                        return ret;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
  int offline_page(mfn_t mfn, int broken, uint32_t *status)
  {
      unsigned long old_info = 0;
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 667f9da..64dd3e2 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -123,6 +123,9 @@ unsigned int online_page(mfn_t mfn, uint32_t *status);
  int offline_page(mfn_t mfn, int broken, uint32_t *status);
  int query_page_offline(mfn_t mfn, uint32_t *status);

+int for_each_avail_page(int (*cb)(struct page_info *, unsigned int, void *),
+                        void *data);
+
  void heap_init_late(void);

  int assign_pages(


[snip]
Julien Grall Sept. 19, 2021, 2 p.m. UTC | #15
Hi,

On 18/09/2021 00:51, Oleksandr wrote:
> On 17.09.21 18:48, Julien Grall wrote:
>> On 10/09/2021 23:18, Oleksandr Tyshchenko wrote:
>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>
>>> The extended region (safe range) is a region of guest physical
>>> address space which is unused and could be safely used to create
>>> grant/foreign mappings instead of wasting real RAM pages from
>>> the domain memory for establishing these mappings.
>>>
>>> The extended regions are chosen at the domain creation time and
>>> advertised to it via "reg" property under hypervisor node in
>>> the guest device-tree. As region 0 is reserved for grant table
>>> space (always present), the indexes for extended regions are 1...N.
>>> If extended regions could not be allocated for some reason,
>>> Xen doesn't fail and behaves as usual, so only inserts region 0.
>>>
>>> Please note the following limitations:
>>> - The extended region feature is only supported for 64-bit domain.
>>> - The ACPI case is not covered.
>>
>> I understand the ACPI is not covered because we would need to create a 
>> new binding. But I am not sure to understand why 32-bit domain is not 
>> supported. Can you explain it?
> 
> The 32-bit domain is not supported for simplifying things from the 
> beginning. It is a little bit difficult to get everything working at 
> start. As I understand from discussion at [1] we can afford that 
> simplification. However, I should have mentioned that 32-bit domain is 
> not supported "for now".

Right, I forgot that. This is where it is useful to write down the 
decision in the commit message.

> 
>>
>>>
>>> ***
>>>
>>> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
>>> the algorithm to choose extended regions for it is different
>>> in comparison with the algorithm for non-direct mapped DomU.
>>> What is more, that extended regions should be chosen differently
>>> whether IOMMU is enabled or not.
>>>
>>> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
>>> holes found in host device-tree if otherwise. 
>>
>> For the case when the IOMMU is disabled, this will only work if dom0 
>> cannot allocate memory outside of the original range. This is 
>> currently the case... but I think this should be spelled out in at 
>> least the commit message.
> 
> Agree, will update commit description.
> 
> 
>>
>>
>>> Make sure that
>>> extended regions are 2MB-aligned and located within maximum possible
>>> addressable physical memory range. The maximum number of extended
>>> regions is 128.
>>
>> Please explain how this limit was chosen.
> Well, I decided to not introduce new data struct and etc to represent 
> extended regions but reuse existing struct meminfo
> used for memory/reserved-memory and, as I though, perfectly fitted. So, 
> that limit come from NR_MEM_BANKS which is 128.

Ok. So this is an artificial limit. Please make it clear in the commit 
message.

> 
>>
>>>
>>> Suggested-by: Julien Grall <jgrall@amazon.com>
>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>> ---
>>> Changes since RFC:
>>>     - update patch description
>>>     - drop uneeded "extended-region" DT property
>>> ---
>>>
>>>   xen/arch/arm/domain_build.c | 226 
>>> +++++++++++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 224 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>>> index 206038d..070ec27 100644
>>> --- a/xen/arch/arm/domain_build.c
>>> +++ b/xen/arch/arm/domain_build.c
>>> @@ -724,6 +724,196 @@ static int __init make_memory_node(const struct 
>>> domain *d,
>>>       return res;
>>>   }
>>>   +static int __init add_ext_regions(unsigned long s, unsigned long 
>>> e, void *data)
>>> +{
>>> +    struct meminfo *ext_regions = data;
>>> +    paddr_t start, size;
>>> +
>>> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
>>> +        return 0;
>>> +
>>> +    /* Both start and size of the extended region should be 2MB 
>>> aligned */
>>> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
>>> +    if ( start > e )
>>> +        return 0;
>>> +
>>> +    size = (e - start + 1) & ~(SZ_2M - 1);
>>> +    if ( !size )
>>> +        return 0;
>>> +
>>> +    ext_regions->bank[ext_regions->nr_banks].start = start;
>>> +    ext_regions->bank[ext_regions->nr_banks].size = size;
>>> +    ext_regions->nr_banks ++;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +/*
>>> + * The extended regions will be prevalidated by the memory hotplug path
>>> + * in Linux which requires for any added address range to be within 
>>> maximum
>>> + * possible addressable physical memory range for which the linear 
>>> mapping
>>> + * could be created.
>>> + * For 48-bit VA space size the maximum addressable range are:
>>
>> When I read "maximum", I understand an upper limit. But below, you are 
>> providing a range. So should you drop "maximum"?
> 
> yes, it is a little bit confusing.
> 
> 
>>
>>
>> Also, this is tailored to Linux using 48-bit VA. How about other limits?
> These limits are calculated at [2]. Sorry, I didn't investigate yet what 
> values would be for other CONFIG_ARM64_VA_BITS_XXX. Also looks like some 
> configs depend on 16K/64K pages...
> I will try to investigate and provide limits later on.

I have thought a bit more about it. At the moment, you are relying on 
Xen to find a range that is addressable by the OS. This can be quite 
complex as different OSes may have different requirements. So how about 
letting the OS filter the ranges based on its limitations?
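
In other words, Xen would advertise the holes it knows about, and the guest
would clamp each advertised region against whatever window it can actually
address, something like this purely illustrative helper (not tied to any
existing OS code):

    /* Clamp one advertised region against the OS-addressable window. */
    static bool clamp_ext_region(uint64_t *start, uint64_t *size,
                                 uint64_t lowest, uint64_t highest)
    {
        uint64_t s = *start, e = *start + *size - 1;

        if ( e < lowest || s > highest )
            return false;       /* entirely outside the usable window */

        if ( s < lowest )
            s = lowest;
        if ( e > highest )
            e = highest;

        *start = s;
        *size = e - s + 1;

        return true;
    }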

> 
> 
>>
>>
>>> + * 0x40000000 - 0x80003fffffff
>>> + */
>>> +#define EXT_REGION_START   0x40000000ULL
>>
>> I am probably missing something here.... There are platform out there 
>> with memory starting at 0 (IIRC ZynqMP is one example). So wouldn't 
>> this potentially rule out the extended region on such platform?
> 
>  From my understanding the extended region cannot be in 0...0x40000000 
> range. If these platforms have memory above first GB, I believe the 
> extended region(s) can be allocated for them.

Do you mean "cannot"? Technically this is a limitation of the current 
version of Linux. Tomorrow, someone may be able to remove that 
limitations. So, as mentionned above, maybe Xen should not do the filtering.

>>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>>> +                                    struct meminfo *ext_regions)
>>> +{
>>> +    struct dt_device_node *np;
>>> +    struct rangeset *mem_holes;
>>> +    paddr_t start, end;
>>> +    unsigned int i;
>>> +    int res;
>>> +
>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>> +
>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>> +    if ( !mem_holes )
>>> +        return -ENOMEM;
>>> +
>>> +    /* Start with maximum possible addressable physical memory range */
>>> +    start = EXT_REGION_START;
>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>> +    res = rangeset_add_range(mem_holes, start, end);
>>> +    if ( res )
>>> +    {
>>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>> +               start, end);
>>> +        goto out;
>>> +    }
>>> +
>>> +    /* Remove all regions described by "reg" property (MMIO, RAM, 
>>> etc) */
>>
>> Well... The loop below is not going to handle all the regions 
>> described in the property "reg". Instead, it will cover a subset of 
>> "reg" where the memory is addressable.
> 
> As I understand, we are only interested in subset of "reg" where the 
> memory is addressable.

Right... That's not what your comment is saying.

Cheers,
Julien Grall Sept. 19, 2021, 2:34 p.m. UTC | #16
Hi Stefano,

On 18/09/2021 03:37, Stefano Stabellini wrote:
> On Fri, 17 Sep 2021, Stefano Stabellini wrote:
>> On Fri, 17 Sep 2021, Oleksandr wrote:
>>>>> +
>>>>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>>>>> +
>>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>>> +    if ( !unalloc_mem )
>>>>> +        return -ENOMEM;
>>>>> +
>>>>> +    /* Start with all available RAM */
>>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>>> +    {
>>>>> +        start = bootinfo.mem.bank[i].start;
>>>>> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
>>>>> +        res = rangeset_add_range(unalloc_mem, start, end);
>>>>> +        if ( res )
>>>>> +        {
>>>>> +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>>> +                   start, end);
>>>>> +            goto out;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    /* Remove RAM assigned to Dom0 */
>>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>>> +    {
>>>>> +        start = assign_mem->bank[i].start;
>>>>> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>> +        if ( res )
>>>>> +        {
>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>> +                   start, end);
>>>>> +            goto out;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    /* Remove reserved-memory regions */
>>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>>>>> +    {
>>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>> +        if ( res )
>>>>> +        {
>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>> +                   start, end);
>>>>> +            goto out;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    /* Remove grant table region */
>>>>> +    start = kinfo->gnttab_start;
>>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>>> +    if ( res )
>>>>> +    {
>>>>> +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>>>>> +               start, end);
>>>>> +        goto out;
>>>>> +    }
>>>>> +
>>>>> +    start = EXT_REGION_START;
>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>>> +                                 add_ext_regions, ext_regions);
>>>>> +    if ( res )
>>>>> +        ext_regions->nr_banks = 0;
>>>>> +    else if ( !ext_regions->nr_banks )
>>>>> +        res = -ENOENT;
>>>>> +
>>>>> +out:
>>>>> +    rangeset_destroy(unalloc_mem);
>>>>> +
>>>>> +    return res;
>>>>> +}
>>>>> +
>>>>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>>>>> +                                    struct meminfo *ext_regions)
>>>>> +{
>>>>> +    struct dt_device_node *np;
>>>>> +    struct rangeset *mem_holes;
>>>>> +    paddr_t start, end;
>>>>> +    unsigned int i;
>>>>> +    int res;
>>>>> +
>>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>>> +
>>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>>> +    if ( !mem_holes )
>>>>> +        return -ENOMEM;
>>>>> +
>>>>> +    /* Start with maximum possible addressable physical memory range */
>>>>> +    start = EXT_REGION_START;
>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>>> +    if ( res )
>>>>> +    {
>>>>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>>> +               start, end);
>>>>> +        goto out;
>>>>> +    }
>>>>> +
>>>>> +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
>>>>
>>>> Well... The loop below is not going to handle all the regions described in
>>>> the property "reg". Instead, it will cover a subset of "reg" where the
>>>> memory is addressable.
>>>
>>> As I understand, we are only interested in subset of "reg" where the memory is
>>> addressable.
>>>
>>>
>>>>
>>>>
>>>> You will also need to cover "ranges" that will describe the BARs for the PCI
>>>> devices.
>>> Good point.
>>
>> Yes, very good point!
>>
>>
>>> Could you please clarify how to recognize whether it is a PCI
>>> device as long as PCI support is not merged? Or just to find any device nodes
>>> with non-empty "ranges" property
>>> and retrieve addresses?
>>
>> Normally any bus can have a ranges property with the aperture and
>> possible address translations, including /amba (compatible =
>> "simple-bus"). However, in these cases dt_device_get_address already
>> takes care of it, see xen/common/device_tree.c:dt_device_get_address.
>>
>> The PCI bus is special for 2 reasons:
>> - the ranges property has a different format
>> - the bus is hot-pluggable
>>
>> So I think the only one that we need to treat specially is PCI.
>>
>> As far as I am aware PCI is the only bus (or maybe just the only bus
>> that we support?) where ranges means the aperture.
> 
> Now that I think about this, there is another "hotpluggable" scenario we
> need to think about:
> 
> [1] https://marc.info/?l=xen-devel&m=163056546214978
> 
> Xilinx devices have FPGA regions with apertures currently not described
> in device tree, where things can programmed in PL at runtime making new
> devices appear with new MMIO regions out of thin air.
> 
> Now let me start by saying that yes, the entire programmable region
> aperture could probably be described in device tree, however, in
> reality it is not currently done in any of the device trees we use
> (including the upstream device trees in linux.git).

This is rather annoying, but not unheard of. There are a couple of
platforms where the MMIOs are not fully described in the DT.

In fact, we have a callback 'specific_mappings' which creates additional
mappings (e.g. on the omap5) for dom0.

> 
> So, we have a problem :-(
> 
> 
> I can work toward getting the right info on device tree, but in reality
> that is going to take time and for now the device tree doesn't have the
> FPGA aperture in it. So if we accept this series as is, it is going to
> stop features like [1] from working. >
> If we cannot come up with any better plans, I think it would be better
> to drop find_memory_holes, only rely on find_unallocated_memory even
> when the IOMMU is on. One idea is that we could add on top of the
> regions found by find_unallocated_memory any MMIO regions marked as
> xen,passthrough: they are safe because they are not going to dom0 anyway.

(Oleksandr, it looks like some rationale about the different approach is 
missing in the commit message. Can you add it?)

When the IOMMU is on, Xen will do an extra mapping with GFN == MFN for 
every grant mapping in dom0. This is because Linux will always program 
the device with the MFN as it doesn't know whether the device has been 
protected by the hypervisor.

Therefore we can't use find_unallocated_memory() with the IOMMU on as it 
stands.

> 
> The only alternative I can think of is to have a per-board
> enable/disable toggle for the extend region but it would be very ugly.
At least, for your board, you seem to know the list of regions that are 
reserved for future use. So how about adding a per-board list of regions 
that should not be allocated?

This will also include anything mentioned in 'specific_mappings'.
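
A minimal sketch of that per-board idea, purely for illustration: the
structure, the helper, the example table and its values are all invented
here (they are not existing Xen code), and the real apertures would have
to come from the board documentation.

struct ext_region_blacklist {
    paddr_t start;
    paddr_t size;
};

/* Placeholder values only -- not the real PL/FPGA aperture */
static const struct ext_region_blacklist xilinx_blacklist[] __initconst = {
    { .start = 0xa0000000UL, .size = 0x10000000UL },
};

/* Drop every blacklisted range from the candidate rangeset */
static int __init remove_blacklisted_regions(struct rangeset *mem,
                                             const struct ext_region_blacklist *bl,
                                             unsigned int nr)
{
    unsigned int i;
    int res;

    for ( i = 0; i < nr; i++ )
    {
        res = rangeset_remove_range(mem, bl[i].start,
                                    bl[i].start + bl[i].size - 1);
        if ( res )
            return res;
    }

    return 0;
}

Such a table could be selected via the platform descriptor (next to
'specific_mappings') and applied right before the rangeset is reported,
so both the per-board ranges and the extra dom0 mappings are excluded.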

Cheers,
Oleksandr Tyshchenko Sept. 19, 2021, 5:59 p.m. UTC | #17
On 19.09.21 17:00, Julien Grall wrote:
> Hi,

Hi Julien


>
> On 18/09/2021 00:51, Oleksandr wrote:
>> On 17.09.21 18:48, Julien Grall wrote:
>>> On 10/09/2021 23:18, Oleksandr Tyshchenko wrote:
>>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>
>>>> The extended region (safe range) is a region of guest physical
>>>> address space which is unused and could be safely used to create
>>>> grant/foreign mappings instead of wasting real RAM pages from
>>>> the domain memory for establishing these mappings.
>>>>
>>>> The extended regions are chosen at the domain creation time and
>>>> advertised to it via "reg" property under hypervisor node in
>>>> the guest device-tree. As region 0 is reserved for grant table
>>>> space (always present), the indexes for extended regions are 1...N.
>>>> If extended regions could not be allocated for some reason,
>>>> Xen doesn't fail and behaves as usual, so only inserts region 0.
>>>>
>>>> Please note the following limitations:
>>>> - The extended region feature is only supported for 64-bit domain.
>>>> - The ACPI case is not covered.
>>>
>>> I understand the ACPI is not covered because we would need to create 
>>> a new binding. But I am not sure to understand why 32-bit domain is 
>>> not supported. Can you explain it?
>>
>> The 32-bit domain is not supported for simplifying things from the 
>> beginning. It is a little bit difficult to get everything working at 
>> start. As I understand from discussion at [1] we can afford that 
>> simplification. However, I should have mentioned that 32-bit domain 
>> is not supported "for now".
>
> Right, I forgot that. This is where it is useful to write down the 
> decision in the commit message.

ok, will do.


>
>
>>
>>>
>>>>
>>>> ***
>>>>
>>>> As Dom0 is direct mapped domain on Arm (e.g. MFN == GFN)
>>>> the algorithm to choose extended regions for it is different
>>>> in comparison with the algorithm for non-direct mapped DomU.
>>>> What is more, that extended regions should be chosen differently
>>>> whether IOMMU is enabled or not.
>>>>
>>>> Provide RAM not assigned to Dom0 if IOMMU is disabled or memory
>>>> holes found in host device-tree if otherwise. 
>>>
>>> For the case when the IOMMU is disabled, this will only work if dom0 
>>> cannot allocate memory outside of the original range. This is 
>>> currently the case... but I think this should be spelled out in at 
>>> least the commit message.
>>
>> Agree, will update commit description.
>>
>>
>>>
>>>
>>>> Make sure that
>>>> extended regions are 2MB-aligned and located within maximum possible
>>>> addressable physical memory range. The maximum number of extended
>>>> regions is 128.
>>>
>>> Please explain how this limit was chosen.
>> Well, I decided to not introduce new data struct and etc to represent 
>> extended regions but reuse existing struct meminfo
>> used for memory/reserved-memory and, as I though, perfectly fitted. 
>> So, that limit come from NR_MEM_BANKS which is 128.
>
> Ok. So this is an artificial limit. Please make it clear in the commit 
> message.

ok, will do


>
>
>>
>>>
>>>>
>>>> Suggested-by: Julien Grall <jgrall@amazon.com>
>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>> ---
>>>> Changes since RFC:
>>>>     - update patch description
>>>>     - drop uneeded "extended-region" DT property
>>>> ---
>>>>
>>>>   xen/arch/arm/domain_build.c | 226 
>>>> +++++++++++++++++++++++++++++++++++++++++++-
>>>>   1 file changed, 224 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>>>> index 206038d..070ec27 100644
>>>> --- a/xen/arch/arm/domain_build.c
>>>> +++ b/xen/arch/arm/domain_build.c
>>>> @@ -724,6 +724,196 @@ static int __init make_memory_node(const 
>>>> struct domain *d,
>>>>       return res;
>>>>   }
>>>>   +static int __init add_ext_regions(unsigned long s, unsigned long 
>>>> e, void *data)
>>>> +{
>>>> +    struct meminfo *ext_regions = data;
>>>> +    paddr_t start, size;
>>>> +
>>>> +    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
>>>> +        return 0;
>>>> +
>>>> +    /* Both start and size of the extended region should be 2MB 
>>>> aligned */
>>>> +    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
>>>> +    if ( start > e )
>>>> +        return 0;
>>>> +
>>>> +    size = (e - start + 1) & ~(SZ_2M - 1);
>>>> +    if ( !size )
>>>> +        return 0;
>>>> +
>>>> +    ext_regions->bank[ext_regions->nr_banks].start = start;
>>>> +    ext_regions->bank[ext_regions->nr_banks].size = size;
>>>> +    ext_regions->nr_banks ++;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The extended regions will be prevalidated by the memory hotplug 
>>>> path
>>>> + * in Linux which requires for any added address range to be 
>>>> within maximum
>>>> + * possible addressable physical memory range for which the linear 
>>>> mapping
>>>> + * could be created.
>>>> + * For 48-bit VA space size the maximum addressable range are:
>>>
>>> When I read "maximum", I understand an upper limit. But below, you 
>>> are providing a range. So should you drop "maximum"?
>>
>> yes, it is a little bit confusing.
>>
>>
>>>
>>>
>>> Also, this is tailored to Linux using 48-bit VA. How about other 
>>> limits?
>> These limits are calculated at [2]. Sorry, I didn't investigate yet 
>> what values would be for other CONFIG_ARM64_VA_BITS_XXX. Also looks 
>> like some configs depend on 16K/64K pages...
>> I will try to investigate and provide limits later on.

I have rebuilt Linux with CONFIG_ARM64_VA_BITS_39=y and printed the limits.

These are: 0x40000000 - 0x403fffffff
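
For what it's worth, both reported ranges span exactly half of the VA
space starting at 0x40000000; this is my own cross-check, not something
derived in the code:

\[ \mathtt{0x40000000} + 2^{38} - 1 = \mathtt{0x403fffffff} \quad (\mathrm{VA\_BITS}=39) \]
\[ \mathtt{0x40000000} + 2^{47} - 1 = \mathtt{0x80003fffffff} \quad (\mathrm{VA\_BITS}=48) \]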


>>
>
> I have thought a bit more about it. At the moment, you are relying on 
> Xen to find a range that is addressable by the OS. This can be quite 
> complex as different OS may have different requirement. So how about 
> letting the OS to filter the ranges based on its limitations?

I think it is a nice idea, thank you. So I will drop the OS-specific
limits (EXT_REGION_*) from the patch.

>
>
>>
>>
>>>
>>>
>>>> + * 0x40000000 - 0x80003fffffff
>>>> + */
>>>> +#define EXT_REGION_START   0x40000000ULL
>>>
>>> I am probably missing something here.... There are platform out 
>>> there with memory starting at 0 (IIRC ZynqMP is one example). So 
>>> wouldn't this potentially rule out the extended region on such 
>>> platform?
>>
>>  From my understanding the extended region cannot be in 
>> 0...0x40000000 range. If these platforms have memory above first GB, 
>> I believe the extended region(s) can be allocated for them.
>
> Do you mean "cannot"? 
No. I think there was some misunderstanding on my side. Initially, I
read this as "the extended region feature cannot be used on such platforms in
general", so I tried to say that if these platforms
also had some RAM *above* 0x40000000 then extended regions could be
allocated for them in principle; we just won't be able to take advantage
of 0...0x40000000...


> Technically this is a limitation of the current version of Linux. 
> Tomorrow, someone may be able to remove that limitations. So, as 
> mentionned above, maybe Xen should not do the filtering.

I got it, sounds reasonable.


>
>
>>>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>>>> +                                    struct meminfo *ext_regions)
>>>> +{
>>>> +    struct dt_device_node *np;
>>>> +    struct rangeset *mem_holes;
>>>> +    paddr_t start, end;
>>>> +    unsigned int i;
>>>> +    int res;
>>>> +
>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>> +
>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>> +    if ( !mem_holes )
>>>> +        return -ENOMEM;
>>>> +
>>>> +    /* Start with maximum possible addressable physical memory 
>>>> range */
>>>> +    start = EXT_REGION_START;
>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>> +    if ( res )
>>>> +    {
>>>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>> +               start, end);
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    /* Remove all regions described by "reg" property (MMIO, RAM, 
>>>> etc) */
>>>
>>> Well... The loop below is not going to handle all the regions 
>>> described in the property "reg". Instead, it will cover a subset of 
>>> "reg" where the memory is addressable.
>>
>> As I understand, we are only interested in subset of "reg" where the 
>> memory is addressable.
>
> Right... That's not what your comment is saying.

ok, will update.


>
>
> Cheers,
>
Oleksandr Tyshchenko Sept. 19, 2021, 8:18 p.m. UTC | #18
On 19.09.21 17:34, Julien Grall wrote:



> Hi Stefano,

Hi Julien


>
> On 18/09/2021 03:37, Stefano Stabellini wrote:
>> On Fri, 17 Sep 2021, Stefano Stabellini wrote:
>>> On Fri, 17 Sep 2021, Oleksandr wrote:
>>>>>> +
>>>>>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>>>>>> +
>>>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>>>> +    if ( !unalloc_mem )
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    /* Start with all available RAM */
>>>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>>>> +    {
>>>>>> +        start = bootinfo.mem.bank[i].start;
>>>>>> +        end = bootinfo.mem.bank[i].start + 
>>>>>> bootinfo.mem.bank[i].size - 1;
>>>>>> +        res = rangeset_add_range(unalloc_mem, start, end);
>>>>>> +        if ( res )
>>>>>> +        {
>>>>>> +            printk(XENLOG_ERR "Failed to add: 
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +                   start, end);
>>>>>> +            goto out;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove RAM assigned to Dom0 */
>>>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>>>> +    {
>>>>>> +        start = assign_mem->bank[i].start;
>>>>>> +        end = assign_mem->bank[i].start + 
>>>>>> assign_mem->bank[i].size - 1;
>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>> +        if ( res )
>>>>>> +        {
>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +                   start, end);
>>>>>> +            goto out;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove reserved-memory regions */
>>>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>>>>>> +    {
>>>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>> +        if ( res )
>>>>>> +        {
>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +                   start, end);
>>>>>> +            goto out;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove grant table region */
>>>>>> +    start = kinfo->gnttab_start;
>>>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>> +    if ( res )
>>>>>> +    {
>>>>>> +        printk(XENLOG_ERR "Failed to remove: 
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +               start, end);
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    start = EXT_REGION_START;
>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>>>> +                                 add_ext_regions, ext_regions);
>>>>>> +    if ( res )
>>>>>> +        ext_regions->nr_banks = 0;
>>>>>> +    else if ( !ext_regions->nr_banks )
>>>>>> +        res = -ENOENT;
>>>>>> +
>>>>>> +out:
>>>>>> +    rangeset_destroy(unalloc_mem);
>>>>>> +
>>>>>> +    return res;
>>>>>> +}
>>>>>> +
>>>>>> +static int __init find_memory_holes(const struct kernel_info 
>>>>>> *kinfo,
>>>>>> +                                    struct meminfo *ext_regions)
>>>>>> +{
>>>>>> +    struct dt_device_node *np;
>>>>>> +    struct rangeset *mem_holes;
>>>>>> +    paddr_t start, end;
>>>>>> +    unsigned int i;
>>>>>> +    int res;
>>>>>> +
>>>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>>>> +
>>>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>>>> +    if ( !mem_holes )
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    /* Start with maximum possible addressable physical memory 
>>>>>> range */
>>>>>> +    start = EXT_REGION_START;
>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>>>> +    if ( res )
>>>>>> +    {
>>>>>> +        printk(XENLOG_ERR "Failed to add: 
>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>> +               start, end);
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Remove all regions described by "reg" property (MMIO, 
>>>>>> RAM, etc) */
>>>>>
>>>>> Well... The loop below is not going to handle all the regions 
>>>>> described in
>>>>> the property "reg". Instead, it will cover a subset of "reg" where 
>>>>> the
>>>>> memory is addressable.
>>>>
>>>> As I understand, we are only interested in subset of "reg" where 
>>>> the memory is
>>>> addressable.
>>>>
>>>>
>>>>>
>>>>>
>>>>> You will also need to cover "ranges" that will describe the BARs 
>>>>> for the PCI
>>>>> devices.
>>>> Good point.
>>>
>>> Yes, very good point!
>>>
>>>
>>>> Could you please clarify how to recognize whether it is a PCI
>>>> device as long as PCI support is not merged? Or just to find any 
>>>> device nodes
>>>> with non-empty "ranges" property
>>>> and retrieve addresses?
>>>
>>> Normally any bus can have a ranges property with the aperture and
>>> possible address translations, including /amba (compatible =
>>> "simple-bus"). However, in these cases dt_device_get_address already
>>> takes care of it, see xen/common/device_tree.c:dt_device_get_address.
>>>
>>> The PCI bus is special for 2 reasons:
>>> - the ranges property has a different format
>>> - the bus is hot-pluggable
>>>
>>> So I think the only one that we need to treat specially is PCI.
>>>
>>> As far as I am aware PCI is the only bus (or maybe just the only bus
>>> that we support?) where ranges means the aperture.
>>
>> Now that I think about this, there is another "hotpluggable" scenario we
>> need to think about:
>>
>> [1] https://marc.info/?l=xen-devel&m=163056546214978
>>
>> Xilinx devices have FPGA regions with apertures currently not described
>> in device tree, where things can programmed in PL at runtime making new
>> devices appear with new MMIO regions out of thin air.
>>
>> Now let me start by saying that yes, the entire programmable region
>> aperture could probably be described in device tree, however, in
>> reality it is not currently done in any of the device trees we use
>> (including the upstream device trees in linux.git).
>
> This is rather annoying, but not unheard. There are a couple of 
> platforms where the MMIOs are not fully described in the DT.
>
> In fact, we have a callback 'specific_mappings' which create 
> additional mappings (e.g. on the omap5) for dom0.
>
>>
>> So, we have a problem :-(
>>
>>
>> I can work toward getting the right info on device tree, but in reality
>> that is going to take time and for now the device tree doesn't have the
>> FPGA aperture in it. So if we accept this series as is, it is going to
>> stop features like [1] from working. >
>> If we cannot come up with any better plans, I think it would be better
>> to drop find_memory_holes, only rely on find_unallocated_memory even
>> when the IOMMU is on. One idea is that we could add on top of the
>> regions found by find_unallocated_memory any MMIO regions marked as
>> xen,passthrough: they are safe because they are not going to dom0 
>> anyway.
>
> (Oleksandr, it looks like some rationale about the different approach 
> is missing in the commit message. Can you add it?)

Yes, sure, but let me please clarify what the different approach is in this
context. Is it to *also* take into account the MMIO regions of the
devices for passthrough in the case when the IOMMU is off (in addition to
unallocated memory)? If so, I wonder whether we will gain much from
that, given that a device's MMIO regions are usually not big enough,
and we aim to allocate extended regions of a bigger size (> 64MB).


>
>
> When the IOMMU is on, Xen will do an extra mapping with GFN == MFN for 
> every grant mapping in dom0. This is because Linux will always program 
> the device with the MFN as it doesn't know whether the device has been 
> protected by the hypervisor.
>
> Therefore we can't use find_unallocated_memory() with the IOMMU on as 
> it stands.
>
>>
>> The only alternative I can think of is to have a per-board
>> enable/disable toggle for the extend region but it would be very ugly.
> At least, for your board, you seem to know the list of regions that 
> are reserved for future use. So how about adding a per-board list of 
> regions that should not be allocated?
>
> This will also include anything mentioned in 'specific_mappings'.
>
> Cheers,
>
Stefano Stabellini Sept. 20, 2021, 11:21 p.m. UTC | #19
On Sun, 19 Sep 2021, Oleksandr wrote:
> > On 18/09/2021 03:37, Stefano Stabellini wrote:
> > > On Fri, 17 Sep 2021, Stefano Stabellini wrote:
> > > > On Fri, 17 Sep 2021, Oleksandr wrote:
> > > > > > > +
> > > > > > > +    dt_dprintk("Find unallocated memory for extended regions\n");
> > > > > > > +
> > > > > > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > > > > > +    if ( !unalloc_mem )
> > > > > > > +        return -ENOMEM;
> > > > > > > +
> > > > > > > +    /* Start with all available RAM */
> > > > > > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > > > > > +    {
> > > > > > > +        start = bootinfo.mem.bank[i].start;
> > > > > > > +        end = bootinfo.mem.bank[i].start +
> > > > > > > bootinfo.mem.bank[i].size - 1;
> > > > > > > +        res = rangeset_add_range(unalloc_mem, start, end);
> > > > > > > +        if ( res )
> > > > > > > +        {
> > > > > > > +            printk(XENLOG_ERR "Failed to add:
> > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > +                   start, end);
> > > > > > > +            goto out;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    /* Remove RAM assigned to Dom0 */
> > > > > > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > > > > > +    {
> > > > > > > +        start = assign_mem->bank[i].start;
> > > > > > > +        end = assign_mem->bank[i].start +
> > > > > > > assign_mem->bank[i].size - 1;
> > > > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > > +        if ( res )
> > > > > > > +        {
> > > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > +                   start, end);
> > > > > > > +            goto out;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    /* Remove reserved-memory regions */
> > > > > > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> > > > > > > +    {
> > > > > > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > > > > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > > > > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > > +        if ( res )
> > > > > > > +        {
> > > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > +                   start, end);
> > > > > > > +            goto out;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    /* Remove grant table region */
> > > > > > > +    start = kinfo->gnttab_start;
> > > > > > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > > > > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > > +    if ( res )
> > > > > > > +    {
> > > > > > > +        printk(XENLOG_ERR "Failed to remove:
> > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > +               start, end);
> > > > > > > +        goto out;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    start = EXT_REGION_START;
> > > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > > > > > +                                 add_ext_regions, ext_regions);
> > > > > > > +    if ( res )
> > > > > > > +        ext_regions->nr_banks = 0;
> > > > > > > +    else if ( !ext_regions->nr_banks )
> > > > > > > +        res = -ENOENT;
> > > > > > > +
> > > > > > > +out:
> > > > > > > +    rangeset_destroy(unalloc_mem);
> > > > > > > +
> > > > > > > +    return res;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int __init find_memory_holes(const struct kernel_info
> > > > > > > *kinfo,
> > > > > > > +                                    struct meminfo *ext_regions)
> > > > > > > +{
> > > > > > > +    struct dt_device_node *np;
> > > > > > > +    struct rangeset *mem_holes;
> > > > > > > +    paddr_t start, end;
> > > > > > > +    unsigned int i;
> > > > > > > +    int res;
> > > > > > > +
> > > > > > > +    dt_dprintk("Find memory holes for extended regions\n");
> > > > > > > +
> > > > > > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > > > > > +    if ( !mem_holes )
> > > > > > > +        return -ENOMEM;
> > > > > > > +
> > > > > > > +    /* Start with maximum possible addressable physical memory
> > > > > > > range */
> > > > > > > +    start = EXT_REGION_START;
> > > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > > > +    res = rangeset_add_range(mem_holes, start, end);
> > > > > > > +    if ( res )
> > > > > > > +    {
> > > > > > > +        printk(XENLOG_ERR "Failed to add:
> > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > +               start, end);
> > > > > > > +        goto out;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    /* Remove all regions described by "reg" property (MMIO, RAM,
> > > > > > > etc) */
> > > > > > 
> > > > > > Well... The loop below is not going to handle all the regions
> > > > > > described in
> > > > > > the property "reg". Instead, it will cover a subset of "reg" where
> > > > > > the
> > > > > > memory is addressable.
> > > > > 
> > > > > As I understand, we are only interested in subset of "reg" where the
> > > > > memory is
> > > > > addressable.
> > > > > 
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > You will also need to cover "ranges" that will describe the BARs for
> > > > > > the PCI
> > > > > > devices.
> > > > > Good point.
> > > > 
> > > > Yes, very good point!
> > > > 
> > > > 
> > > > > Could you please clarify how to recognize whether it is a PCI
> > > > > device as long as PCI support is not merged? Or just to find any
> > > > > device nodes
> > > > > with non-empty "ranges" property
> > > > > and retrieve addresses?
> > > > 
> > > > Normally any bus can have a ranges property with the aperture and
> > > > possible address translations, including /amba (compatible =
> > > > "simple-bus"). However, in these cases dt_device_get_address already
> > > > takes care of it, see xen/common/device_tree.c:dt_device_get_address.
> > > > 
> > > > The PCI bus is special for 2 reasons:
> > > > - the ranges property has a different format
> > > > - the bus is hot-pluggable
> > > > 
> > > > So I think the only one that we need to treat specially is PCI.
> > > > 
> > > > As far as I am aware PCI is the only bus (or maybe just the only bus
> > > > that we support?) where ranges means the aperture.
> > > 
> > > Now that I think about this, there is another "hotpluggable" scenario we
> > > need to think about:
> > > 
> > > [1] https://marc.info/?l=xen-devel&m=163056546214978
> > > 
> > > Xilinx devices have FPGA regions with apertures currently not described
> > > in device tree, where things can programmed in PL at runtime making new
> > > devices appear with new MMIO regions out of thin air.
> > > 
> > > Now let me start by saying that yes, the entire programmable region
> > > aperture could probably be described in device tree, however, in
> > > reality it is not currently done in any of the device trees we use
> > > (including the upstream device trees in linux.git).
> > 
> > This is rather annoying, but not unheard. There are a couple of platforms
> > where the MMIOs are not fully described in the DT.
> > 
> > In fact, we have a callback 'specific_mappings' which create additional
> > mappings (e.g. on the omap5) for dom0.
> > 
> > > 
> > > So, we have a problem :-(
> > > 
> > > 
> > > I can work toward getting the right info on device tree, but in reality
> > > that is going to take time and for now the device tree doesn't have the
> > > FPGA aperture in it. So if we accept this series as is, it is going to
> > > stop features like [1] from working. >
> > > If we cannot come up with any better plans, I think it would be better
> > > to drop find_memory_holes, only rely on find_unallocated_memory even
> > > when the IOMMU is on. One idea is that we could add on top of the
> > > regions found by find_unallocated_memory any MMIO regions marked as
> > > xen,passthrough: they are safe because they are not going to dom0 anyway.
> > 
> > (Oleksandr, it looks like some rationale about the different approach is
> > missing in the commit message. Can you add it?)
> 
> Yes sure, but let me please clarify what is different approach in this
> context. Is it to *also* take into the account MMIO regions of the devices for
> passthrough for case when IOMMU is off (in addition to unallocated memory)? If
> yes, I wonder whether we will gain much with that according to that device's
> MMIO regions are usually not big enough and we stick to allocate extended
> regions with bigger size (> 64MB).

That's fair enough. There are a couple of counterexamples where the
MMIO regions for the device to assign are quite large, for instance a
GPU, Xilinx AIEngine, or the PCIe Root Complex with the entire aperture,
but maybe they are not that common. I am not sure if it is worth
scanning the tree for xen,passthrough regions every time at boot for
this.
Stefano Stabellini Sept. 20, 2021, 11:55 p.m. UTC | #20
On Sun, 19 Sep 2021, Julien Grall wrote:
> On 18/09/2021 03:37, Stefano Stabellini wrote:
> > On Fri, 17 Sep 2021, Stefano Stabellini wrote:
> > > On Fri, 17 Sep 2021, Oleksandr wrote:
> > > > > > +
> > > > > > +    dt_dprintk("Find unallocated memory for extended regions\n");
> > > > > > +
> > > > > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > > > > +    if ( !unalloc_mem )
> > > > > > +        return -ENOMEM;
> > > > > > +
> > > > > > +    /* Start with all available RAM */
> > > > > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > > > > +    {
> > > > > > +        start = bootinfo.mem.bank[i].start;
> > > > > > +        end = bootinfo.mem.bank[i].start +
> > > > > > bootinfo.mem.bank[i].size - 1;
> > > > > > +        res = rangeset_add_range(unalloc_mem, start, end);
> > > > > > +        if ( res )
> > > > > > +        {
> > > > > > +            printk(XENLOG_ERR "Failed to add:
> > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > +                   start, end);
> > > > > > +            goto out;
> > > > > > +        }
> > > > > > +    }
> > > > > > +
> > > > > > +    /* Remove RAM assigned to Dom0 */
> > > > > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > > > > +    {
> > > > > > +        start = assign_mem->bank[i].start;
> > > > > > +        end = assign_mem->bank[i].start + assign_mem->bank[i].size
> > > > > > - 1;
> > > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > +        if ( res )
> > > > > > +        {
> > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > +                   start, end);
> > > > > > +            goto out;
> > > > > > +        }
> > > > > > +    }
> > > > > > +
> > > > > > +    /* Remove reserved-memory regions */
> > > > > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> > > > > > +    {
> > > > > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > > > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > > > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > +        if ( res )
> > > > > > +        {
> > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > +                   start, end);
> > > > > > +            goto out;
> > > > > > +        }
> > > > > > +    }
> > > > > > +
> > > > > > +    /* Remove grant table region */
> > > > > > +    start = kinfo->gnttab_start;
> > > > > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > > > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > +    if ( res )
> > > > > > +    {
> > > > > > +        printk(XENLOG_ERR "Failed to remove:
> > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > +               start, end);
> > > > > > +        goto out;
> > > > > > +    }
> > > > > > +
> > > > > > +    start = EXT_REGION_START;
> > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > > > > +                                 add_ext_regions, ext_regions);
> > > > > > +    if ( res )
> > > > > > +        ext_regions->nr_banks = 0;
> > > > > > +    else if ( !ext_regions->nr_banks )
> > > > > > +        res = -ENOENT;
> > > > > > +
> > > > > > +out:
> > > > > > +    rangeset_destroy(unalloc_mem);
> > > > > > +
> > > > > > +    return res;
> > > > > > +}
> > > > > > +
> > > > > > +static int __init find_memory_holes(const struct kernel_info
> > > > > > *kinfo,
> > > > > > +                                    struct meminfo *ext_regions)
> > > > > > +{
> > > > > > +    struct dt_device_node *np;
> > > > > > +    struct rangeset *mem_holes;
> > > > > > +    paddr_t start, end;
> > > > > > +    unsigned int i;
> > > > > > +    int res;
> > > > > > +
> > > > > > +    dt_dprintk("Find memory holes for extended regions\n");
> > > > > > +
> > > > > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > > > > +    if ( !mem_holes )
> > > > > > +        return -ENOMEM;
> > > > > > +
> > > > > > +    /* Start with maximum possible addressable physical memory
> > > > > > range */
> > > > > > +    start = EXT_REGION_START;
> > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > > +    res = rangeset_add_range(mem_holes, start, end);
> > > > > > +    if ( res )
> > > > > > +    {
> > > > > > +        printk(XENLOG_ERR "Failed to add:
> > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > +               start, end);
> > > > > > +        goto out;
> > > > > > +    }
> > > > > > +
> > > > > > +    /* Remove all regions described by "reg" property (MMIO, RAM,
> > > > > > etc) */
> > > > > 
> > > > > Well... The loop below is not going to handle all the regions
> > > > > described in
> > > > > the property "reg". Instead, it will cover a subset of "reg" where the
> > > > > memory is addressable.
> > > > 
> > > > As I understand, we are only interested in subset of "reg" where the
> > > > memory is
> > > > addressable.
> > > > 
> > > > 
> > > > > 
> > > > > 
> > > > > You will also need to cover "ranges" that will describe the BARs for
> > > > > the PCI
> > > > > devices.
> > > > Good point.
> > > 
> > > Yes, very good point!
> > > 
> > > 
> > > > Could you please clarify how to recognize whether it is a PCI
> > > > device as long as PCI support is not merged? Or just to find any device
> > > > nodes
> > > > with non-empty "ranges" property
> > > > and retrieve addresses?
> > > 
> > > Normally any bus can have a ranges property with the aperture and
> > > possible address translations, including /amba (compatible =
> > > "simple-bus"). However, in these cases dt_device_get_address already
> > > takes care of it, see xen/common/device_tree.c:dt_device_get_address.
> > > 
> > > The PCI bus is special for 2 reasons:
> > > - the ranges property has a different format
> > > - the bus is hot-pluggable
> > > 
> > > So I think the only one that we need to treat specially is PCI.
> > > 
> > > As far as I am aware PCI is the only bus (or maybe just the only bus
> > > that we support?) where ranges means the aperture.
> > 
> > Now that I think about this, there is another "hotpluggable" scenario we
> > need to think about:
> > 
> > [1] https://marc.info/?l=xen-devel&m=163056546214978
> > 
> > Xilinx devices have FPGA regions with apertures currently not described
> > in device tree, where things can programmed in PL at runtime making new
> > devices appear with new MMIO regions out of thin air.
> > 
> > Now let me start by saying that yes, the entire programmable region
> > aperture could probably be described in device tree, however, in
> > reality it is not currently done in any of the device trees we use
> > (including the upstream device trees in linux.git).
> 
> This is rather annoying, but not unheard. There are a couple of platforms
> where the MMIOs are not fully described in the DT.
> 
> In fact, we have a callback 'specific_mappings' which create additional
> mappings (e.g. on the omap5) for dom0.

Just for clarity, this is a bit different because it is not an
MMIO region yet. It is only a *potential* MMIO region. Basically it is
nothing until the Programmable Logic gets programmed. But the
Programmable Logic only uses addresses within a given range, thankfully,
and we know the range beforehand.

 
> > So, we have a problem :-(
> > 
> > 
> > I can work toward getting the right info on device tree, but in reality
> > that is going to take time and for now the device tree doesn't have the
> > FPGA aperture in it. So if we accept this series as is, it is going to
> > stop features like [1] from working. >
> > If we cannot come up with any better plans, I think it would be better
> > to drop find_memory_holes, only rely on find_unallocated_memory even
> > when the IOMMU is on. One idea is that we could add on top of the
> > regions found by find_unallocated_memory any MMIO regions marked as
> > xen,passthrough: they are safe because they are not going to dom0 anyway.
> 
> (Oleksandr, it looks like some rationale about the different approach is
> missing in the commit message. Can you add it?)
> 
> When the IOMMU is on, Xen will do an extra mapping with GFN == MFN for every
> grant mapping in dom0. This is because Linux will always program the device
> with the MFN as it doesn't know whether the device has been protected by the
> hypervisor.
> 
> Therefore we can't use find_unallocated_memory() with the IOMMU on as it
> stands.
> 
> > The only alternative I can think of is to have a per-board
> > enable/disable toggle for the extend region but it would be very ugly.
> At least, for your board, you seem to know the list of regions that are
> reserved for future use.
>
> So how about adding a per-board list of regions that
> should not be allocated?
> 
> This will also include anything mentioned in 'specific_mappings'.

I am OK with that. I should be able to find the potential address ranges
for Xilinx boards. However, the ranges might be different for different
boards (for different families of boards, not for every little different
revision). Hopefully, Xilinx is the worst case as the hardware is
actually programmable.
Oleksandr Tyshchenko Sept. 21, 2021, 6:14 p.m. UTC | #21
On 21.09.21 02:21, Stefano Stabellini wrote:

Hi Stefano

> On Sun, 19 Sep 2021, Oleksandr wrote:
>>> On 18/09/2021 03:37, Stefano Stabellini wrote:
>>>> On Fri, 17 Sep 2021, Stefano Stabellini wrote:
>>>>> On Fri, 17 Sep 2021, Oleksandr wrote:
>>>>>>>> +
>>>>>>>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>>>>>>>> +
>>>>>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>>>>>> +    if ( !unalloc_mem )
>>>>>>>> +        return -ENOMEM;
>>>>>>>> +
>>>>>>>> +    /* Start with all available RAM */
>>>>>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>>>>>> +    {
>>>>>>>> +        start = bootinfo.mem.bank[i].start;
>>>>>>>> +        end = bootinfo.mem.bank[i].start +
>>>>>>>> bootinfo.mem.bank[i].size - 1;
>>>>>>>> +        res = rangeset_add_range(unalloc_mem, start, end);
>>>>>>>> +        if ( res )
>>>>>>>> +        {
>>>>>>>> +            printk(XENLOG_ERR "Failed to add:
>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>> +                   start, end);
>>>>>>>> +            goto out;
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /* Remove RAM assigned to Dom0 */
>>>>>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>>>>>> +    {
>>>>>>>> +        start = assign_mem->bank[i].start;
>>>>>>>> +        end = assign_mem->bank[i].start +
>>>>>>>> assign_mem->bank[i].size - 1;
>>>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>>>> +        if ( res )
>>>>>>>> +        {
>>>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>> +                   start, end);
>>>>>>>> +            goto out;
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /* Remove reserved-memory regions */
>>>>>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>>>>>>>> +    {
>>>>>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>>>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>>>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>>>> +        if ( res )
>>>>>>>> +        {
>>>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>> +                   start, end);
>>>>>>>> +            goto out;
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /* Remove grant table region */
>>>>>>>> +    start = kinfo->gnttab_start;
>>>>>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>>>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>>>> +    if ( res )
>>>>>>>> +    {
>>>>>>>> +        printk(XENLOG_ERR "Failed to remove:
>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>> +               start, end);
>>>>>>>> +        goto out;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    start = EXT_REGION_START;
>>>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>>>>>> +                                 add_ext_regions, ext_regions);
>>>>>>>> +    if ( res )
>>>>>>>> +        ext_regions->nr_banks = 0;
>>>>>>>> +    else if ( !ext_regions->nr_banks )
>>>>>>>> +        res = -ENOENT;
>>>>>>>> +
>>>>>>>> +out:
>>>>>>>> +    rangeset_destroy(unalloc_mem);
>>>>>>>> +
>>>>>>>> +    return res;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int __init find_memory_holes(const struct kernel_info
>>>>>>>> *kinfo,
>>>>>>>> +                                    struct meminfo *ext_regions)
>>>>>>>> +{
>>>>>>>> +    struct dt_device_node *np;
>>>>>>>> +    struct rangeset *mem_holes;
>>>>>>>> +    paddr_t start, end;
>>>>>>>> +    unsigned int i;
>>>>>>>> +    int res;
>>>>>>>> +
>>>>>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>>>>>> +
>>>>>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>>>>>> +    if ( !mem_holes )
>>>>>>>> +        return -ENOMEM;
>>>>>>>> +
>>>>>>>> +    /* Start with maximum possible addressable physical memory
>>>>>>>> range */
>>>>>>>> +    start = EXT_REGION_START;
>>>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>>>>>> +    if ( res )
>>>>>>>> +    {
>>>>>>>> +        printk(XENLOG_ERR "Failed to add:
>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>> +               start, end);
>>>>>>>> +        goto out;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /* Remove all regions described by "reg" property (MMIO, RAM,
>>>>>>>> etc) */
>>>>>>> Well... The loop below is not going to handle all the regions
>>>>>>> described in
>>>>>>> the property "reg". Instead, it will cover a subset of "reg" where
>>>>>>> the
>>>>>>> memory is addressable.
>>>>>> As I understand, we are only interested in subset of "reg" where the
>>>>>> memory is
>>>>>> addressable.
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> You will also need to cover "ranges" that will describe the BARs for
>>>>>>> the PCI
>>>>>>> devices.
>>>>>> Good point.
>>>>> Yes, very good point!
>>>>>
>>>>>
>>>>>> Could you please clarify how to recognize whether it is a PCI
>>>>>> device as long as PCI support is not merged? Or just to find any
>>>>>> device nodes
>>>>>> with non-empty "ranges" property
>>>>>> and retrieve addresses?
>>>>> Normally any bus can have a ranges property with the aperture and
>>>>> possible address translations, including /amba (compatible =
>>>>> "simple-bus"). However, in these cases dt_device_get_address already
>>>>> takes care of it, see xen/common/device_tree.c:dt_device_get_address.
>>>>>
>>>>> The PCI bus is special for 2 reasons:
>>>>> - the ranges property has a different format
>>>>> - the bus is hot-pluggable
>>>>>
>>>>> So I think the only one that we need to treat specially is PCI.
>>>>>
>>>>> As far as I am aware PCI is the only bus (or maybe just the only bus
>>>>> that we support?) where ranges means the aperture.
>>>> Now that I think about this, there is another "hotpluggable" scenario we
>>>> need to think about:
>>>>
>>>> [1] https://marc.info/?l=xen-devel&m=163056546214978
>>>>
>>>> Xilinx devices have FPGA regions with apertures currently not described
>>>> in device tree, where things can programmed in PL at runtime making new
>>>> devices appear with new MMIO regions out of thin air.
>>>>
>>>> Now let me start by saying that yes, the entire programmable region
>>>> aperture could probably be described in device tree, however, in
>>>> reality it is not currently done in any of the device trees we use
>>>> (including the upstream device trees in linux.git).
>>> This is rather annoying, but not unheard. There are a couple of platforms
>>> where the MMIOs are not fully described in the DT.
>>>
>>> In fact, we have a callback 'specific_mappings' which create additional
>>> mappings (e.g. on the omap5) for dom0.
>>>
>>>> So, we have a problem :-(
>>>>
>>>>
>>>> I can work toward getting the right info on device tree, but in reality
>>>> that is going to take time and for now the device tree doesn't have the
>>>> FPGA aperture in it. So if we accept this series as is, it is going to
>>>> stop features like [1] from working. >
>>>> If we cannot come up with any better plans, I think it would be better
>>>> to drop find_memory_holes, only rely on find_unallocated_memory even
>>>> when the IOMMU is on. One idea is that we could add on top of the
>>>> regions found by find_unallocated_memory any MMIO regions marked as
>>>> xen,passthrough: they are safe because they are not going to dom0 anyway.
>>> (Oleksandr, it looks like some rationale about the different approach is
>>> missing in the commit message. Can you add it?)
>> Yes sure, but let me please clarify what is different approach in this
>> context. Is it to *also* take into the account MMIO regions of the devices for
>> passthrough for case when IOMMU is off (in addition to unallocated memory)? If
>> yes, I wonder whether we will gain much with that according to that device's
>> MMIO regions are usually not big enough and we stick to allocate extended
>> regions with bigger size (> 64MB).
> That's fair enough. There are a couple of counter examples where the
> MMIO regions for the device to assign are quite large, for instance a
> GPU, Xilinx AIEngine, or the PCIe Root Complex with the entire aperture,
> but maybe they are not that common. I am not sure if it is worth
> scanning the tree for xen,passthrough regions every time at boot for
> this.

OK, I will add a few sentences to the commit message about this different
approach for now. At least this could be implemented later on if there
is a need.
Oleksandr Tyshchenko Sept. 21, 2021, 7:43 p.m. UTC | #22
On 18.09.21 00:56, Stefano Stabellini wrote:

Hi Stefano

> On Fri, 17 Sep 2021, Oleksandr wrote:
>>>> +
>>>> +    dt_dprintk("Find unallocated memory for extended regions\n");
>>>> +
>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>> +    if ( !unalloc_mem )
>>>> +        return -ENOMEM;
>>>> +
>>>> +    /* Start with all available RAM */
>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>> +    {
>>>> +        start = bootinfo.mem.bank[i].start;
>>>> +        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
>>>> +        res = rangeset_add_range(unalloc_mem, start, end);
>>>> +        if ( res )
>>>> +        {
>>>> +            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>> +                   start, end);
>>>> +            goto out;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Remove RAM assigned to Dom0 */
>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>> +    {
>>>> +        start = assign_mem->bank[i].start;
>>>> +        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>> +        if ( res )
>>>> +        {
>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>> +                   start, end);
>>>> +            goto out;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Remove reserved-memory regions */
>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>>>> +    {
>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>> +        if ( res )
>>>> +        {
>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>> +                   start, end);
>>>> +            goto out;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Remove grant table region */
>>>> +    start = kinfo->gnttab_start;
>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>> +    if ( res )
>>>> +    {
>>>> +        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
>>>> +               start, end);
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    start = EXT_REGION_START;
>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>> +                                 add_ext_regions, ext_regions);
>>>> +    if ( res )
>>>> +        ext_regions->nr_banks = 0;
>>>> +    else if ( !ext_regions->nr_banks )
>>>> +        res = -ENOENT;
>>>> +
>>>> +out:
>>>> +    rangeset_destroy(unalloc_mem);
>>>> +
>>>> +    return res;
>>>> +}
>>>> +
>>>> +static int __init find_memory_holes(const struct kernel_info *kinfo,
>>>> +                                    struct meminfo *ext_regions)
>>>> +{
>>>> +    struct dt_device_node *np;
>>>> +    struct rangeset *mem_holes;
>>>> +    paddr_t start, end;
>>>> +    unsigned int i;
>>>> +    int res;
>>>> +
>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>> +
>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>> +    if ( !mem_holes )
>>>> +        return -ENOMEM;
>>>> +
>>>> +    /* Start with maximum possible addressable physical memory range */
>>>> +    start = EXT_REGION_START;
>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>> +    if ( res )
>>>> +    {
>>>> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
>>>> +               start, end);
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
>>> Well... The loop below is not going to handle all the regions described in
>>> the property "reg". Instead, it will cover a subset of "reg" where the
>>> memory is addressable.
>> As I understand, we are only interested in subset of "reg" where the memory is
>> addressable.
>>
>>
>>>
>>> You will also need to cover "ranges" that will describe the BARs for the PCI
>>> devices.
>> Good point.
> Yes, very good point!
>
>
>> Could you please clarify how to recognize whether it is a PCI
>> device as long as PCI support is not merged? Or just to find any device nodes
>> with non-empty "ranges" property
>> and retrieve addresses?
> Normally any bus can have a ranges property with the aperture and
> possible address translations, including /amba (compatible =
> "simple-bus"). However, in these cases dt_device_get_address already
> takes care of it, see xen/common/device_tree.c:dt_device_get_address.
>
> The PCI bus is special for 2 reasons:
> - the ranges property has a different format
> - the bus is hot-pluggable
>
> So I think the only one that we need to treat specially is PCI.
>
> As far as I am aware PCI is the only bus (or maybe just the only bus
> that we support?) where ranges means the aperture.
Thank you for the clarification. I need to find device nodes with a
non-empty "ranges" property
(and make sure that the device_type property is "pci"); after that I need to
read the contents of the "ranges" property and translate it.
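
A rough sketch of that plan, assuming Xen's device-tree helpers behave as
I expect (dt_for_each_device_node, dt_device_type_is_equal,
dt_get_property, dt_read_number, dt_n_addr_cells). The cell layout (3
child address cells and 2 size cells on the PCI side) follows the PCI
binding, and the translation is deliberately simplified, so treat this as
illustration only, not as the patch:

static int __init remove_pci_apertures(struct rangeset *mem_holes)
{
    struct dt_device_node *np;
    int res;

    dt_for_each_device_node( dt_host, np )
    {
        const __be32 *cell;
        u32 len;
        /* PCI binding: 3 child address cells, 2 size cells (assumed here) */
        unsigned int na = 3, ns = 2, pna, nr, i;

        if ( !dt_device_type_is_equal(np, "pci") )
            continue;

        cell = dt_get_property(np, "ranges", &len);
        if ( !cell || !len )
            continue;

        /* Number of address cells used by the parent bus for the CPU address */
        pna = dt_n_addr_cells(np);
        nr = len / (sizeof(__be32) * (na + pna + ns));

        for ( i = 0; i < nr; i++, cell += na + pna + ns )
        {
            /* Skip the PCI (child) address, read the CPU address and size */
            paddr_t addr = dt_read_number(cell + na, pna);
            paddr_t size = dt_read_number(cell + na + pna, ns);

            res = rangeset_remove_range(mem_holes, addr, addr + size - 1);
            if ( res )
                return res;
        }
    }

    return 0;
}

This deliberately ignores nested translations and I/O space entries; a
real implementation would also have to cope with the hot-pluggable
nature of the bus mentioned above.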
Stefano Stabellini Sept. 21, 2021, 10 p.m. UTC | #23
On Tue, 21 Sep 2021, Oleksandr wrote:
> On 21.09.21 02:21, Stefano Stabellini wrote:
> > On Sun, 19 Sep 2021, Oleksandr wrote:
> > > > On 18/09/2021 03:37, Stefano Stabellini wrote:
> > > > > On Fri, 17 Sep 2021, Stefano Stabellini wrote:
> > > > > > On Fri, 17 Sep 2021, Oleksandr wrote:
> > > > > > > > > +
> > > > > > > > > +    dt_dprintk("Find unallocated memory for extended
> > > > > > > > > regions\n");
> > > > > > > > > +
> > > > > > > > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > > > > > > > +    if ( !unalloc_mem )
> > > > > > > > > +        return -ENOMEM;
> > > > > > > > > +
> > > > > > > > > +    /* Start with all available RAM */
> > > > > > > > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > > > > > > > +    {
> > > > > > > > > +        start = bootinfo.mem.bank[i].start;
> > > > > > > > > +        end = bootinfo.mem.bank[i].start +
> > > > > > > > > bootinfo.mem.bank[i].size - 1;
> > > > > > > > > +        res = rangeset_add_range(unalloc_mem, start, end);
> > > > > > > > > +        if ( res )
> > > > > > > > > +        {
> > > > > > > > > +            printk(XENLOG_ERR "Failed to add:
> > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > +                   start, end);
> > > > > > > > > +            goto out;
> > > > > > > > > +        }
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    /* Remove RAM assigned to Dom0 */
> > > > > > > > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > > > > > > > +    {
> > > > > > > > > +        start = assign_mem->bank[i].start;
> > > > > > > > > +        end = assign_mem->bank[i].start +
> > > > > > > > > assign_mem->bank[i].size - 1;
> > > > > > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > > > > +        if ( res )
> > > > > > > > > +        {
> > > > > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > +                   start, end);
> > > > > > > > > +            goto out;
> > > > > > > > > +        }
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    /* Remove reserved-memory regions */
> > > > > > > > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> > > > > > > > > +    {
> > > > > > > > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > > > > > > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > > > > > > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > > > > > > > +        res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > > > > +        if ( res )
> > > > > > > > > +        {
> > > > > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > +                   start, end);
> > > > > > > > > +            goto out;
> > > > > > > > > +        }
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    /* Remove grant table region */
> > > > > > > > > +    start = kinfo->gnttab_start;
> > > > > > > > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > > > > > > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > > > > +    if ( res )
> > > > > > > > > +    {
> > > > > > > > > +        printk(XENLOG_ERR "Failed to remove:
> > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > +               start, end);
> > > > > > > > > +        goto out;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    start = EXT_REGION_START;
> > > > > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > > > > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > > > > > > > +                                 add_ext_regions,
> > > > > > > > > ext_regions);
> > > > > > > > > +    if ( res )
> > > > > > > > > +        ext_regions->nr_banks = 0;
> > > > > > > > > +    else if ( !ext_regions->nr_banks )
> > > > > > > > > +        res = -ENOENT;
> > > > > > > > > +
> > > > > > > > > +out:
> > > > > > > > > +    rangeset_destroy(unalloc_mem);
> > > > > > > > > +
> > > > > > > > > +    return res;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int __init find_memory_holes(const struct kernel_info
> > > > > > > > > *kinfo,
> > > > > > > > > +                                    struct meminfo
> > > > > > > > > *ext_regions)
> > > > > > > > > +{
> > > > > > > > > +    struct dt_device_node *np;
> > > > > > > > > +    struct rangeset *mem_holes;
> > > > > > > > > +    paddr_t start, end;
> > > > > > > > > +    unsigned int i;
> > > > > > > > > +    int res;
> > > > > > > > > +
> > > > > > > > > +    dt_dprintk("Find memory holes for extended regions\n");
> > > > > > > > > +
> > > > > > > > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > > > > > > > +    if ( !mem_holes )
> > > > > > > > > +        return -ENOMEM;
> > > > > > > > > +
> > > > > > > > > > +    /* Start with maximum possible addressable physical memory range */
> > > > > > > > > +    start = EXT_REGION_START;
> > > > > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> > > > > > > > > +    res = rangeset_add_range(mem_holes, start, end);
> > > > > > > > > +    if ( res )
> > > > > > > > > +    {
> > > > > > > > > > +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > +               start, end);
> > > > > > > > > +        goto out;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > > +    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
> > > > > > > > Well... The loop below is not going to handle all the regions
> > > > > > > > described in
> > > > > > > > the property "reg". Instead, it will cover a subset of "reg"
> > > > > > > > where
> > > > > > > > the
> > > > > > > > memory is addressable.
> > > > > > > As I understand, we are only interested in subset of "reg" where
> > > > > > > the
> > > > > > > memory is
> > > > > > > addressable.
> > > > > > > 
> > > > > > > 
> > > > > > > > 
> > > > > > > > You will also need to cover "ranges" that will describe the BARs
> > > > > > > > for
> > > > > > > > the PCI
> > > > > > > > devices.
> > > > > > > Good point.
> > > > > > Yes, very good point!
> > > > > > 
> > > > > > 
> > > > > > > Could you please clarify how to recognize whether it is a PCI
> > > > > > > device as long as PCI support is not merged? Or just to find any
> > > > > > > device nodes
> > > > > > > with non-empty "ranges" property
> > > > > > > and retrieve addresses?
> > > > > > Normally any bus can have a ranges property with the aperture and
> > > > > > possible address translations, including /amba (compatible =
> > > > > > "simple-bus"). However, in these cases dt_device_get_address already
> > > > > > takes care of it, see
> > > > > > xen/common/device_tree.c:dt_device_get_address.
> > > > > > 
> > > > > > The PCI bus is special for 2 reasons:
> > > > > > - the ranges property has a different format
> > > > > > - the bus is hot-pluggable
> > > > > > 
> > > > > > So I think the only one that we need to treat specially is PCI.
> > > > > > 
> > > > > > As far as I am aware PCI is the only bus (or maybe just the only bus
> > > > > > that we support?) where ranges means the aperture.
> > > > > Now that I think about this, there is another "hotpluggable" scenario
> > > > > we
> > > > > need to think about:
> > > > > 
> > > > > [1] https://marc.info/?l=xen-devel&m=163056546214978
> > > > > 
> > > > > Xilinx devices have FPGA regions with apertures currently not
> > > > > described
> > > > > in device tree, where things can be programmed in PL at runtime making
> > > > > new
> > > > > devices appear with new MMIO regions out of thin air.
> > > > > 
> > > > > Now let me start by saying that yes, the entire programmable region
> > > > > aperture could probably be described in device tree, however, in
> > > > > reality it is not currently done in any of the device trees we use
> > > > > (including the upstream device trees in linux.git).
> > > > This is rather annoying, but not unheard. There are a couple of
> > > > platforms
> > > > where the MMIOs are not fully described in the DT.
> > > > 
> > > > In fact, we have a callback 'specific_mappings' which create additional
> > > > mappings (e.g. on the omap5) for dom0.
> > > > 
> > > > > So, we have a problem :-(
> > > > > 
> > > > > 
> > > > > I can work toward getting the right info on device tree, but in
> > > > > reality
> > > > > that is going to take time and for now the device tree doesn't have
> > > > > the
> > > > > FPGA aperture in it. So if we accept this series as is, it is going to
> > > > > stop features like [1] from working. >
> > > > > If we cannot come up with any better plans, I think it would be better
> > > > > to drop find_memory_holes, only rely on find_unallocated_memory even
> > > > > when the IOMMU is on. One idea is that we could add on top of the
> > > > > regions found by find_unallocated_memory any MMIO regions marked as
> > > > > xen,passthrough: they are safe because they are not going to dom0
> > > > > anyway.
> > > > (Oleksandr, it looks like some rationale about the different approach is
> > > > missing in the commit message. Can you add it?)
> > > Yes sure, but let me please clarify what is different approach in this
> > > context. Is it to *also* take into the account MMIO regions of the devices
> > > for
> > > passthrough for case when IOMMU is off (in addition to unallocated
> > > memory)? If
> > > yes, I wonder whether we will gain much with that according to that
> > > device's
> > > MMIO regions are usually not big enough and we stick to allocate extended
> > > regions with bigger size (> 64MB).
> > That's fair enough. There are a couple of counter examples where the
> > MMIO regions for the device to assign are quite large, for instance a
> > GPU, Xilinx AIEngine, or the PCIe Root Complex with the entire aperture,
> > but maybe they are not that common. I am not sure if it is worth
> > scanning the tree for xen,passthrough regions every time at boot for
> > this.
> 
> ok, I will add a few sentences to commit message about this different approach
> for now. At least this could be implemented later on if there is a need.

One thing that worries me about this is that if we take an old Xen with
this series and run it on a new board, it might cause problems. At the
very least [1] wouldn't work.

Can we have a Xen command line argument to disable extended regions as
an emergency toggle?


[1] https://marc.info/?l=xen-devel&m=163056546214978
Oleksandr Tyshchenko Sept. 22, 2021, 6:18 p.m. UTC | #24
Hi Stefano


[snip]


>>>
>>>
>>>>
>>>> You will also need to cover "ranges" that will describe the BARs 
>>>> for the PCI
>>>> devices.
>>> Good point.
>> Yes, very good point!
>>
>>
>>> Could you please clarify how to recognize whether it is a PCI
>>> device as long as PCI support is not merged? Or just to find any 
>>> device nodes
>>> with non-empty "ranges" property
>>> and retrieve addresses?
>> Normally any bus can have a ranges property with the aperture and
>> possible address translations, including /amba (compatible =
>> "simple-bus"). However, in these cases dt_device_get_address already
>> takes care of it, see xen/common/device_tree.c:dt_device_get_address.
>>
>> The PCI bus is special for 2 reasons:
>> - the ranges property has a different format
>> - the bus is hot-pluggable
>>
>> So I think the only one that we need to treat specially is PCI.
>>
>> As far as I am aware PCI is the only bus (or maybe just the only bus
>> that we support?) where ranges means the aperture.
> Thank you for the clarification. I need to find device node with 
> non-empty ranges property
> (and make sure that device_type property is "pci"), after that I need 
> to read the context of ranges property and translate it.
>
>

OK, I experimented with that and managed to parse the ranges property for
the PCI host bridge nodes.

I tested on my setup where the host device tree contains two PCI host 
bridge nodes with the following:

pcie@fe000000 {
...
             ranges = <0x1000000 0x0 0x0 0x0 0xfe100000 0x0 0x100000 
0x2000000 0x0 0xfe200000 0x0 0xfe200000 0x0 0x200000 0x2000000 0x0 
0x30000000 0x0 0x30000000 0x0 0x8000000 0x42000000 0x0 0x38000000 0x0 
0x38000000 0x0 0x8000000>;
...
};

pcie@ee800000 {
...
             ranges = <0x1000000 0x0 0x0 0x0 0xee900000 0x0 0x100000 
0x2000000 0x0 0xeea00000 0x0 0xeea00000 0x0 0x200000 0x2000000 0x0 
0xc0000000 0x0 0xc0000000 0x0 0x8000000 0x42000000 0x0 0xc8000000 0x0 
0xc8000000 0x0 0x8000000>;
...
};

So Xen retrieves the *CPU addresses* from the ranges:

(XEN) dev /soc/pcie@fe000000 range_size 7 nr_ranges 4
(XEN) 0: addr=fe100000, size=100000
(XEN) 1: addr=fe200000, size=200000
(XEN) 2: addr=30000000, size=8000000
(XEN) 3: addr=38000000, size=8000000
(XEN) dev /soc/pcie@ee800000 range_size 7 nr_ranges 4
(XEN) 0: addr=ee900000, size=100000
(XEN) 1: addr=eea00000, size=200000
(XEN) 2: addr=c0000000, size=8000000
(XEN) 3: addr=c8000000, size=8000000

The code below covers the ranges property in the context of finding memory
holes (to be squashed into the current patch):

diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index d37156a..7d20c10 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -834,6 +834,8 @@ static int __init find_memory_holes(struct meminfo *ext_regions)
      {
          unsigned int naddr;
          u64 addr, size;
+        const __be32 *ranges;
+        u32 len;

          naddr = dt_number_of_address(np);

@@ -857,6 +859,41 @@ static int __init find_memory_holes(struct meminfo *ext_regions)
                  goto out;
              }
          }
+
+        /*
+         * Also looking for non-empty ranges property which would likely mean
+         * that we deal with PCI host bridge device and the property here
+         * describes the BARs for the PCI devices.
+         */
+        ranges = dt_get_property(np, "ranges", &len);
+        if ( ranges && len )
+        {
+            unsigned int range_size, nr_ranges;
+            int na, ns, pna;
+
+            pna = dt_n_addr_cells(np);
+            na = dt_child_n_addr_cells(np);
+            ns = dt_child_n_size_cells(np);
+            range_size = pna + na + ns;
+            nr_ranges = len / sizeof(__be32) / range_size;
+
+            for ( i = 0; i < nr_ranges; i++, ranges += range_size )
+            {
+                /* Skip the child address and get the parent (CPU) address */
+                addr = dt_read_number(ranges + na, pna);
+                size = dt_read_number(ranges + na + pna, ns);
+
+                start = addr & PAGE_MASK;
+                end = PAGE_ALIGN(addr + size);
+                res = rangeset_remove_range(mem_holes, start, end - 1);
+                if ( res )
+                {
+                    printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
+                           start, end);
+                    goto out;
+                }
+            }
+        }
      }

      start = 0;
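
To make the cell arithmetic concrete, below is a small standalone decode of
the third entry of the pcie@fe000000 ranges quoted above. It is only an
illustration: host-order integers are used instead of real __be32 cells, and
the 3/2/2 split of child address / parent address / size cells is inferred
from the "range_size 7" in the log rather than taken from the full DT.

#include <inttypes.h>
#include <stdio.h>

/* Fold 'count' 32-bit cells into one number, mirroring dt_read_number(). */
static uint64_t read_cells(const uint32_t *cells, unsigned int count)
{
    uint64_t val = 0;
    unsigned int i;

    for ( i = 0; i < count; i++ )
        val = (val << 32) | cells[i];

    return val;
}

int main(void)
{
    /*
     * Third entry of the pcie@fe000000 "ranges" above:
     *   0x2000000 0x0 0x30000000  0x0 0x30000000  0x0 0x8000000
     * na = 3 child (PCI) address cells, pna = 2 parent (CPU) address cells,
     * ns = 2 size cells, hence range_size = 7 cells per entry.
     */
    const uint32_t entry[] = {
        0x2000000, 0x0, 0x30000000, /* child (PCI) address, incl. phys.hi */
        0x0, 0x30000000,            /* parent (CPU) address               */
        0x0, 0x8000000,             /* size                               */
    };
    const unsigned int na = 3, pna = 2, ns = 2;
    uint64_t cpu_addr = read_cells(entry + na, pna);
    uint64_t size = read_cells(entry + na + pna, ns);

    /* Prints "addr=30000000, size=8000000", matching the Xen log above. */
    printf("addr=%" PRIx64 ", size=%" PRIx64 "\n", cpu_addr, size);

    return 0;
}
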
Oleksandr Tyshchenko Sept. 22, 2021, 6:25 p.m. UTC | #25
On 22.09.21 01:00, Stefano Stabellini wrote:

Hi Stefano

> On Tue, 21 Sep 2021, Oleksandr wrote:
>> On 21.09.21 02:21, Stefano Stabellini wrote:
>>> On Sun, 19 Sep 2021, Oleksandr wrote:
>>>>> On 18/09/2021 03:37, Stefano Stabellini wrote:
>>>>>> On Fri, 17 Sep 2021, Stefano Stabellini wrote:
>>>>>>> On Fri, 17 Sep 2021, Oleksandr wrote:
>>>>>>>>>> +
>>>>>>>>>> +    dt_dprintk("Find unallocated memory for extended
>>>>>>>>>> regions\n");
>>>>>>>>>> +
>>>>>>>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>>>>>>>> +    if ( !unalloc_mem )
>>>>>>>>>> +        return -ENOMEM;
>>>>>>>>>> +
>>>>>>>>>> +    /* Start with all available RAM */
>>>>>>>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>>>>>>>> +    {
>>>>>>>>>> +        start = bootinfo.mem.bank[i].start;
>>>>>>>>>> +        end = bootinfo.mem.bank[i].start +
>>>>>>>>>> bootinfo.mem.bank[i].size - 1;
>>>>>>>>>> +        res = rangeset_add_range(unalloc_mem, start, end);
>>>>>>>>>> +        if ( res )
>>>>>>>>>> +        {
>>>>>>>>>> +            printk(XENLOG_ERR "Failed to add:
>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>> +                   start, end);
>>>>>>>>>> +            goto out;
>>>>>>>>>> +        }
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    /* Remove RAM assigned to Dom0 */
>>>>>>>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>>>>>>>> +    {
>>>>>>>>>> +        start = assign_mem->bank[i].start;
>>>>>>>>>> +        end = assign_mem->bank[i].start +
>>>>>>>>>> assign_mem->bank[i].size - 1;
>>>>>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>>>>>> +        if ( res )
>>>>>>>>>> +        {
>>>>>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>> +                   start, end);
>>>>>>>>>> +            goto out;
>>>>>>>>>> +        }
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    /* Remove reserved-memory regions */
>>>>>>>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
>>>>>>>>>> +    {
>>>>>>>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>>>>>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>>>>>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>>>>>>>> +        res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>>>>>> +        if ( res )
>>>>>>>>>> +        {
>>>>>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>> +                   start, end);
>>>>>>>>>> +            goto out;
>>>>>>>>>> +        }
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    /* Remove grant table region */
>>>>>>>>>> +    start = kinfo->gnttab_start;
>>>>>>>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>>>>>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>>>>>> +    if ( res )
>>>>>>>>>> +    {
>>>>>>>>>> +        printk(XENLOG_ERR "Failed to remove:
>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>> +               start, end);
>>>>>>>>>> +        goto out;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    start = EXT_REGION_START;
>>>>>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>>>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>>>>>>>> +                                 add_ext_regions,
>>>>>>>>>> ext_regions);
>>>>>>>>>> +    if ( res )
>>>>>>>>>> +        ext_regions->nr_banks = 0;
>>>>>>>>>> +    else if ( !ext_regions->nr_banks )
>>>>>>>>>> +        res = -ENOENT;
>>>>>>>>>> +
>>>>>>>>>> +out:
>>>>>>>>>> +    rangeset_destroy(unalloc_mem);
>>>>>>>>>> +
>>>>>>>>>> +    return res;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static int __init find_memory_holes(const struct kernel_info
>>>>>>>>>> *kinfo,
>>>>>>>>>> +                                    struct meminfo
>>>>>>>>>> *ext_regions)
>>>>>>>>>> +{
>>>>>>>>>> +    struct dt_device_node *np;
>>>>>>>>>> +    struct rangeset *mem_holes;
>>>>>>>>>> +    paddr_t start, end;
>>>>>>>>>> +    unsigned int i;
>>>>>>>>>> +    int res;
>>>>>>>>>> +
>>>>>>>>>> +    dt_dprintk("Find memory holes for extended regions\n");
>>>>>>>>>> +
>>>>>>>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>>>>>>>> +    if ( !mem_holes )
>>>>>>>>>> +        return -ENOMEM;
>>>>>>>>>> +
>>>>>>>>>> +    /* Start with maximum possible addressable physical
>>>>>>>>>> memory
>>>>>>>>>> range */
>>>>>>>>>> +    start = EXT_REGION_START;
>>>>>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
>>>>>>>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>>>>>>>> +    if ( res )
>>>>>>>>>> +    {
>>>>>>>>>> +        printk(XENLOG_ERR "Failed to add:
>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>> +               start, end);
>>>>>>>>>> +        goto out;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    /* Remove all regions described by "reg" property (MMIO,
>>>>>>>>>> RAM,
>>>>>>>>>> etc) */
>>>>>>>>> Well... The loop below is not going to handle all the regions
>>>>>>>>> described in
>>>>>>>>> the property "reg". Instead, it will cover a subset of "reg"
>>>>>>>>> where
>>>>>>>>> the
>>>>>>>>> memory is addressable.
>>>>>>>> As I understand, we are only interested in subset of "reg" where
>>>>>>>> the
>>>>>>>> memory is
>>>>>>>> addressable.
>>>>>>>>
>>>>>>>>
>>>>>>>>> You will also need to cover "ranges" that will describe the BARs
>>>>>>>>> for
>>>>>>>>> the PCI
>>>>>>>>> devices.
>>>>>>>> Good point.
>>>>>>> Yes, very good point!
>>>>>>>
>>>>>>>
>>>>>>>> Could you please clarify how to recognize whether it is a PCI
>>>>>>>> device as long as PCI support is not merged? Or just to find any
>>>>>>>> device nodes
>>>>>>>> with non-empty "ranges" property
>>>>>>>> and retrieve addresses?
>>>>>>> Normally any bus can have a ranges property with the aperture and
>>>>>>> possible address translations, including /amba (compatible =
>>>>>>> "simple-bus"). However, in these cases dt_device_get_address already
>>>>>>> takes care of it, see
>>>>>>> xen/common/device_tree.c:dt_device_get_address.
>>>>>>>
>>>>>>> The PCI bus is special for 2 reasons:
>>>>>>> - the ranges property has a different format
>>>>>>> - the bus is hot-pluggable
>>>>>>>
>>>>>>> So I think the only one that we need to treat specially is PCI.
>>>>>>>
>>>>>>> As far as I am aware PCI is the only bus (or maybe just the only bus
>>>>>>> that we support?) where ranges means the aperture.
>>>>>> Now that I think about this, there is another "hotpluggable" scenario
>>>>>> we
>>>>>> need to think about:
>>>>>>
>>>>>> [1] https://marc.info/?l=xen-devel&m=163056546214978
>>>>>>
>>>>>> Xilinx devices have FPGA regions with apertures currently not
>>>>>> described
>>>>>> in device tree, where things can programmed in PL at runtime making
>>>>>> new
>>>>>> devices appear with new MMIO regions out of thin air.
>>>>>>
>>>>>> Now let me start by saying that yes, the entire programmable region
>>>>>> aperture could probably be described in device tree, however, in
>>>>>> reality it is not currently done in any of the device trees we use
>>>>>> (including the upstream device trees in linux.git).
>>>>> This is rather annoying, but not unheard. There are a couple of
>>>>> platforms
>>>>> where the MMIOs are not fully described in the DT.
>>>>>
>>>>> In fact, we have a callback 'specific_mappings' which create additional
>>>>> mappings (e.g. on the omap5) for dom0.
>>>>>
>>>>>> So, we have a problem :-(
>>>>>>
>>>>>>
>>>>>> I can work toward getting the right info on device tree, but in
>>>>>> reality
>>>>>> that is going to take time and for now the device tree doesn't have
>>>>>> the
>>>>>> FPGA aperture in it. So if we accept this series as is, it is going to
>>>>>> stop features like [1] from working. >
>>>>>> If we cannot come up with any better plans, I think it would be better
>>>>>> to drop find_memory_holes, only rely on find_unallocated_memory even
>>>>>> when the IOMMU is on. One idea is that we could add on top of the
>>>>>> regions found by find_unallocated_memory any MMIO regions marked as
>>>>>> xen,passthrough: they are safe because they are not going to dom0
>>>>>> anyway.
>>>>> (Oleksandr, it looks like some rationale about the different approach is
>>>>> missing in the commit message. Can you add it?)
>>>> Yes sure, but let me please clarify what is different approach in this
>>>> context. Is it to *also* take into the account MMIO regions of the devices
>>>> for
>>>> passthrough for case when IOMMU is off (in addition to unallocated
>>>> memory)? If
>>>> yes, I wonder whether we will gain much with that according to that
>>>> device's
>>>> MMIO regions are usually not big enough and we stick to allocate extended
>>>> regions with bigger size (> 64MB).
>>> That's fair enough. There are a couple of counter examples where the
>>> MMIO regions for the device to assign are quite large, for instance a
>>> GPU, Xilinx AIEngine, or the PCIe Root Complex with the entire aperture,
>>> but maybe they are not that common. I am not sure if it is worth
>>> scanning the tree for xen,passthrough regions every time at boot for
>>> this.
>> ok, I will add a few sentences to commit message about this different approach
>> for now. At least this could be implemented later on if there is a need.
> One thing that worries me about this is that if we take an old Xen with
> this series and run it on a new board, it might cause problems. At the
> very least [1] wouldn't work.

I got it.


>
> Can we have a Xen command line argument to disable extended regions as
> an emergecy toggle?

I think, yes. If there is no preference for the argument name, I will name
it "no-ext-region".

>
>
> [1] https://marc.info/?l=xen-devel&m=163056546214978
Stefano Stabellini Sept. 22, 2021, 8:50 p.m. UTC | #26
On Wed, 22 Sep 2021, Oleksandr wrote:
> On 22.09.21 01:00, Stefano Stabellini wrote:
> > On Tue, 21 Sep 2021, Oleksandr wrote:
> > > On 21.09.21 02:21, Stefano Stabellini wrote:
> > > > On Sun, 19 Sep 2021, Oleksandr wrote:
> > > > > > On 18/09/2021 03:37, Stefano Stabellini wrote:
> > > > > > > On Fri, 17 Sep 2021, Stefano Stabellini wrote:
> > > > > > > > On Fri, 17 Sep 2021, Oleksandr wrote:
> > > > > > > > > > > +
> > > > > > > > > > > +    dt_dprintk("Find unallocated memory for extended
> > > > > > > > > > > regions\n");
> > > > > > > > > > > +
> > > > > > > > > > > +    unalloc_mem = rangeset_new(NULL, NULL, 0);
> > > > > > > > > > > +    if ( !unalloc_mem )
> > > > > > > > > > > +        return -ENOMEM;
> > > > > > > > > > > +
> > > > > > > > > > > +    /* Start with all available RAM */
> > > > > > > > > > > +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> > > > > > > > > > > +    {
> > > > > > > > > > > +        start = bootinfo.mem.bank[i].start;
> > > > > > > > > > > +        end = bootinfo.mem.bank[i].start +
> > > > > > > > > > > bootinfo.mem.bank[i].size - 1;
> > > > > > > > > > > +        res = rangeset_add_range(unalloc_mem, start,
> > > > > > > > > > > end);
> > > > > > > > > > > +        if ( res )
> > > > > > > > > > > +        {
> > > > > > > > > > > +            printk(XENLOG_ERR "Failed to add:
> > > > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > > > +                   start, end);
> > > > > > > > > > > +            goto out;
> > > > > > > > > > > +        }
> > > > > > > > > > > +    }
> > > > > > > > > > > +
> > > > > > > > > > > +    /* Remove RAM assigned to Dom0 */
> > > > > > > > > > > +    for ( i = 0; i < assign_mem->nr_banks; i++ )
> > > > > > > > > > > +    {
> > > > > > > > > > > +        start = assign_mem->bank[i].start;
> > > > > > > > > > > +        end = assign_mem->bank[i].start +
> > > > > > > > > > > assign_mem->bank[i].size - 1;
> > > > > > > > > > > +        res = rangeset_remove_range(unalloc_mem, start,
> > > > > > > > > > > end);
> > > > > > > > > > > +        if ( res )
> > > > > > > > > > > +        {
> > > > > > > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > > > +                   start, end);
> > > > > > > > > > > +            goto out;
> > > > > > > > > > > +        }
> > > > > > > > > > > +    }
> > > > > > > > > > > +
> > > > > > > > > > > +    /* Remove reserved-memory regions */
> > > > > > > > > > > +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++
> > > > > > > > > > > )
> > > > > > > > > > > +    {
> > > > > > > > > > > +        start = bootinfo.reserved_mem.bank[i].start;
> > > > > > > > > > > +        end = bootinfo.reserved_mem.bank[i].start +
> > > > > > > > > > > +            bootinfo.reserved_mem.bank[i].size - 1;
> > > > > > > > > > > +        res = rangeset_remove_range(unalloc_mem, start,
> > > > > > > > > > > end);
> > > > > > > > > > > +        if ( res )
> > > > > > > > > > > +        {
> > > > > > > > > > > +            printk(XENLOG_ERR "Failed to remove:
> > > > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > > > +                   start, end);
> > > > > > > > > > > +            goto out;
> > > > > > > > > > > +        }
> > > > > > > > > > > +    }
> > > > > > > > > > > +
> > > > > > > > > > > +    /* Remove grant table region */
> > > > > > > > > > > +    start = kinfo->gnttab_start;
> > > > > > > > > > > +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
> > > > > > > > > > > +    res = rangeset_remove_range(unalloc_mem, start, end);
> > > > > > > > > > > +    if ( res )
> > > > > > > > > > > +    {
> > > > > > > > > > > +        printk(XENLOG_ERR "Failed to remove:
> > > > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > > > +               start, end);
> > > > > > > > > > > +        goto out;
> > > > > > > > > > > +    }
> > > > > > > > > > > +
> > > > > > > > > > > +    start = EXT_REGION_START;
> > > > > > > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1,
> > > > > > > > > > > EXT_REGION_END);
> > > > > > > > > > > +    res = rangeset_report_ranges(unalloc_mem, start, end,
> > > > > > > > > > > +                                 add_ext_regions,
> > > > > > > > > > > ext_regions);
> > > > > > > > > > > +    if ( res )
> > > > > > > > > > > +        ext_regions->nr_banks = 0;
> > > > > > > > > > > +    else if ( !ext_regions->nr_banks )
> > > > > > > > > > > +        res = -ENOENT;
> > > > > > > > > > > +
> > > > > > > > > > > +out:
> > > > > > > > > > > +    rangeset_destroy(unalloc_mem);
> > > > > > > > > > > +
> > > > > > > > > > > +    return res;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static int __init find_memory_holes(const struct
> > > > > > > > > > > kernel_info
> > > > > > > > > > > *kinfo,
> > > > > > > > > > > +                                    struct meminfo
> > > > > > > > > > > *ext_regions)
> > > > > > > > > > > +{
> > > > > > > > > > > +    struct dt_device_node *np;
> > > > > > > > > > > +    struct rangeset *mem_holes;
> > > > > > > > > > > +    paddr_t start, end;
> > > > > > > > > > > +    unsigned int i;
> > > > > > > > > > > +    int res;
> > > > > > > > > > > +
> > > > > > > > > > > +    dt_dprintk("Find memory holes for extended
> > > > > > > > > > > regions\n");
> > > > > > > > > > > +
> > > > > > > > > > > +    mem_holes = rangeset_new(NULL, NULL, 0);
> > > > > > > > > > > +    if ( !mem_holes )
> > > > > > > > > > > +        return -ENOMEM;
> > > > > > > > > > > +
> > > > > > > > > > > +    /* Start with maximum possible addressable physical
> > > > > > > > > > > memory
> > > > > > > > > > > range */
> > > > > > > > > > > +    start = EXT_REGION_START;
> > > > > > > > > > > +    end = min((1ULL << p2m_ipa_bits) - 1,
> > > > > > > > > > > EXT_REGION_END);
> > > > > > > > > > > +    res = rangeset_add_range(mem_holes, start, end);
> > > > > > > > > > > +    if ( res )
> > > > > > > > > > > +    {
> > > > > > > > > > > +        printk(XENLOG_ERR "Failed to add:
> > > > > > > > > > > %#"PRIx64"->%#"PRIx64"\n",
> > > > > > > > > > > +               start, end);
> > > > > > > > > > > +        goto out;
> > > > > > > > > > > +    }
> > > > > > > > > > > +
> > > > > > > > > > > +    /* Remove all regions described by "reg" property
> > > > > > > > > > > (MMIO,
> > > > > > > > > > > RAM,
> > > > > > > > > > > etc) */
> > > > > > > > > > Well... The loop below is not going to handle all the
> > > > > > > > > > regions
> > > > > > > > > > described in
> > > > > > > > > > the property "reg". Instead, it will cover a subset of "reg"
> > > > > > > > > > where
> > > > > > > > > > the
> > > > > > > > > > memory is addressable.
> > > > > > > > > As I understand, we are only interested in subset of "reg"
> > > > > > > > > where
> > > > > > > > > the
> > > > > > > > > memory is
> > > > > > > > > addressable.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > > You will also need to cover "ranges" that will describe the
> > > > > > > > > > BARs
> > > > > > > > > > for
> > > > > > > > > > the PCI
> > > > > > > > > > devices.
> > > > > > > > > Good point.
> > > > > > > > Yes, very good point!
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > Could you please clarify how to recognize whether it is a PCI
> > > > > > > > > device as long as PCI support is not merged? Or just to find
> > > > > > > > > any
> > > > > > > > > device nodes
> > > > > > > > > with non-empty "ranges" property
> > > > > > > > > and retrieve addresses?
> > > > > > > > Normally any bus can have a ranges property with the aperture
> > > > > > > > and
> > > > > > > > possible address translations, including /amba (compatible =
> > > > > > > > "simple-bus"). However, in these cases dt_device_get_address
> > > > > > > > already
> > > > > > > > takes care of it, see
> > > > > > > > xen/common/device_tree.c:dt_device_get_address.
> > > > > > > > 
> > > > > > > > The PCI bus is special for 2 reasons:
> > > > > > > > - the ranges property has a different format
> > > > > > > > - the bus is hot-pluggable
> > > > > > > > 
> > > > > > > > So I think the only one that we need to treat specially is PCI.
> > > > > > > > 
> > > > > > > > As far as I am aware PCI is the only bus (or maybe just the only
> > > > > > > > bus
> > > > > > > > that we support?) where ranges means the aperture.
> > > > > > > Now that I think about this, there is another "hotpluggable"
> > > > > > > scenario
> > > > > > > we
> > > > > > > need to think about:
> > > > > > > 
> > > > > > > [1] https://marc.info/?l=xen-devel&m=163056546214978
> > > > > > > 
> > > > > > > Xilinx devices have FPGA regions with apertures currently not
> > > > > > > described
> > > > > > > in device tree, where things can programmed in PL at runtime
> > > > > > > making
> > > > > > > new
> > > > > > > devices appear with new MMIO regions out of thin air.
> > > > > > > 
> > > > > > > Now let me start by saying that yes, the entire programmable
> > > > > > > region
> > > > > > > aperture could probably be described in device tree, however, in
> > > > > > > reality it is not currently done in any of the device trees we use
> > > > > > > (including the upstream device trees in linux.git).
> > > > > > This is rather annoying, but not unheard. There are a couple of
> > > > > > platforms
> > > > > > where the MMIOs are not fully described in the DT.
> > > > > > 
> > > > > > In fact, we have a callback 'specific_mappings' which create
> > > > > > additional
> > > > > > mappings (e.g. on the omap5) for dom0.
> > > > > > 
> > > > > > > So, we have a problem :-(
> > > > > > > 
> > > > > > > 
> > > > > > > I can work toward getting the right info on device tree, but in
> > > > > > > reality
> > > > > > > that is going to take time and for now the device tree doesn't
> > > > > > > have
> > > > > > > the
> > > > > > > FPGA aperture in it. So if we accept this series as is, it is
> > > > > > > going to
> > > > > > > stop features like [1] from working. >
> > > > > > > If we cannot come up with any better plans, I think it would be
> > > > > > > better
> > > > > > > to drop find_memory_holes, only rely on find_unallocated_memory
> > > > > > > even
> > > > > > > when the IOMMU is on. One idea is that we could add on top of the
> > > > > > > regions found by find_unallocated_memory any MMIO regions marked
> > > > > > > as
> > > > > > > xen,passthrough: they are safe because they are not going to dom0
> > > > > > > anyway.
> > > > > > (Oleksandr, it looks like some rationale about the different
> > > > > > approach is
> > > > > > missing in the commit message. Can you add it?)
> > > > > Yes sure, but let me please clarify what is different approach in this
> > > > > context. Is it to *also* take into the account MMIO regions of the
> > > > > devices
> > > > > for
> > > > > passthrough for case when IOMMU is off (in addition to unallocated
> > > > > memory)? If
> > > > > yes, I wonder whether we will gain much with that according to that
> > > > > device's
> > > > > MMIO regions are usually not big enough and we stick to allocate
> > > > > extended
> > > > > regions with bigger size (> 64MB).
> > > > That's fair enough. There are a couple of counter examples where the
> > > > MMIO regions for the device to assign are quite large, for instance a
> > > > GPU, Xilinx AIEngine, or the PCIe Root Complex with the entire aperture,
> > > > but maybe they are not that common. I am not sure if it is worth
> > > > scanning the tree for xen,passthrough regions every time at boot for
> > > > this.
> > > ok, I will add a few sentences to commit message about this different
> > > approach
> > > for now. At least this could be implemented later on if there is a need.
> > One thing that worries me about this is that if we take an old Xen with
> > this series and run it on a new board, it might cause problems. At the
> > very least [1] wouldn't work.
> 
> I got it.
> 
> 
> > 
> > Can we have a Xen command line argument to disable extended regions as
> > an emergecy toggle?
> 
> I think, yes. If no preference for the argument name I will name it
> "no-ext-region".

It is better to introduce it as ext-regions=yes/no with yes as default,
so that in the future we could extend it to ext-regions=start,size if
we wanted to.
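
On the Xen side such a toggle needs very little code. A minimal sketch,
assuming Xen's boolean_param() machinery and using "ext_regions" purely as a
placeholder until the final name/spelling is agreed:

/* Sketch only: boot-time switch for the extended regions, default on. */
static bool __initdata opt_ext_regions = true;
boolean_param("ext_regions", opt_ext_regions);

static bool __init use_ext_regions(void)
{
    if ( !opt_ext_regions )
    {
        printk(XENLOG_INFO
               "Extended regions have been disabled on the command line\n");
        return false;
    }

    return true;
}

The domain-building code would then simply skip the extended-regions
computation when use_ext_regions() returns false; extending the option to a
start,size form later would only mean switching to a custom parameter parser.
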
Stefano Stabellini Sept. 22, 2021, 9:05 p.m. UTC | #27
On Wed, 22 Sep 2021, Oleksandr wrote:
> > > > > You will also need to cover "ranges" that will describe the BARs for
> > > > > the PCI
> > > > > devices.
> > > > Good point.
> > > Yes, very good point!
> > > 
> > > 
> > > > Could you please clarify how to recognize whether it is a PCI
> > > > device as long as PCI support is not merged? Or just to find any device
> > > > nodes
> > > > with non-empty "ranges" property
> > > > and retrieve addresses?
> > > Normally any bus can have a ranges property with the aperture and
> > > possible address translations, including /amba (compatible =
> > > "simple-bus"). However, in these cases dt_device_get_address already
> > > takes care of it, see xen/common/device_tree.c:dt_device_get_address.
> > > 
> > > The PCI bus is special for 2 reasons:
> > > - the ranges property has a different format
> > > - the bus is hot-pluggable
> > > 
> > > So I think the only one that we need to treat specially is PCI.
> > > 
> > > As far as I am aware PCI is the only bus (or maybe just the only bus
> > > that we support?) where ranges means the aperture.
> > Thank you for the clarification. I need to find device node with non-empty
> > ranges property
> > (and make sure that device_type property is "pci"), after that I need to
> > read the context of ranges property and translate it.
> > 
> > 
> 
> OK, I experimented with that and managed to parse ranges property for PCI host
> bridge node.
> 
> I tested on my setup where the host device tree contains two PCI host bridge
> nodes with the following:
> 
> pcie@fe000000 {
> ...
>             ranges = <0x1000000 0x0 0x0 0x0 0xfe100000 0x0 0x100000 0x2000000
> 0x0 0xfe200000 0x0 0xfe200000 0x0 0x200000 0x2000000 0x0 0x30000000 0x0
> 0x30000000 0x0 0x8000000 0x42000000 0x0 0x38000000 0x0 0x38000000 0x0
> 0x8000000>;
> ...
> };
> 
> pcie@ee800000 {
> ...
>             ranges = <0x1000000 0x0 0x0 0x0 0xee900000 0x0 0x100000 0x2000000
> 0x0 0xeea00000 0x0 0xeea00000 0x0 0x200000 0x2000000 0x0 0xc0000000 0x0
> 0xc0000000 0x0 0x8000000 0x42000000 0x0 0xc8000000 0x0 0xc8000000 0x0
> 0x8000000>;
> ...
> };
> 
> So Xen retrieves the *CPU addresses* from the ranges:
> 
> (XEN) dev /soc/pcie@fe000000 range_size 7 nr_ranges 4
> (XEN) 0: addr=fe100000, size=100000
> (XEN) 1: addr=fe200000, size=200000
> (XEN) 2: addr=30000000, size=8000000
> (XEN) 3: addr=38000000, size=8000000
> (XEN) dev /soc/pcie@ee800000 range_size 7 nr_ranges 4
> (XEN) 0: addr=ee900000, size=100000
> (XEN) 1: addr=eea00000, size=200000
> (XEN) 2: addr=c0000000, size=8000000
> (XEN) 3: addr=c8000000, size=8000000
> 
> The code below covers ranges property in the context of finding memory holes
> (to be squashed with current patch):
> 
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index d37156a..7d20c10 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -834,6 +834,8 @@ static int __init find_memory_holes(struct meminfo
> *ext_regions)
>      {
>          unsigned int naddr;
>          u64 addr, size;
> +        const __be32 *ranges;
> +        u32 len;
> 
>          naddr = dt_number_of_address(np);
> 
> @@ -857,6 +859,41 @@ static int __init find_memory_holes(struct meminfo
> *ext_regions)
>                  goto out;
>              }
>          }
> +
> +        /*
> +         * Also looking for non-empty ranges property which would likely mean
> +         * that we deal with PCI host bridge device and the property here
> +         * describes the BARs for the PCI devices.
> +         */

One thing to be careful about is that a ranges property with a valid value
is not only present on PCI busses. It can be present on amba and other
simple-busses too. In that case the format of ranges is simpler, as it
doesn't have a "memory type" like PCI.

When you get addresses from reg, bus ranges properties are automatically
handled for you.

All of this to say that a check on "ranges" alone is not enough, because it
might capture other non-PCI busses that have a different, simpler
ranges format. You want to check for "ranges" only under nodes with
device_type = "pci" (a minimal sketch along these lines follows after the
quoted hunk below).


> +        ranges = dt_get_property(np, "ranges", &len);
> +        if ( ranges && len )
> +        {
> +            unsigned int range_size, nr_ranges;
> +            int na, ns, pna;
> +
> +            pna = dt_n_addr_cells(np);
> +            na = dt_child_n_addr_cells(np);
> +            ns = dt_child_n_size_cells(np);
> +            range_size = pna + na + ns;
> +            nr_ranges = len / sizeof(__be32) / range_size;
> +
> +            for ( i = 0; i < nr_ranges; i++, ranges += range_size )
> +            {
> +                /* Skip the child address and get the parent (CPU) address */
> +                addr = dt_read_number(ranges + na, pna);
> +                size = dt_read_number(ranges + na + pna, ns);
> +
> +                start = addr & PAGE_MASK;
> +                end = PAGE_ALIGN(addr + size);
> +                res = rangeset_remove_range(mem_holes, start, end - 1);
> +                if ( res )
> +                {
> +                    printk(XENLOG_ERR "Failed to remove:
> %#"PRIx64"->%#"PRIx64"\n",
> +                           start, end);
> +                    goto out;
> +                }
> +            }
> +        }
>      }
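
Following up on the device_type point above, a minimal rework of the hunk
(sketch only: it assumes the dt_device_type_is_equal() helper and reuses the
variables already declared in the surrounding loop) would gate the parsing
like this:

        /*
         * Only treat a non-empty "ranges" property as a PCI aperture when
         * the node really is a PCI bus; ranges on simple-bus style nodes
         * are already folded in when the "reg" addresses get translated.
         */
        if ( dt_device_type_is_equal(np, "pci") )
        {
            const __be32 *ranges;
            u32 len;

            ranges = dt_get_property(np, "ranges", &len);
            if ( ranges && len )
            {
                unsigned int range_size, nr_ranges;
                int na, ns, pna;

                pna = dt_n_addr_cells(np);
                na = dt_child_n_addr_cells(np);
                ns = dt_child_n_size_cells(np);
                range_size = pna + na + ns;
                nr_ranges = len / sizeof(__be32) / range_size;

                for ( i = 0; i < nr_ranges; i++, ranges += range_size )
                {
                    /* Skip the child (PCI) address, take the parent (CPU) one */
                    addr = dt_read_number(ranges + na, pna);
                    size = dt_read_number(ranges + na + pna, ns);

                    res = rangeset_remove_range(mem_holes, addr & PAGE_MASK,
                                                PAGE_ALIGN(addr + size) - 1);
                    if ( res )
                        goto out;
                }
            }
        }
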
Oleksandr Tyshchenko Sept. 23, 2021, 10:10 a.m. UTC | #28
On 22.09.21 23:50, Stefano Stabellini wrote:

Hi Stefano.

> On Wed, 22 Sep 2021, Oleksandr wrote:
>> On 22.09.21 01:00, Stefano Stabellini wrote:
>>> On Tue, 21 Sep 2021, Oleksandr wrote:
>>>> On 21.09.21 02:21, Stefano Stabellini wrote:
>>>>> On Sun, 19 Sep 2021, Oleksandr wrote:
>>>>>>> On 18/09/2021 03:37, Stefano Stabellini wrote:
>>>>>>>> On Fri, 17 Sep 2021, Stefano Stabellini wrote:
>>>>>>>>> On Fri, 17 Sep 2021, Oleksandr wrote:
>>>>>>>>>>>> +
>>>>>>>>>>>> +    dt_dprintk("Find unallocated memory for extended
>>>>>>>>>>>> regions\n");
>>>>>>>>>>>> +
>>>>>>>>>>>> +    unalloc_mem = rangeset_new(NULL, NULL, 0);
>>>>>>>>>>>> +    if ( !unalloc_mem )
>>>>>>>>>>>> +        return -ENOMEM;
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* Start with all available RAM */
>>>>>>>>>>>> +    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
>>>>>>>>>>>> +    {
>>>>>>>>>>>> +        start = bootinfo.mem.bank[i].start;
>>>>>>>>>>>> +        end = bootinfo.mem.bank[i].start +
>>>>>>>>>>>> bootinfo.mem.bank[i].size - 1;
>>>>>>>>>>>> +        res = rangeset_add_range(unalloc_mem, start,
>>>>>>>>>>>> end);
>>>>>>>>>>>> +        if ( res )
>>>>>>>>>>>> +        {
>>>>>>>>>>>> +            printk(XENLOG_ERR "Failed to add:
>>>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>>>> +                   start, end);
>>>>>>>>>>>> +            goto out;
>>>>>>>>>>>> +        }
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* Remove RAM assigned to Dom0 */
>>>>>>>>>>>> +    for ( i = 0; i < assign_mem->nr_banks; i++ )
>>>>>>>>>>>> +    {
>>>>>>>>>>>> +        start = assign_mem->bank[i].start;
>>>>>>>>>>>> +        end = assign_mem->bank[i].start +
>>>>>>>>>>>> assign_mem->bank[i].size - 1;
>>>>>>>>>>>> +        res = rangeset_remove_range(unalloc_mem, start,
>>>>>>>>>>>> end);
>>>>>>>>>>>> +        if ( res )
>>>>>>>>>>>> +        {
>>>>>>>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>>>> +                   start, end);
>>>>>>>>>>>> +            goto out;
>>>>>>>>>>>> +        }
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* Remove reserved-memory regions */
>>>>>>>>>>>> +    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++
>>>>>>>>>>>> )
>>>>>>>>>>>> +    {
>>>>>>>>>>>> +        start = bootinfo.reserved_mem.bank[i].start;
>>>>>>>>>>>> +        end = bootinfo.reserved_mem.bank[i].start +
>>>>>>>>>>>> +            bootinfo.reserved_mem.bank[i].size - 1;
>>>>>>>>>>>> +        res = rangeset_remove_range(unalloc_mem, start,
>>>>>>>>>>>> end);
>>>>>>>>>>>> +        if ( res )
>>>>>>>>>>>> +        {
>>>>>>>>>>>> +            printk(XENLOG_ERR "Failed to remove:
>>>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>>>> +                   start, end);
>>>>>>>>>>>> +            goto out;
>>>>>>>>>>>> +        }
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* Remove grant table region */
>>>>>>>>>>>> +    start = kinfo->gnttab_start;
>>>>>>>>>>>> +    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
>>>>>>>>>>>> +    res = rangeset_remove_range(unalloc_mem, start, end);
>>>>>>>>>>>> +    if ( res )
>>>>>>>>>>>> +    {
>>>>>>>>>>>> +        printk(XENLOG_ERR "Failed to remove:
>>>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>>>> +               start, end);
>>>>>>>>>>>> +        goto out;
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    start = EXT_REGION_START;
>>>>>>>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1,
>>>>>>>>>>>> EXT_REGION_END);
>>>>>>>>>>>> +    res = rangeset_report_ranges(unalloc_mem, start, end,
>>>>>>>>>>>> +                                 add_ext_regions,
>>>>>>>>>>>> ext_regions);
>>>>>>>>>>>> +    if ( res )
>>>>>>>>>>>> +        ext_regions->nr_banks = 0;
>>>>>>>>>>>> +    else if ( !ext_regions->nr_banks )
>>>>>>>>>>>> +        res = -ENOENT;
>>>>>>>>>>>> +
>>>>>>>>>>>> +out:
>>>>>>>>>>>> +    rangeset_destroy(unalloc_mem);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    return res;
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static int __init find_memory_holes(const struct
>>>>>>>>>>>> kernel_info
>>>>>>>>>>>> *kinfo,
>>>>>>>>>>>> +                                    struct meminfo
>>>>>>>>>>>> *ext_regions)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    struct dt_device_node *np;
>>>>>>>>>>>> +    struct rangeset *mem_holes;
>>>>>>>>>>>> +    paddr_t start, end;
>>>>>>>>>>>> +    unsigned int i;
>>>>>>>>>>>> +    int res;
>>>>>>>>>>>> +
>>>>>>>>>>>> +    dt_dprintk("Find memory holes for extended
>>>>>>>>>>>> regions\n");
>>>>>>>>>>>> +
>>>>>>>>>>>> +    mem_holes = rangeset_new(NULL, NULL, 0);
>>>>>>>>>>>> +    if ( !mem_holes )
>>>>>>>>>>>> +        return -ENOMEM;
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* Start with maximum possible addressable physical
>>>>>>>>>>>> memory
>>>>>>>>>>>> range */
>>>>>>>>>>>> +    start = EXT_REGION_START;
>>>>>>>>>>>> +    end = min((1ULL << p2m_ipa_bits) - 1,
>>>>>>>>>>>> EXT_REGION_END);
>>>>>>>>>>>> +    res = rangeset_add_range(mem_holes, start, end);
>>>>>>>>>>>> +    if ( res )
>>>>>>>>>>>> +    {
>>>>>>>>>>>> +        printk(XENLOG_ERR "Failed to add:
>>>>>>>>>>>> %#"PRIx64"->%#"PRIx64"\n",
>>>>>>>>>>>> +               start, end);
>>>>>>>>>>>> +        goto out;
>>>>>>>>>>>> +    }
>>>>>>>>>>>> +
>>>>>>>>>>>> +    /* Remove all regions described by "reg" property
>>>>>>>>>>>> (MMIO,
>>>>>>>>>>>> RAM,
>>>>>>>>>>>> etc) */
>>>>>>>>>>> Well... The loop below is not going to handle all the
>>>>>>>>>>> regions
>>>>>>>>>>> described in
>>>>>>>>>>> the property "reg". Instead, it will cover a subset of "reg"
>>>>>>>>>>> where
>>>>>>>>>>> the
>>>>>>>>>>> memory is addressable.
>>>>>>>>>> As I understand, we are only interested in subset of "reg"
>>>>>>>>>> where
>>>>>>>>>> the
>>>>>>>>>> memory is
>>>>>>>>>> addressable.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> You will also need to cover "ranges" that will describe the
>>>>>>>>>>> BARs
>>>>>>>>>>> for
>>>>>>>>>>> the PCI
>>>>>>>>>>> devices.
>>>>>>>>>> Good point.
>>>>>>>>> Yes, very good point!
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> Could you please clarify how to recognize whether it is a PCI
>>>>>>>>>> device as long as PCI support is not merged? Or just to find
>>>>>>>>>> any
>>>>>>>>>> device nodes
>>>>>>>>>> with non-empty "ranges" property
>>>>>>>>>> and retrieve addresses?
>>>>>>>>> Normally any bus can have a ranges property with the aperture
>>>>>>>>> and
>>>>>>>>> possible address translations, including /amba (compatible =
>>>>>>>>> "simple-bus"). However, in these cases dt_device_get_address
>>>>>>>>> already
>>>>>>>>> takes care of it, see
>>>>>>>>> xen/common/device_tree.c:dt_device_get_address.
>>>>>>>>>
>>>>>>>>> The PCI bus is special for 2 reasons:
>>>>>>>>> - the ranges property has a different format
>>>>>>>>> - the bus is hot-pluggable
>>>>>>>>>
>>>>>>>>> So I think the only one that we need to treat specially is PCI.
>>>>>>>>>
>>>>>>>>> As far as I am aware PCI is the only bus (or maybe just the only
>>>>>>>>> bus
>>>>>>>>> that we support?) where ranges means the aperture.
>>>>>>>> Now that I think about this, there is another "hotpluggable"
>>>>>>>> scenario
>>>>>>>> we
>>>>>>>> need to think about:
>>>>>>>>
>>>>>>>> [1] https://marc.info/?l=xen-devel&m=163056546214978
>>>>>>>>
>>>>>>>> Xilinx devices have FPGA regions with apertures currently not
>>>>>>>> described
>>>>>>>> in device tree, where things can programmed in PL at runtime
>>>>>>>> making
>>>>>>>> new
>>>>>>>> devices appear with new MMIO regions out of thin air.
>>>>>>>>
>>>>>>>> Now let me start by saying that yes, the entire programmable
>>>>>>>> region
>>>>>>>> aperture could probably be described in device tree, however, in
>>>>>>>> reality it is not currently done in any of the device trees we use
>>>>>>>> (including the upstream device trees in linux.git).
>>>>>>> This is rather annoying, but not unheard. There are a couple of
>>>>>>> platforms
>>>>>>> where the MMIOs are not fully described in the DT.
>>>>>>>
>>>>>>> In fact, we have a callback 'specific_mappings' which create
>>>>>>> additional
>>>>>>> mappings (e.g. on the omap5) for dom0.
>>>>>>>
>>>>>>>> So, we have a problem :-(
>>>>>>>>
>>>>>>>>
>>>>>>>> I can work toward getting the right info on device tree, but in
>>>>>>>> reality
>>>>>>>> that is going to take time and for now the device tree doesn't
>>>>>>>> have
>>>>>>>> the
>>>>>>>> FPGA aperture in it. So if we accept this series as is, it is
>>>>>>>> going to
>>>>>>>> stop features like [1] from working. >
>>>>>>>> If we cannot come up with any better plans, I think it would be
>>>>>>>> better
>>>>>>>> to drop find_memory_holes, only rely on find_unallocated_memory
>>>>>>>> even
>>>>>>>> when the IOMMU is on. One idea is that we could add on top of the
>>>>>>>> regions found by find_unallocated_memory any MMIO regions marked
>>>>>>>> as
>>>>>>>> xen,passthrough: they are safe because they are not going to dom0
>>>>>>>> anyway.
>>>>>>> (Oleksandr, it looks like some rationale about the different
>>>>>>> approach is
>>>>>>> missing in the commit message. Can you add it?)
>>>>>> Yes sure, but let me please clarify what is different approach in this
>>>>>> context. Is it to *also* take into the account MMIO regions of the
>>>>>> devices
>>>>>> for
>>>>>> passthrough for case when IOMMU is off (in addition to unallocated
>>>>>> memory)? If
>>>>>> yes, I wonder whether we will gain much with that according to that
>>>>>> device's
>>>>>> MMIO regions are usually not big enough and we stick to allocate
>>>>>> extended
>>>>>> regions with bigger size (> 64MB).
>>>>> That's fair enough. There are a couple of counter examples where the
>>>>> MMIO regions for the device to assign are quite large, for instance a
>>>>> GPU, Xilinx AIEngine, or the PCIe Root Complex with the entire aperture,
>>>>> but maybe they are not that common. I am not sure if it is worth
>>>>> scanning the tree for xen,passthrough regions every time at boot for
>>>>> this.
>>>> ok, I will add a few sentences to commit message about this different
>>>> approach
>>>> for now. At least this could be implemented later on if there is a need.
>>> One thing that worries me about this is that if we take an old Xen with
>>> this series and run it on a new board, it might cause problems. At the
>>> very least [1] wouldn't work.
>> I got it.
>>
>>
>>> Can we have a Xen command line argument to disable extended regions as
>>> an emergecy toggle?
>> I think, yes. If no preference for the argument name I will name it
>> "no-ext-region".
> It is better to introduce it as ext-regions=yes/no with yes as default.
> So that in the future we could extending it to ext-regions=start,size if
> we wanted to.

ok, will do
Oleksandr Tyshchenko Sept. 23, 2021, 10:11 a.m. UTC | #29
On 23.09.21 00:05, Stefano Stabellini wrote:

Hi Stefano

> On Wed, 22 Sep 2021, Oleksandr wrote:
>>>>>> You will also need to cover "ranges" that will describe the BARs for
>>>>>> the PCI
>>>>>> devices.
>>>>> Good point.
>>>> Yes, very good point!
>>>>
>>>>
>>>>> Could you please clarify how to recognize whether it is a PCI
>>>>> device as long as PCI support is not merged? Or just to find any device
>>>>> nodes
>>>>> with non-empty "ranges" property
>>>>> and retrieve addresses?
>>>> Normally any bus can have a ranges property with the aperture and
>>>> possible address translations, including /amba (compatible =
>>>> "simple-bus"). However, in these cases dt_device_get_address already
>>>> takes care of it, see xen/common/device_tree.c:dt_device_get_address.
>>>>
>>>> The PCI bus is special for 2 reasons:
>>>> - the ranges property has a different format
>>>> - the bus is hot-pluggable
>>>>
>>>> So I think the only one that we need to treat specially is PCI.
>>>>
>>>> As far as I am aware PCI is the only bus (or maybe just the only bus
>>>> that we support?) where ranges means the aperture.
>>> Thank you for the clarification. I need to find device node with non-empty
>>> ranges property
>>> (and make sure that device_type property is "pci"), after that I need to
>>> read the context of ranges property and translate it.
>>>
>>>
>> OK, I experimented with that and managed to parse ranges property for PCI host
>> bridge node.
>>
>> I tested on my setup where the host device tree contains two PCI host bridge
>> nodes with the following:
>>
>> pcie@fe000000 {
>> ...
>>              ranges = <0x1000000 0x0 0x0 0x0 0xfe100000 0x0 0x100000 0x2000000
>> 0x0 0xfe200000 0x0 0xfe200000 0x0 0x200000 0x2000000 0x0 0x30000000 0x0
>> 0x30000000 0x0 0x8000000 0x42000000 0x0 0x38000000 0x0 0x38000000 0x0
>> 0x8000000>;
>> ...
>> };
>>
>> pcie@ee800000 {
>> ...
>>              ranges = <0x1000000 0x0 0x0 0x0 0xee900000 0x0 0x100000 0x2000000
>> 0x0 0xeea00000 0x0 0xeea00000 0x0 0x200000 0x2000000 0x0 0xc0000000 0x0
>> 0xc0000000 0x0 0x8000000 0x42000000 0x0 0xc8000000 0x0 0xc8000000 0x0
>> 0x8000000>;
>> ...
>> };
>>
>> So Xen retrieves the *CPU addresses* from the ranges:
>>
>> (XEN) dev /soc/pcie@fe000000 range_size 7 nr_ranges 4
>> (XEN) 0: addr=fe100000, size=100000
>> (XEN) 1: addr=fe200000, size=200000
>> (XEN) 2: addr=30000000, size=8000000
>> (XEN) 3: addr=38000000, size=8000000
>> (XEN) dev /soc/pcie@ee800000 range_size 7 nr_ranges 4
>> (XEN) 0: addr=ee900000, size=100000
>> (XEN) 1: addr=eea00000, size=200000
>> (XEN) 2: addr=c0000000, size=8000000
>> (XEN) 3: addr=c8000000, size=8000000
>>
>> The code below covers ranges property in the context of finding memory holes
>> (to be squashed with current patch):
>>
>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>> index d37156a..7d20c10 100644
>> --- a/xen/arch/arm/domain_build.c
>> +++ b/xen/arch/arm/domain_build.c
>> @@ -834,6 +834,8 @@ static int __init find_memory_holes(struct meminfo
>> *ext_regions)
>>       {
>>           unsigned int naddr;
>>           u64 addr, size;
>> +        const __be32 *ranges;
>> +        u32 len;
>>
>>           naddr = dt_number_of_address(np);
>>
>> @@ -857,6 +859,41 @@ static int __init find_memory_holes(struct meminfo
>> *ext_regions)
>>                   goto out;
>>               }
>>           }
>> +
>> +        /*
>> +         * Also looking for non-empty ranges property which would likely mean
>> +         * that we deal with PCI host bridge device and the property here
>> +         * describes the BARs for the PCI devices.
>> +         */
> One thing to be careful is that ranges with a valid parameter is not
> only present in PCI busses. It can be present in amba and other
> simple-busses too. In that case the format for ranges in simpler as it
> doesn't have a "memory type" like PCI.
>
> When you get addresses from reg, bus ranges properties are automatically
> handled for you.
>
> All of this to say that a check on "ranges" is not enough because it
> might capture other non-PCI busses that have a different, simpler,
> ranges format. You want to check for "ranges" under a device_type =
> "pci"; node.

ok, will do.


>
>
>> +        ranges = dt_get_property(np, "ranges", &len);
>> +        if ( ranges && len )
>> +        {
>> +            unsigned int range_size, nr_ranges;
>> +            int na, ns, pna;
>> +
>> +            pna = dt_n_addr_cells(np);
>> +            na = dt_child_n_addr_cells(np);
>> +            ns = dt_child_n_size_cells(np);
>> +            range_size = pna + na + ns;
>> +            nr_ranges = len / sizeof(__be32) / range_size;
>> +
>> +            for ( i = 0; i < nr_ranges; i++, ranges += range_size )
>> +            {
>> +                /* Skip the child address and get the parent (CPU) address */
>> +                addr = dt_read_number(ranges + na, pna);
>> +                size = dt_read_number(ranges + na + pna, ns);
>> +
>> +                start = addr & PAGE_MASK;
>> +                end = PAGE_ALIGN(addr + size);
>> +                res = rangeset_remove_range(mem_holes, start, end - 1);
>> +                if ( res )
>> +                {
>> +                    printk(XENLOG_ERR "Failed to remove:
>> %#"PRIx64"->%#"PRIx64"\n",
>> +                           start, end);
>> +                    goto out;
>> +                }
>> +            }
>> +        }
>>       }
Oleksandr Tyshchenko Sept. 23, 2021, 10:41 a.m. UTC | #30
Hi Stefano, Julien


On 18.09.21 19:59, Oleksandr wrote:
>
> Hi Julien.
>
>
> [snip]
>
>
>>>
>>>
>>>> +#define EXT_REGION_END 0x80003fffffffULL
>>>> +
>>>> +static int __init find_unallocated_memory(const struct kernel_info *kinfo,
>>>> +                                          struct meminfo *ext_regions)
>>>> +{
>>>> +    const struct meminfo *assign_mem = &kinfo->mem;
>>>> +    struct rangeset *unalloc_mem;
>>>> +    paddr_t start, end;
>>>> +    unsigned int i;
>>>> +    int res;
>>>
>>> We technically already know which range of memory is unused. This is 
>>> pretty much any region in the freelist of the page allocator. So how 
>>> about walking the freelist instead?
>>
>> ok, I will investigate the page allocator code (right now I have no 
>> understanding of how to do that). BTW, I have just grepped "freelist" 
>> through the code and all page context related appearances are in x86 
>> code only.
>>
>>>
>>> The advantage is we don't need to worry about modifying the function 
>>> when adding new memory type.
>>>
>>> One disadvantage is this will not cover *all* the unused memory as
>>> this is doing. But I think this is an acceptable downside.
>
> I did some investigations and created a test patch. I am not 100% sure
> this is exactly what you meant, but I will provide the results anyway.
>
> 1. Below are the extended regions (unallocated memory, regions >= 64MB)
> calculated by my initial method (bootinfo.mem - kinfo->mem -
> bootinfo.reserved_mem - kinfo->gnttab):
>
> (XEN) Extended region 0: 0x48000000->0x54000000
> (XEN) Extended region 1: 0x57000000->0x60000000
> (XEN) Extended region 2: 0x70000000->0x78000000
> (XEN) Extended region 3: 0x78200000->0xc0000000
> (XEN) Extended region 4: 0x500000000->0x580000000
> (XEN) Extended region 5: 0x600000000->0x680000000
> (XEN) Extended region 6: 0x700000000->0x780000000
>
> 2. Below are the extended regions (unallocated memory, regions >= 64MB)
> calculated by the new method (free memory in the page allocator):
>
> (XEN) Extended region 0: 0x48000000->0x54000000
> (XEN) Extended region 1: 0x58000000->0x60000000
> (XEN) Extended region 2: 0x70000000->0x78000000
> (XEN) Extended region 3: 0x78200000->0x84000000
> (XEN) Extended region 4: 0x86000000->0x8a000000
> (XEN) Extended region 5: 0x8c200000->0xc0000000
> (XEN) Extended region 6: 0x500000000->0x580000000
> (XEN) Extended region 7: 0x600000000->0x680000000
> (XEN) Extended region 8: 0x700000000->0x765e00000
>
> Some thoughts regarding that.
>
> 1. A few ranges below 4GB are absent in the resulting extended regions. I
> assume this is because of the modules:
>
> (XEN) Checking for initrd in /chosen
> (XEN) Initrd 0000000084000040-0000000085effc48
> (XEN) RAM: 0000000048000000 - 00000000bfffffff
> (XEN) RAM: 0000000500000000 - 000000057fffffff
> (XEN) RAM: 0000000600000000 - 000000067fffffff
> (XEN) RAM: 0000000700000000 - 000000077fffffff
> (XEN)
> (XEN) MODULE[0]: 0000000078080000 - 00000000781d74c8 Xen
> (XEN) MODULE[1]: 0000000057fe7000 - 0000000057ffd080 Device Tree
> (XEN) MODULE[2]: 0000000084000040 - 0000000085effc48 Ramdisk
> (XEN) MODULE[3]: 000000008a000000 - 000000008c000000 Kernel
> (XEN) MODULE[4]: 000000008c000000 - 000000008c010000 XSM
> (XEN)  RESVD[0]: 0000000084000040 - 0000000085effc48
> (XEN)  RESVD[1]: 0000000054000000 - 0000000056ffffff
>
> 2. Also, it is worth mentioning that a relatively large chunk (~417MB) of
> memory above 4GB is absent (to be precise, at the end of the last RAM
> bank), which I assume is used for Xen internals.
> We could really use it for extended regions.
> Below are the free regions in the heap (for the last RAM bank), just in case:
>
> (XEN) heap[node=0][zone=23][order=5] 0x00000765ec0000-0x00000765ee0000
> (XEN) heap[node=0][zone=23][order=6] 0x00000765e80000-0x00000765ec0000
> (XEN) heap[node=0][zone=23][order=7] 0x00000765e00000-0x00000765e80000
> (XEN) heap[node=0][zone=23][order=9] 0x00000765c00000-0x00000765e00000
> (XEN) heap[node=0][zone=23][order=10] 0x00000765800000-0x00000765c00000
> (XEN) heap[node=0][zone=23][order=11] 0x00000765000000-0x00000765800000
> (XEN) heap[node=0][zone=23][order=12] 0x00000764000000-0x00000765000000
> (XEN) heap[node=0][zone=23][order=14] 0x00000760000000-0x00000764000000
> (XEN) heap[node=0][zone=23][order=17] 0x00000740000000-0x00000760000000
> (XEN) heap[node=0][zone=23][order=18] 0x00000540000000-0x00000580000000
> (XEN) heap[node=0][zone=23][order=18] 0x00000500000000-0x00000540000000
> (XEN) heap[node=0][zone=23][order=18] 0x00000640000000-0x00000680000000
> (XEN) heap[node=0][zone=23][order=18] 0x00000600000000-0x00000640000000
> (XEN) heap[node=0][zone=23][order=18] 0x00000700000000-0x00000740000000
>
> Yes, you already pointed out this disadvantage, so if it is an 
> acceptable downside, I am absolutely OK.
>
>
> 3. Common code updates. There is a question of how to properly connect
> the common allocator internals with the Arm code for creating the DT.
> I didn't come up with anything better than adding for_each_avail_page(),
> which invokes a callback with each free page and its order.
>
> **********
>
> Below are the proposed changes on top of the initial patch; would this be
> acceptable in general?
>
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index 523eb19..1e58fc5 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -753,16 +753,33 @@ static int __init add_ext_regions(unsigned long s, unsigned long e, void *data)
>      return 0;
>  }
>
> +static int __init add_unalloc_mem(struct page_info *page, unsigned int order,
> +                                  void *data)
> +{
> +    struct rangeset *unalloc_mem = data;
> +    paddr_t start, end;
> +    int res;
> +
> +    start = page_to_maddr(page);
> +    end = start + pfn_to_paddr(1UL << order);
> +    res = rangeset_add_range(unalloc_mem, start, end - 1);
> +    if ( res )
> +    {
> +        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> +               start, end);
> +        return res;
> +    }
> +
> +    return 0;
> +}
> +
>  #define EXT_REGION_START   0x40000000ULL
>  #define EXT_REGION_END     0x80003fffffffULL
>
> -static int __init find_unallocated_memory(const struct kernel_info *kinfo,
> -                                          struct meminfo *ext_regions)
> +static int __init find_unallocated_memory(struct meminfo *ext_regions)
>  {
> -    const struct meminfo *assign_mem = &kinfo->mem;
>      struct rangeset *unalloc_mem;
>      paddr_t start, end;
> -    unsigned int i;
>      int res;
>
>      dt_dprintk("Find unallocated memory for extended regions\n");
> @@ -771,59 +788,9 @@ static int __init find_unallocated_memory(const struct kernel_info *kinfo,
>      if ( !unalloc_mem )
>          return -ENOMEM;
>
> -    /* Start with all available RAM */
> -    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
> -    {
> -        start = bootinfo.mem.bank[i].start;
> -        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size;
> -        res = rangeset_add_range(unalloc_mem, start, end - 1);
> -        if ( res )
> -        {
> -            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
> -                   start, end);
> -            goto out;
> -        }
> -    }
> -
> -    /* Remove RAM assigned to Dom0 */
> -    for ( i = 0; i < assign_mem->nr_banks; i++ )
> -    {
> -        start = assign_mem->bank[i].start;
> -        end = assign_mem->bank[i].start + assign_mem->bank[i].size;
> -        res = rangeset_remove_range(unalloc_mem, start, end - 1);
> -        if ( res )
> -        {
> -            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> -                   start, end);
> -            goto out;
> -        }
> -    }
> -
> -    /* Remove reserved-memory regions */
> -    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
> -    {
> -        start = bootinfo.reserved_mem.bank[i].start;
> -        end = bootinfo.reserved_mem.bank[i].start +
> -            bootinfo.reserved_mem.bank[i].size;
> -        res = rangeset_remove_range(unalloc_mem, start, end - 1);
> -        if ( res )
> -        {
> -            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> -                   start, end);
> -            goto out;
> -        }
> -    }
> -
> -    /* Remove grant table region */
> -    start = kinfo->gnttab_start;
> -    end = kinfo->gnttab_start + kinfo->gnttab_size;
> -    res = rangeset_remove_range(unalloc_mem, start, end - 1);
> +    res = for_each_avail_page(add_unalloc_mem, unalloc_mem);
>      if ( res )
> -    {
> -        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
> -               start, end);
>          goto out;
> -    }
>
>      start = EXT_REGION_START;
>      end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
> @@ -840,8 +807,7 @@ out:
>      return res;
>  }
>
> -static int __init find_memory_holes(const struct kernel_info *kinfo,
> -                                    struct meminfo *ext_regions)
> +static int __init find_memory_holes(struct meminfo *ext_regions)
>  {
>      struct dt_device_node *np;
>      struct rangeset *mem_holes;
> @@ -961,9 +927,9 @@ static int __init make_hypervisor_node(struct domain *d,
>      else
>      {
>          if ( !is_iommu_enabled(d) )
> -            res = find_unallocated_memory(kinfo, ext_regions);
> +            res = find_unallocated_memory(ext_regions);
>          else
> -            res = find_memory_holes(kinfo, ext_regions);
> +            res = find_memory_holes(ext_regions);
>
>          if ( res )
>              printk(XENLOG_WARNING "Failed to allocate extended regions\n");
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 8fad139..7cd1020 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -1572,6 +1572,40 @@ static int reserve_heap_page(struct page_info *pg)
>
>  }
>
> +/* TODO heap_lock? */
> +int for_each_avail_page(int (*cb)(struct page_info *, unsigned int, void *),
> +                        void *data)
> +{
> +    unsigned int node, zone, order;
> +    int ret;
> +
> +    for ( node = 0; node < MAX_NUMNODES; node++ )
> +    {
> +        if ( !avail[node] )
> +            continue;
> +
> +        for ( zone = 0; zone < NR_ZONES; zone++ )
> +        {
> +            for ( order = 0; order <= MAX_ORDER; order++ )
> +            {
> +                struct page_info *head, *tmp;
> +
> +                if ( page_list_empty(&heap(node, zone, order)) )
> +                    continue;
> +
> +                page_list_for_each_safe ( head, tmp, &heap(node, zone, order) )
> +                {
> +                    ret = cb(head, order, data);
> +                    if ( ret )
> +                        return ret;
> +                }
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
>  int offline_page(mfn_t mfn, int broken, uint32_t *status)
>  {
>      unsigned long old_info = 0;
> diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
> index 667f9da..64dd3e2 100644
> --- a/xen/include/xen/mm.h
> +++ b/xen/include/xen/mm.h
> @@ -123,6 +123,9 @@ unsigned int online_page(mfn_t mfn, uint32_t *status);
>  int offline_page(mfn_t mfn, int broken, uint32_t *status);
>  int query_page_offline(mfn_t mfn, uint32_t *status);
>
> +int for_each_avail_page(int (*cb)(struct page_info *, unsigned int, void *),
> +                        void *data);
> +
>  void heap_init_late(void);
>
>  int assign_pages(


I am sorry, but may I please clarify one thing? Will we go in this new
direction (free memory in the page allocator) or leave things as they are
(bootinfo.mem - kinfo->mem - bootinfo.reserved_mem - kinfo->gnttab)? This is
the only point that is still unclear to me in the current patch before
preparing V3.
Stefano Stabellini Sept. 23, 2021, 4:38 p.m. UTC | #31
On Thu, 23 Sep 2021, Oleksandr wrote:
> On 18.09.21 19:59, Oleksandr wrote:
>
> [snip]
>
> I am sorry, but may I please clarify one thing? Will we go in this new
> direction (free memory in the page allocator) or leave things as they are
> (bootinfo.mem - kinfo->mem - bootinfo.reserved_mem - kinfo->gnttab)? This is
> the only point that is still unclear to me in the current patch before
> preparing V3.

I think both approaches are fine. Your original approach leads to better
results in terms of extended regions but the difference is not drastic.
The original approach requires more code (bad) but probably fewer CPU
cycles (good).

Personally I am fine either way but as Julien was the one to provide
feedback on this it would be best to get his opinion.

But in the meantime I think it is OK to send a v3 so that we can review
the rest.
Oleksandr Tyshchenko Sept. 23, 2021, 5:44 p.m. UTC | #32
On 23.09.21 19:38, Stefano Stabellini wrote:

Hi Stefano

> On Thu, 23 Sep 2021, Oleksandr wrote:
>> On 18.09.21 19:59, Oleksandr wrote:
>>
>> [snip]
>>
>> I am sorry, but may I please clarify one thing? Will we go in this new
>> direction (free memory in the page allocator) or leave things as they are
>> (bootinfo.mem - kinfo->mem - bootinfo.reserved_mem - kinfo->gnttab)? This is
>> the only point that is still unclear to me in the current patch before
>> preparing V3.
> I think both approaches are fine. Your original approach leads to better
> results in terms of extended regions but the difference is not drastic.
> The original approach requires more code (bad) but probably fewer CPU
> cycles (good).
>
> Personally I am fine either way but as Julien was the one to provide
> feedback on this it would be best to get his opinion.
>
> But in the meantime I think it is OK to send a v3 so that we can review
> the rest.


OK, thank you for the clarification.

I am also fine either way; I just wanted to know which one to pick.
Anyway, I think I will be able to make the updates later on.
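
For reference, the hypervisor node generated by the patch below would look
roughly as follows in device-tree source form. This is a sketch only: it
assumes #address-cells = <2> and #size-cells = <2>, the extended-region
addresses are copied from the "Extended region N" log lines quoted earlier in
the thread, and both the grant-table range (region 0) and the version digits
in the "compatible" string are made-up placeholders.

    hypervisor {
            /* Version digits are placeholders; Xen builds this string from
               XEN_VERSION and XEN_SUBVERSION. */
            compatible = "xen,xen-4.16", "xen,xen";
            /* reg 0 is the grant table space (placeholder range); reg 1..3
               are extended regions taken from the log above:
               0x48000000-0x54000000, 0x70000000-0x78000000 and
               0x500000000-0x580000000. */
            reg = <0x0 0x38000000 0x0 0x01000000
                   0x0 0x48000000 0x0 0x0c000000
                   0x0 0x70000000 0x0 0x08000000
                   0x5 0x00000000 0x0 0x80000000>;
            /* The interrupts property is unchanged by this patch and is
               omitted here. */
    };

Each reg entry is addrcells + sizecells cells wide, which is why
make_hypervisor_node() sizes the buffer as (NR_MEM_BANKS + 1) * 4 cells and
multiplies the per-entry size by (i + 1) in the final fdt_property() call.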
diff mbox series

Patch

diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index 206038d..070ec27 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -724,6 +724,196 @@  static int __init make_memory_node(const struct domain *d,
     return res;
 }
 
+static int __init add_ext_regions(unsigned long s, unsigned long e, void *data)
+{
+    struct meminfo *ext_regions = data;
+    paddr_t start, size;
+
+    if ( ext_regions->nr_banks >= ARRAY_SIZE(ext_regions->bank) )
+        return 0;
+
+    /* Both start and size of the extended region should be 2MB aligned */
+    start = (s + SZ_2M - 1) & ~(SZ_2M - 1);
+    if ( start > e )
+        return 0;
+
+    size = (e - start + 1) & ~(SZ_2M - 1);
+    if ( !size )
+        return 0;
+
+    ext_regions->bank[ext_regions->nr_banks].start = start;
+    ext_regions->bank[ext_regions->nr_banks].size = size;
+    ext_regions->nr_banks ++;
+
+    return 0;
+}
+
+/*
+ * The extended regions will be prevalidated by the memory hotplug path
+ * in Linux, which requires any added address range to be within the maximum
+ * possible addressable physical memory range for which the linear mapping
+ * could be created.
+ * For a 48-bit VA space size the maximum addressable range is:
+ * 0x40000000 - 0x80003fffffff
+ */
+#define EXT_REGION_START   0x40000000ULL
+#define EXT_REGION_END     0x80003fffffffULL
+
+static int __init find_unallocated_memory(const struct kernel_info *kinfo,
+                                          struct meminfo *ext_regions)
+{
+    const struct meminfo *assign_mem = &kinfo->mem;
+    struct rangeset *unalloc_mem;
+    paddr_t start, end;
+    unsigned int i;
+    int res;
+
+    dt_dprintk("Find unallocated memory for extended regions\n");
+
+    unalloc_mem = rangeset_new(NULL, NULL, 0);
+    if ( !unalloc_mem )
+        return -ENOMEM;
+
+    /* Start with all available RAM */
+    for ( i = 0; i < bootinfo.mem.nr_banks; i++ )
+    {
+        start = bootinfo.mem.bank[i].start;
+        end = bootinfo.mem.bank[i].start + bootinfo.mem.bank[i].size - 1;
+        res = rangeset_add_range(unalloc_mem, start, end);
+        if ( res )
+        {
+            printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
+                   start, end);
+            goto out;
+        }
+    }
+
+    /* Remove RAM assigned to Dom0 */
+    for ( i = 0; i < assign_mem->nr_banks; i++ )
+    {
+        start = assign_mem->bank[i].start;
+        end = assign_mem->bank[i].start + assign_mem->bank[i].size - 1;
+        res = rangeset_remove_range(unalloc_mem, start, end);
+        if ( res )
+        {
+            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
+                   start, end);
+            goto out;
+        }
+    }
+
+    /* Remove reserved-memory regions */
+    for ( i = 0; i < bootinfo.reserved_mem.nr_banks; i++ )
+    {
+        start = bootinfo.reserved_mem.bank[i].start;
+        end = bootinfo.reserved_mem.bank[i].start +
+            bootinfo.reserved_mem.bank[i].size - 1;
+        res = rangeset_remove_range(unalloc_mem, start, end);
+        if ( res )
+        {
+            printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
+                   start, end);
+            goto out;
+        }
+    }
+
+    /* Remove grant table region */
+    start = kinfo->gnttab_start;
+    end = kinfo->gnttab_start + kinfo->gnttab_size - 1;
+    res = rangeset_remove_range(unalloc_mem, start, end);
+    if ( res )
+    {
+        printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
+               start, end);
+        goto out;
+    }
+
+    start = EXT_REGION_START;
+    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
+    res = rangeset_report_ranges(unalloc_mem, start, end,
+                                 add_ext_regions, ext_regions);
+    if ( res )
+        ext_regions->nr_banks = 0;
+    else if ( !ext_regions->nr_banks )
+        res = -ENOENT;
+
+out:
+    rangeset_destroy(unalloc_mem);
+
+    return res;
+}
+
+static int __init find_memory_holes(const struct kernel_info *kinfo,
+                                    struct meminfo *ext_regions)
+{
+    struct dt_device_node *np;
+    struct rangeset *mem_holes;
+    paddr_t start, end;
+    unsigned int i;
+    int res;
+
+    dt_dprintk("Find memory holes for extended regions\n");
+
+    mem_holes = rangeset_new(NULL, NULL, 0);
+    if ( !mem_holes )
+        return -ENOMEM;
+
+    /* Start with maximum possible addressable physical memory range */
+    start = EXT_REGION_START;
+    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
+    res = rangeset_add_range(mem_holes, start, end);
+    if ( res )
+    {
+        printk(XENLOG_ERR "Failed to add: %#"PRIx64"->%#"PRIx64"\n",
+               start, end);
+        goto out;
+    }
+
+    /* Remove all regions described by "reg" property (MMIO, RAM, etc) */
+    dt_for_each_device_node( dt_host, np )
+    {
+        unsigned int naddr;
+        u64 addr, size;
+
+        naddr = dt_number_of_address(np);
+
+        for ( i = 0; i < naddr; i++ )
+        {
+            res = dt_device_get_address(np, i, &addr, &size);
+            if ( res )
+            {
+                printk(XENLOG_ERR "Unable to retrieve address %u for %s\n",
+                       i, dt_node_full_name(np));
+                goto out;
+            }
+
+            start = addr & PAGE_MASK;
+            end = PAGE_ALIGN(addr + size) - 1;
+            res = rangeset_remove_range(mem_holes, start, end);
+            if ( res )
+            {
+                printk(XENLOG_ERR "Failed to remove: %#"PRIx64"->%#"PRIx64"\n",
+                       start, end);
+                goto out;
+            }
+        }
+    }
+
+    start = EXT_REGION_START;
+    end = min((1ULL << p2m_ipa_bits) - 1, EXT_REGION_END);
+    res = rangeset_report_ranges(mem_holes, start, end,
+                                 add_ext_regions, ext_regions);
+    if ( res )
+        ext_regions->nr_banks = 0;
+    else if ( !ext_regions->nr_banks )
+        res = -ENOENT;
+
+out:
+    rangeset_destroy(mem_holes);
+
+    return res;
+}
+
 static int __init make_hypervisor_node(struct domain *d,
                                        const struct kernel_info *kinfo,
                                        int addrcells, int sizecells)
@@ -731,11 +921,13 @@  static int __init make_hypervisor_node(struct domain *d,
     const char compat[] =
         "xen,xen-"__stringify(XEN_VERSION)"."__stringify(XEN_SUBVERSION)"\0"
         "xen,xen";
-    __be32 reg[4];
+    __be32 reg[(NR_MEM_BANKS + 1) * 4];
     gic_interrupt_t intr;
     __be32 *cells;
     int res;
     void *fdt = kinfo->fdt;
+    struct meminfo *ext_regions;
+    unsigned int i;
 
     dt_dprintk("Create hypervisor node\n");
 
@@ -757,12 +949,42 @@  static int __init make_hypervisor_node(struct domain *d,
     if ( res )
         return res;
 
+    ext_regions = xzalloc(struct meminfo);
+    if ( !ext_regions )
+        return -ENOMEM;
+
+    if ( is_32bit_domain(d) )
+        printk(XENLOG_WARNING "The extended region is only supported for 64-bit guest\n");
+    else
+    {
+        if ( !is_iommu_enabled(d) )
+            res = find_unallocated_memory(kinfo, ext_regions);
+        else
+            res = find_memory_holes(kinfo, ext_regions);
+
+        if ( res )
+            printk(XENLOG_WARNING "Failed to allocate extended regions\n");
+    }
+
     /* reg 0 is grant table space */
     cells = &reg[0];
     dt_child_set_range(&cells, addrcells, sizecells,
                        kinfo->gnttab_start, kinfo->gnttab_size);
+    /* reg 1...N are extended regions */
+    for ( i = 0; i < ext_regions->nr_banks; i++ )
+    {
+        u64 start = ext_regions->bank[i].start;
+        u64 size = ext_regions->bank[i].size;
+
+        dt_dprintk("Extended region %d: %#"PRIx64"->%#"PRIx64"\n",
+                   i, start, start + size);
+
+        dt_child_set_range(&cells, addrcells, sizecells, start, size);
+    }
+    xfree(ext_regions);
+
     res = fdt_property(fdt, "reg", reg,
-                       dt_cells_to_size(addrcells + sizecells));
+                       dt_cells_to_size(addrcells + sizecells) * (i + 1));
     if ( res )
         return res;