diff mbox series

[v2,5/7] xen/ppc: Enable bootfdt and boot allocator

Message ID aa67a0dacab0e3f39dabeb30e31732d617cadde1.1702607884.git.sanastasio@raptorengineering.com (mailing list archive)
State New, archived
Headers show
Series Early Boot Allocation on Power | expand

Commit Message

Shawn Anastasio Dec. 15, 2023, 2:44 a.m. UTC
Move PPC off the asm-generic setup.h and enable usage of bootfdt for
populating the boot info struct from the firmware-provided device tree.
Also enable the Xen boot page allocator.

Includes minor changes to bootfdt.c's boot_fdt_info() to tolerate the
scenario in which the FDT overlaps a reserved memory region, as is the
case on PPC when booted directly from skiboot. Also includes a minor
change to record Xen's correct position on PPC where Xen relocates
itself to at the entrypoint.

Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
---
 xen/arch/ppc/include/asm/Makefile |   1 -
 xen/arch/ppc/include/asm/setup.h  | 123 +++++++++++++
 xen/arch/ppc/setup.c              | 289 +++++++++++++++++++++++++++++-
 xen/common/device-tree/bootfdt.c  |  25 ++-
 4 files changed, 434 insertions(+), 4 deletions(-)
 create mode 100644 xen/arch/ppc/include/asm/setup.h

Comments

Jan Beulich Dec. 20, 2023, 11:44 a.m. UTC | #1
On 15.12.2023 03:44, Shawn Anastasio wrote:
> --- /dev/null
> +++ b/xen/arch/ppc/include/asm/setup.h
> @@ -0,0 +1,123 @@
> +#ifndef __ASM_PPC_SETUP_H__
> +#define __ASM_PPC_SETUP_H__
> +
> +#define max_init_domid (0)
> +
> +#include <public/version.h>
> +#include <asm/p2m.h>
> +#include <xen/device_tree.h>
> +
> +#define MIN_FDT_ALIGN 8
> +#define MAX_FDT_SIZE SZ_2M
> +
> +#define NR_MEM_BANKS 256
> +
> +#define MAX_MODULES 32 /* Current maximum useful modules */
> +
> +typedef enum {
> +    BOOTMOD_XEN,
> +    BOOTMOD_FDT,
> +    BOOTMOD_KERNEL,
> +    BOOTMOD_RAMDISK,
> +    BOOTMOD_XSM,
> +    BOOTMOD_GUEST_DTB,
> +    BOOTMOD_UNKNOWN
> +}  bootmodule_kind;
> +
> +enum membank_type {
> +    /*
> +     * The MEMBANK_DEFAULT type refers to either reserved memory for the
> +     * device/firmware (when the bank is in 'reserved_mem') or any RAM (when
> +     * the bank is in 'mem').
> +     */
> +    MEMBANK_DEFAULT,
> +    /*
> +     * The MEMBANK_STATIC_DOMAIN type is used to indicate whether the memory
> +     * bank is bound to a static Xen domain. It is only valid when the bank
> +     * is in reserved_mem.
> +     */
> +    MEMBANK_STATIC_DOMAIN,
> +    /*
> +     * The MEMBANK_STATIC_HEAP type is used to indicate whether the memory
> +     * bank is reserved as static heap. It is only valid when the bank is
> +     * in reserved_mem.
> +     */
> +    MEMBANK_STATIC_HEAP,
> +};
> +
> +/* Indicates the maximum number of characters(\0 included) for shm_id */
> +#define MAX_SHM_ID_LENGTH 16
> +
> +struct membank {
> +    paddr_t start;
> +    paddr_t size;
> +    enum membank_type type;
> +};
> +
> +struct meminfo {
> +    unsigned int nr_banks;
> +    struct membank bank[NR_MEM_BANKS];
> +};
> +
> +/*
> + * The domU flag is set for kernels and ramdisks of "xen,domain" nodes.
> + * The purpose of the domU flag is to avoid getting confused in
> + * kernel_probe, where we try to guess which is the dom0 kernel and
> + * initrd to be compatible with all versions of the multiboot spec.
> + */
> +#define BOOTMOD_MAX_CMDLINE 1024

Why does this live here, rather than ...

> +struct bootmodule {
> +    bootmodule_kind kind;
> +    bool domU;
> +    paddr_t start;
> +    paddr_t size;
> +};
> +
> +/* DT_MAX_NAME is the node name max length according the DT spec */
> +#define DT_MAX_NAME 41

... next to this?

> +struct bootcmdline {
> +    bootmodule_kind kind;
> +    bool domU;
> +    paddr_t start;
> +    char dt_name[DT_MAX_NAME];
> +    char cmdline[BOOTMOD_MAX_CMDLINE];
> +};
> +
> +struct bootmodules {
> +    int nr_mods;
> +    struct bootmodule module[MAX_MODULES];
> +};
> +
> +struct bootcmdlines {
> +    unsigned int nr_mods;
> +    struct bootcmdline cmdline[MAX_MODULES];
> +};
> +
> +struct bootinfo {
> +    struct meminfo mem;
> +    struct meminfo reserved_mem;
> +    struct bootmodules modules;
> +    struct bootcmdlines cmdlines;
> +    bool static_heap;

Unused field?

> +};
> +
> +extern struct bootinfo bootinfo;
> +
> +/*
> + * setup.c
> + */
> +
> +bool check_reserved_regions_overlap(paddr_t region_start, paddr_t region_size);
> +struct bootmodule *add_boot_module(bootmodule_kind kind,
> +                                   paddr_t start, paddr_t size, bool domU);
> +void add_boot_cmdline(const char *name, const char *cmdline,
> +                      bootmodule_kind kind, paddr_t start, bool domU);
> +const char *boot_module_kind_as_string(bootmodule_kind kind);
> +struct bootcmdline * __init boot_cmdline_find_by_kind(bootmodule_kind kind);

No __init please on declarations; section placement attributes make sense only
on definitions (with pretty narrow exceptions).

> --- a/xen/arch/ppc/setup.c
> +++ b/xen/arch/ppc/setup.c
> @@ -1,16 +1,296 @@
>  /* SPDX-License-Identifier: GPL-2.0-or-later */
>  #include <xen/init.h>
>  #include <xen/lib.h>
> +#include <xen/libfdt/libfdt.h>
>  #include <xen/mm.h>
>  #include <public/version.h>
>  #include <asm/boot.h>
>  #include <asm/early_printk.h>
>  #include <asm/mm.h>
>  #include <asm/processor.h>
> +#include <asm/setup.h>
>  
>  /* Xen stack for bringing up the first CPU. */
>  unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE);
>  
> +struct bootinfo __initdata bootinfo;
> +
> +void __init add_boot_cmdline(const char *name, const char *cmdline,
> +                             bootmodule_kind kind, paddr_t start, bool domU)
> +{
> +    struct bootcmdlines *cmds = &bootinfo.cmdlines;
> +    struct bootcmdline *cmd;
> +
> +    if ( cmds->nr_mods == MAX_MODULES )
> +    {
> +        printk("Ignoring %s cmdline (too many)\n", name);
> +        return;
> +    }
> +
> +    cmd = &cmds->cmdline[cmds->nr_mods++];
> +    cmd->kind = kind;
> +    cmd->domU = domU;
> +    cmd->start = start;
> +
> +    ASSERT(strlen(name) <= DT_MAX_NAME);
> +    safe_strcpy(cmd->dt_name, name);
> +
> +    if ( strlen(cmdline) > BOOTMOD_MAX_CMDLINE )
> +        panic("module %s command line too long\n", name);
> +    safe_strcpy(cmd->cmdline, cmdline);
> +}
> +
> +struct bootmodule __init *add_boot_module(bootmodule_kind kind,
> +                                          paddr_t start, paddr_t size,
> +                                          bool domU)
> +{
> +    struct bootmodules *mods = &bootinfo.modules;
> +    struct bootmodule *mod;
> +    unsigned int i;
> +
> +    if ( mods->nr_mods == MAX_MODULES )
> +    {
> +        printk("Ignoring %s boot module at %"PRIpaddr"-%"PRIpaddr" (too many)\n",
> +               boot_module_kind_as_string(kind), start, start + size);
> +        return NULL;
> +    }
> +
> +    if ( check_reserved_regions_overlap(start, size) )
> +        return NULL;
> +
> +    for ( i = 0 ; i < mods->nr_mods ; i++ )
> +    {
> +        mod = &mods->module[i];
> +        if ( mod->kind == kind && mod->start == start )
> +        {
> +            if ( !domU )
> +                mod->domU = false;

What's the intention behind this (negative) accumulation of "domU"?

> +            return mod;
> +        }
> +    }

And what's the intention behind checking existing modules here (but not
existing command lines higher up in add_boot_cmdline())?

> +    mod = &mods->module[mods->nr_mods++];
> +    mod->kind = kind;
> +    mod->start = start;
> +    mod->size = size;
> +    mod->domU = domU;
> +
> +    return mod;
> +}
> +
> +const char * __init boot_module_kind_as_string(bootmodule_kind kind)
> +{
> +    switch ( kind )
> +    {
> +    case BOOTMOD_XEN:     return "Xen";
> +    case BOOTMOD_FDT:     return "Device Tree";
> +    case BOOTMOD_KERNEL:  return "Kernel";
> +    default: BUG();
> +    }
> +}
> +
> +/*
> + * TODO: '*_end' could be 0 if the module/region is at the end of the physical
> + * address space. This is for now not handled as it requires more rework.
> + */
> +static bool __init bootmodules_overlap_check(struct bootmodules *bootmodules,
> +                                             paddr_t region_start,
> +                                             paddr_t region_size)
> +{
> +    paddr_t mod_start = INVALID_PADDR, mod_end = 0;

Pointless initializers? (The variables might benefit from moving into
the more narrow scope anyway.)

> +    paddr_t region_end = region_start + region_size;
> +    unsigned int i, mod_num = bootmodules->nr_mods;
> +
> +    for ( i = 0; i < mod_num; i++ )
> +    {
> +        mod_start = bootmodules->module[i].start;
> +        mod_end = mod_start + bootmodules->module[i].size;
> +
> +        if ( region_end <= mod_start || region_start >= mod_end )
> +            continue;
> +        else

I for one consider such an "else" misleading.

> +        {
> +            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping with"
> +                   " mod[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n", region_start,
> +                   region_end, i, mod_start, mod_end);
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +/*
> + * TODO: '*_end' could be 0 if the bank/region is at the end of the physical
> + * address space. This is for now not handled as it requires more rework.
> + */
> +static bool __init meminfo_overlap_check(struct meminfo *meminfo,
> +                                         paddr_t region_start,
> +                                         paddr_t region_size)
> +{
> +    paddr_t bank_start = INVALID_PADDR, bank_end = 0;

As above: Pointless initializers? (The variables might benefit from moving
into the more narrow scope anyway.)

> +    paddr_t region_end = region_start + region_size;
> +    unsigned int i, bank_num = meminfo->nr_banks;
> +
> +    for ( i = 0; i < bank_num; i++ )
> +    {
> +        bank_start = meminfo->bank[i].start;
> +        bank_end = bank_start + meminfo->bank[i].size;
> +
> +        if ( region_end <= bank_start || region_start >= bank_end )
> +            continue;
> +        else
> +        {
> +            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping with"
> +                   " bank[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n", region_start,
> +                   region_end, i, bank_start, bank_end);

I think this is confusing to read: When the format string already needs
wrapping, I don't think the next argument should be put on the same line.
I was almost going to complain that arguments don't fit the format string.

> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +/*
> + * Given an input physical address range, check if this range is overlapping
> + * with the existing reserved memory regions defined in bootinfo.
> + * Return true if the input physical address range is overlapping with any
> + * existing reserved memory regions, otherwise false.
> + */
> +bool __init check_reserved_regions_overlap(paddr_t region_start,
> +                                           paddr_t region_size)
> +{
> +    /* Check if input region is overlapping with bootinfo.reserved_mem banks */
> +    if ( meminfo_overlap_check(&bootinfo.reserved_mem,
> +                               region_start, region_size) )
> +        return true;
> +
> +    /* Check if input region is overlapping with bootmodules */
> +    if ( bootmodules_overlap_check(&bootinfo.modules,
> +                                   region_start, region_size) )
> +        return true;
> +
> +    return false;
> +}
> +
> +/*
> + * Return the end of the non-module region starting at s. In other
> + * words return s the start of the next modules after s.

Stray " s" after "return"?

> + * On input *end is the end of the region which should be considered
> + * and it is updated to reflect the end of the module, clipped to the
> + * end of the region if it would run over.
> + */
> +static paddr_t __init next_module(paddr_t s, paddr_t *end)
> +{
> +    struct bootmodules *mi = &bootinfo.modules;

const?

> +    paddr_t lowest = ~(paddr_t)0;
> +    int i;

unsigned int?

> +    for ( i = 0; i < mi->nr_mods; i++ )
> +    {
> +        paddr_t mod_s = mi->module[i].start;
> +        paddr_t mod_e = mod_s + mi->module[i].size;
> +
> +        if ( !mi->module[i].size )
> +            continue;
> +
> +        if ( mod_s < s )
> +            continue;
> +        if ( mod_s > lowest )
> +            continue;
> +        if ( mod_s > *end )
> +            continue;
> +        lowest = mod_s;
> +        *end = min(*end, mod_e);
> +    }
> +    return lowest;
> +}
> +
> +static void __init dt_unreserved_regions(paddr_t s, paddr_t e,
> +                                         void (*cb)(paddr_t ps, paddr_t pe),
> +                                         unsigned int first)
> +{
> +    unsigned int i;
> +
> +    for ( i = 0 ; i < bootinfo.reserved_mem.nr_banks; i++ )
> +    {
> +        paddr_t r_s = bootinfo.reserved_mem.bank[i].start;
> +        paddr_t r_e = r_s + bootinfo.reserved_mem.bank[i].size;
> +
> +        if ( s < r_e && r_s < e )
> +        {
> +            dt_unreserved_regions(r_e, e, cb, i + 1);
> +            dt_unreserved_regions(s, r_s, cb, i + 1);

What's the reason for this recursion? Can there be overlapping regions?
If so, is there a guaranteed depth limit (seeing in particular that
you're not using the last function parameter)?

> +            return;
> +        }
> +    }
> +
> +    cb(s, e);
> +}
> +
> +/*
> + * boot_cmdline_find_by_kind can only be used to return Xen modules (e.g
> + * XSM, DTB) or Dom0 modules. This is not suitable for looking up guest
> + * modules.
> + */
> +struct bootcmdline * __init boot_cmdline_find_by_kind(bootmodule_kind kind)

This function looks to be unused?

> +{
> +    struct bootcmdlines *cmds = &bootinfo.cmdlines;
> +    struct bootcmdline *cmd;
> +    int i;

unsigned int?

> +    for ( i = 0 ; i < cmds->nr_mods ; i++ )
> +    {
> +        cmd = &cmds->cmdline[i];
> +        if ( cmd->kind == kind && !cmd->domU )
> +            return cmd;
> +    }
> +    return NULL;
> +}
> +
> +/*
> + * Populate the boot allocator. Based on arch/arm/setup.c's
> + * populate_boot_allocator.
> + * All RAM but the following regions will be added to the boot allocator:
> + *  - Modules (e.g., Xen, Kernel)
> + *  - Reserved regions
> + */
> +static void __init populate_boot_allocator(void)
> +{
> +    unsigned int i;
> +    const struct meminfo *banks = &bootinfo.mem;
> +    paddr_t s, e;
> +
> +    for ( i = 0; i < banks->nr_banks; i++ )
> +    {
> +        const struct membank *bank = &banks->bank[i];
> +        paddr_t bank_end = bank->start + bank->size;

Since you already make use of inner scope variables, and reason not
to put s here and ...

> +        s = bank->start;
> +        while ( s < bank_end )
> +        {
> +            paddr_t n = bank_end;

... e even here (each then gaining an initializer as well)?

> +            e = next_module(s, &n);
> +
> +            if ( e == ~(paddr_t)0 )
> +                e = n = bank_end;
> +
> +            /*
> +             * Module in a RAM bank other than the one which we are
> +             * not dealing with here.

Nit: "not"?

> +             */
> +            if ( e > bank_end )
> +                e = bank_end;
> +
> +            dt_unreserved_regions(s, e, init_boot_pages, 0);
> +
> +            s = n;
> +        }
> +    }
> +}
> +
>  void setup_exceptions(void)
>  {
>      unsigned long lpcr;
> @@ -24,6 +304,8 @@ void __init noreturn start_xen(unsigned long r3, unsigned long r4,
>                                 unsigned long r5, unsigned long r6,
>                                 unsigned long r7)
>  {
> +    void *boot_fdt;

const?

> @@ -32,11 +314,16 @@ void __init noreturn start_xen(unsigned long r3, unsigned long r4,
>      else
>      {
>          /* kexec boot protocol */
> -        boot_opal_init((void *)r3);
> +        boot_fdt = (void *)r3;

And then here as well, to avoid Misra complaints.

> +        boot_opal_init(boot_fdt);
>      }
>  
>      setup_exceptions();
>  
> +    boot_fdt_info(boot_fdt, r3);
> +
> +    populate_boot_allocator();
> +
>      setup_initial_pagetables();
>  
>      early_printk("Hello, ppc64le!\n");
> --- a/xen/common/device-tree/bootfdt.c
> +++ b/xen/common/device-tree/bootfdt.c
> @@ -543,12 +543,33 @@ size_t __init boot_fdt_info(const void *fdt, paddr_t paddr)
>      if ( ret < 0 )
>          panic("No valid device tree\n");
>  
> -    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
> -
>      ret = device_tree_for_each_node((void *)fdt, 0, early_scan_node, NULL);
>      if ( ret )
>          panic("Early FDT parsing failed (%d)\n", ret);
>  
> +    /*
> +     * Add module for the FDT itself after the device tree has been parsed. This
> +     * is required on ppc64le where the device tree passed to Xen may have been
> +     * allocated by skiboot, in which case it will exist within a reserved
> +     * region and this call will fail. This is fine, however, since either way
> +     * the allocator will know not to step on the device tree.
> +     */
> +    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
> +
> +    /*
> +     * Xen relocates itself at the ppc64 entrypoint, so we need to manually mark
> +     * the kernel module.
> +     */
> +    if ( IS_ENABLED(CONFIG_PPC64) ) {

Nit (style): Brace placement.

> +        paddr_t xen_start, xen_end;
> +
> +        xen_start = __pa(_start);
> +        xen_end = PAGE_ALIGN(__pa(_end));
> +        if ( !add_boot_module(BOOTMOD_XEN, xen_start, xen_end, false) )
> +            panic("Xen overlaps reserved memory! %016lx - %016lx\n", xen_start,
> +                xen_end);
> +    }

I'll need to leave commenting on this to the DT maintainers.

Jan
Julien Grall Dec. 20, 2023, 1:23 p.m. UTC | #2
Hi,

On 15/12/2023 02:44, Shawn Anastasio wrote:
> Move PPC off the asm-generic setup.h and enable usage of bootfdt for
> populating the boot info struct from the firmware-provided device tree.
> Also enable the Xen boot page allocator.
> 
> Includes minor changes to bootfdt.c's boot_fdt_info() to tolerate the
> scenario in which the FDT overlaps a reserved memory region, as is the
> case on PPC when booted directly from skiboot. Also includes a minor
> change to record Xen's correct position on PPC where Xen relocates
> itself to at the entrypoint.
> 
> Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
> ---
>   xen/arch/ppc/include/asm/Makefile |   1 -
>   xen/arch/ppc/include/asm/setup.h  | 123 +++++++++++++
>   xen/arch/ppc/setup.c              | 289 +++++++++++++++++++++++++++++-

I might be missing something. But isn't most of the code you add is the 
same as Arm? And if so, shouldn't this be consolidated?

I would also expect RISC-V to need the same.

Cheers,
Julien Grall Dec. 20, 2023, 1:49 p.m. UTC | #3
Hi,

On 15/12/2023 02:44, Shawn Anastasio wrote:
> diff --git a/xen/common/device-tree/bootfdt.c b/xen/common/device-tree/bootfdt.c
> index 796ac01c18..7ddfcc7e2a 100644
> --- a/xen/common/device-tree/bootfdt.c
> +++ b/xen/common/device-tree/bootfdt.c
> @@ -543,12 +543,33 @@ size_t __init boot_fdt_info(const void *fdt, paddr_t paddr)
>       if ( ret < 0 )
>           panic("No valid device tree\n");
>   
> -    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
> -
>       ret = device_tree_for_each_node((void *)fdt, 0, early_scan_node, NULL);
>       if ( ret )
>           panic("Early FDT parsing failed (%d)\n", ret);
>   
> +    /*
> +     * Add module for the FDT itself after the device tree has been parsed. This
> +     * is required on ppc64le where the device tree passed to Xen may have been
> +     * allocated by skiboot, in which case it will exist within a reserved
> +     * region and this call will fail. This is fine, however, since either way
> +     * the allocator will know not to step on the device tree.
> +     */
> +    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);

The consequence is BOOTMOD_FDT will not be added. AFAICT, Arm doesn't 
try to access BOOTMOD_FDT, but I think it would be worth to print message.

> +
> +    /*
> +     * Xen relocates itself at the ppc64 entrypoint, so we need to manually mark
> +     * the kernel module.
> +     */
> +    if ( IS_ENABLED(CONFIG_PPC64) ) {
> +        paddr_t xen_start, xen_end;
> +
> +        xen_start = __pa(_start);
> +        xen_end = PAGE_ALIGN(__pa(_end));
> +        if ( !add_boot_module(BOOTMOD_XEN, xen_start, xen_end, false) )
> +            panic("Xen overlaps reserved memory! %016lx - %016lx\n", xen_start,
> +                xen_end);
> +    }

Can you explain why this is added here rather than in the caller of 
boot_fdt_info()? Either before or after?

If after, I guess it is because of early_print_info(). In which case, I 
would suggest to move off early_print_info() to caller on each architecture.

Cheers,
Julien Grall Dec. 20, 2023, 4:07 p.m. UTC | #4
Hi Shawn,

On 20/12/2023 13:23, Julien Grall wrote:
> Hi,
> 
> On 15/12/2023 02:44, Shawn Anastasio wrote:
>> Move PPC off the asm-generic setup.h and enable usage of bootfdt for
>> populating the boot info struct from the firmware-provided device tree.
>> Also enable the Xen boot page allocator.
>>
>> Includes minor changes to bootfdt.c's boot_fdt_info() to tolerate the
>> scenario in which the FDT overlaps a reserved memory region, as is the
>> case on PPC when booted directly from skiboot. Also includes a minor
>> change to record Xen's correct position on PPC where Xen relocates
>> itself to at the entrypoint.
>>
>> Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
>> ---
>>   xen/arch/ppc/include/asm/Makefile |   1 -
>>   xen/arch/ppc/include/asm/setup.h  | 123 +++++++++++++
>>   xen/arch/ppc/setup.c              | 289 +++++++++++++++++++++++++++++-
> 
> I might be missing something. But isn't most of the code you add is the 
> same as Arm? And if so, shouldn't this be consolidated?

So I have create a new file and move the PPC functions I think should be 
the same. Then I replace with the Arm code. Below the diff.

Looking through it, I can't see why the code cannot be shared in except 
part of populate_boot_allocator() (which could be split).

I forsee that some of the code you remove will be necessary (e.g. 
BOOTMOD_RAMDISK, BOOTMOD_XSM). And some of the style change doesn't 
match what we do in Xen (messages are not split).

If there are common bits with other architectures, then I would really 
like if they are shared rather than duplicated. I am more than happy to 
help finding a good split because it will reduce maintenance effort for 
everyone in the longer run.

Cheers,

diff --git a/xen/common/device-tree/setup.c b/xen/common/device-tree/setup.c
index 3f79b97e9036..9c376527d11b 100644
--- a/xen/common/device-tree/setup.c
+++ b/xen/common/device-tree/setup.c
@@ -70,6 +70,10 @@ const char * __init 
boot_module_kind_as_string(bootmodule_kind kind)
      case BOOTMOD_XEN:     return "Xen";
      case BOOTMOD_FDT:     return "Device Tree";
      case BOOTMOD_KERNEL:  return "Kernel";
+    case BOOTMOD_RAMDISK: return "Ramdisk";
+    case BOOTMOD_XSM:     return "XSM";
+    case BOOTMOD_GUEST_DTB:     return "DTB";
+    case BOOTMOD_UNKNOWN: return "Unknown";
      default: BUG();
      }
  }
@@ -95,9 +99,8 @@ static bool __init bootmodules_overlap_check(struct 
bootmodules *bootmodules,
              continue;
          else
          {
-            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping with"
-                   " mod[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n", 
region_start,
-                   region_end, i, mod_start, mod_end);
+            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping 
with mod[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n",
+                   region_start, region_end, i, mod_start, mod_end);
              return true;
          }
      }
@@ -126,9 +129,8 @@ static bool __init meminfo_overlap_check(struct 
meminfo *meminfo,
              continue;
          else
          {
-            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping with"
-                   " bank[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n", 
region_start,
-                   region_end, i, bank_start, bank_end);
+            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping 
with bank[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n",
+                   region_start, region_end, i, bank_start, bank_end);
              return true;
          }
      }
@@ -155,6 +157,12 @@ bool __init check_reserved_regions_overlap(paddr_t 
region_start,
                                     region_start, region_size) )
          return true;

+#ifdef CONFIG_ACPI
+    /* Check if input region is overlapping with ACPI 
EfiACPIReclaimMemory */
+    if ( meminfo_overlap_check(&bootinfo.acpi, region_start, region_size) )
+        return true;
+#endif
+
      return false;
  }

@@ -196,12 +204,47 @@ static void __init dt_unreserved_regions(paddr_t 
s, paddr_t e,
                                           void (*cb)(paddr_t ps, 
paddr_t pe),
                                           unsigned int first)
  {
-    unsigned int i;
+    unsigned int i, nr;
+    int rc;
+
+    rc = fdt_num_mem_rsv(device_tree_flattened);
+    if ( rc < 0 )
+        panic("Unable to retrieve the number of reserved regions 
(rc=%d)\n",
+              rc);

-    for ( i = 0 ; i < bootinfo.reserved_mem.nr_banks; i++ )
+    nr = rc;
+
+    for ( i = first; i < nr ; i++ )
      {
-        paddr_t r_s = bootinfo.reserved_mem.bank[i].start;
-        paddr_t r_e = r_s + bootinfo.reserved_mem.bank[i].size;
+        paddr_t r_s, r_e;
+
+        if ( fdt_get_mem_rsv_paddr(device_tree_flattened, i, &r_s, &r_e 
) < 0 )
+            /* If we can't read it, pretend it doesn't exist... */
+            continue;
+
+        r_e += r_s; /* fdt_get_mem_rsv_paddr returns length */
+
+        if ( s < r_e && r_s < e )
+        {
+            dt_unreserved_regions(r_e, e, cb, i+1);
+            dt_unreserved_regions(s, r_s, cb, i+1);
+            return;
+        }
+    }
+
+    /*
+     * i is the current bootmodule we are evaluating across all possible
+     * kinds.
+     *
+     * When retrieving the corresponding reserved-memory addresses
+     * below, we need to index the bootinfo.reserved_mem bank starting
+     * from 0, and only counting the reserved-memory modules. Hence,
+     * we need to use i - nr.
+     */
+    for ( ; i - nr < bootinfo.reserved_mem.nr_banks; i++ )
+    {
+        paddr_t r_s = bootinfo.reserved_mem.bank[i - nr].start;
+        paddr_t r_e = r_s + bootinfo.reserved_mem.bank[i - nr].size;

          if ( s < r_e && r_s < e )
          {
@@ -214,6 +257,16 @@ static void __init dt_unreserved_regions(paddr_t s, 
paddr_t e,
      cb(s, e);
  }

+void __init fw_unreserved_regions(paddr_t s, paddr_t e,
+                                  void (*cb)(paddr_t ps, paddr_t pe),
+                                  unsigned int first)
+{
+    if ( acpi_disabled )
+        dt_unreserved_regions(s, e, cb, first);
+    else
+        cb(s, e);
+}
+
  /*
   * boot_cmdline_find_by_kind can only be used to return Xen modules (e.g
   * XSM, DTB) or Dom0 modules. This is not suitable for looking up guest
@@ -235,18 +288,47 @@ struct bootcmdline * __init 
boot_cmdline_find_by_kind(bootmodule_kind kind)
  }

  /*
- * Populate the boot allocator. Based on arch/arm/setup.c's
- * populate_boot_allocator.
- * All RAM but the following regions will be added to the boot allocator:
+ * Populate the boot allocator.
+ * If a static heap was not provided by the admin, all the RAM but the
+ * following regions will be added:
   *  - Modules (e.g., Xen, Kernel)
   *  - Reserved regions
+ *  - Xenheap (arm32 only)
+ * If a static heap was provided by the admin, populate the boot
+ * allocator with the corresponding regions only, but with Xenheap excluded
+ * on arm32.
   */
-static void __init populate_boot_allocator(void)
+void __init populate_boot_allocator(void)
  {
      unsigned int i;
      const struct meminfo *banks = &bootinfo.mem;
      paddr_t s, e;

+    if ( bootinfo.static_heap )
+    {
+        for ( i = 0 ; i < bootinfo.reserved_mem.nr_banks; i++ )
+        {
+            if ( bootinfo.reserved_mem.bank[i].type != 
MEMBANK_STATIC_HEAP )
+                continue;
+
+            s = bootinfo.reserved_mem.bank[i].start;
+            e = s + bootinfo.reserved_mem.bank[i].size;
+#ifdef CONFIG_ARM_32
+            /* Avoid the xenheap, note that the xenheap cannot across a 
bank */
+            if ( s <= mfn_to_maddr(directmap_mfn_start) &&
+                 e >= mfn_to_maddr(directmap_mfn_end) )
+            {
+                init_boot_pages(s, mfn_to_maddr(directmap_mfn_start));
+                init_boot_pages(mfn_to_maddr(directmap_mfn_end), e);
+            }
+            else
+#endif
+                init_boot_pages(s, e);
+        }
+
+        return;
+    }
+
      for ( i = 0; i < banks->nr_banks; i++ )
      {
          const struct membank *bank = &banks->bank[i];
@@ -269,11 +351,18 @@ static void __init populate_boot_allocator(void)
              if ( e > bank_end )
                  e = bank_end;

-            dt_unreserved_regions(s, e, init_boot_pages, 0);
-
+#ifdef CONFIG_ARM_32
+            /* Avoid the xenheap */
+            if ( s < mfn_to_maddr(directmap_mfn_end) &&
+                 mfn_to_maddr(directmap_mfn_start) < e )
+            {
+                e = mfn_to_maddr(directmap_mfn_start);
+                n = mfn_to_maddr(directmap_mfn_end);
+            }
+#endif
+
+            fw_unreserved_regions(s, e, init_boot_pages, 0);
              s = n;
          }
      }
  }
-
-


> 
> I would also expect RISC-V to need the same.
> 
> Cheers,
>
Shawn Anastasio Jan. 18, 2024, 1:36 a.m. UTC | #5
Hi Julien,

On 12/20/23 7:49 AM, Julien Grall wrote:
> Hi,
> 
> On 15/12/2023 02:44, Shawn Anastasio wrote:
>> diff --git a/xen/common/device-tree/bootfdt.c
>> b/xen/common/device-tree/bootfdt.c
>> index 796ac01c18..7ddfcc7e2a 100644
>> --- a/xen/common/device-tree/bootfdt.c
>> +++ b/xen/common/device-tree/bootfdt.c
>> @@ -543,12 +543,33 @@ size_t __init boot_fdt_info(const void *fdt,
>> paddr_t paddr)
>>       if ( ret < 0 )
>>           panic("No valid device tree\n");
>>   -    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
>> -
>>       ret = device_tree_for_each_node((void *)fdt, 0, early_scan_node,
>> NULL);
>>       if ( ret )
>>           panic("Early FDT parsing failed (%d)\n", ret);
>>   +    /*
>> +     * Add module for the FDT itself after the device tree has been
>> parsed. This
>> +     * is required on ppc64le where the device tree passed to Xen may
>> have been
>> +     * allocated by skiboot, in which case it will exist within a
>> reserved
>> +     * region and this call will fail. This is fine, however, since
>> either way
>> +     * the allocator will know not to step on the device tree.
>> +     */
>> +    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
> 
> The consequence is BOOTMOD_FDT will not be added. AFAICT, Arm doesn't
> try to access BOOTMOD_FDT, but I think it would be worth to print message.
>

A message will already be printed by `meminfo_overlap_check` in setup.c
when this condition is hit, like this:

(XEN) Region: [0x0000003076e9b0, 0x00000030775215) overlapping with
bank[3]: [0x00000030600000, 0x00000031000000)

If you'd like me to add another more descriptive message here to let the
user know that BOOTMOD_FDT wasn't added I could do that, though.

>> +
>> +    /*
>> +     * Xen relocates itself at the ppc64 entrypoint, so we need to
>> manually mark
>> +     * the kernel module.
>> +     */
>> +    if ( IS_ENABLED(CONFIG_PPC64) ) {
>> +        paddr_t xen_start, xen_end;
>> +
>> +        xen_start = __pa(_start);
>> +        xen_end = PAGE_ALIGN(__pa(_end));
>> +        if ( !add_boot_module(BOOTMOD_XEN, xen_start, xen_end, false) )
>> +            panic("Xen overlaps reserved memory! %016lx - %016lx\n",
>> xen_start,
>> +                xen_end);
>> +    }
> 
> Can you explain why this is added here rather than in the caller of
> boot_fdt_info()? Either before or after?
> 
> If after, I guess it is because of early_print_info(). In which case, I
> would suggest to move off early_print_info() to caller on each
> architecture.
>

Right, it can't be after due to the early_print_info() call, but doing
it before actually works fine. Thank you for pointing that out -- I'll
change this in the next revision.

> Cheers,
>

Thanks,
Shawn
Julien Grall Jan. 18, 2024, 9:21 a.m. UTC | #6
Hi Shawn,

On 18/01/2024 01:36, Shawn Anastasio wrote:
> Hi Julien,
> 
> On 12/20/23 7:49 AM, Julien Grall wrote:
>> Hi,
>>
>> On 15/12/2023 02:44, Shawn Anastasio wrote:
>>> diff --git a/xen/common/device-tree/bootfdt.c
>>> b/xen/common/device-tree/bootfdt.c
>>> index 796ac01c18..7ddfcc7e2a 100644
>>> --- a/xen/common/device-tree/bootfdt.c
>>> +++ b/xen/common/device-tree/bootfdt.c
>>> @@ -543,12 +543,33 @@ size_t __init boot_fdt_info(const void *fdt,
>>> paddr_t paddr)
>>>        if ( ret < 0 )
>>>            panic("No valid device tree\n");
>>>    -    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
>>> -
>>>        ret = device_tree_for_each_node((void *)fdt, 0, early_scan_node,
>>> NULL);
>>>        if ( ret )
>>>            panic("Early FDT parsing failed (%d)\n", ret);
>>>    +    /*
>>> +     * Add module for the FDT itself after the device tree has been
>>> parsed. This
>>> +     * is required on ppc64le where the device tree passed to Xen may
>>> have been
>>> +     * allocated by skiboot, in which case it will exist within a
>>> reserved
>>> +     * region and this call will fail. This is fine, however, since
>>> either way
>>> +     * the allocator will know not to step on the device tree.
>>> +     */
>>> +    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
>>
>> The consequence is BOOTMOD_FDT will not be added. AFAICT, Arm doesn't
>> try to access BOOTMOD_FDT, but I think it would be worth to print message.
>>
> 
> A message will already be printed by `meminfo_overlap_check` in setup.c
> when this condition is hit, like this:
> 
> (XEN) Region: [0x0000003076e9b0, 0x00000030775215) overlapping with
> bank[3]: [0x00000030600000, 0x00000031000000)

This is somewhat non-descriptive. IOW it doesn't tell me that the 
overlap was with the FDT. But I guess this can be improved (this is not 
a request for you to send another patch).

Cheers,
diff mbox series

Patch

diff --git a/xen/arch/ppc/include/asm/Makefile b/xen/arch/ppc/include/asm/Makefile
index 7167661f86..a711cfa856 100644
--- a/xen/arch/ppc/include/asm/Makefile
+++ b/xen/arch/ppc/include/asm/Makefile
@@ -6,6 +6,5 @@  generic-y += iocap.h
 generic-y += paging.h
 generic-y += percpu.h
 generic-y += random.h
-generic-y += setup.h
 generic-y += static-shmem.h
 generic-y += vm_event.h
diff --git a/xen/arch/ppc/include/asm/setup.h b/xen/arch/ppc/include/asm/setup.h
new file mode 100644
index 0000000000..c0f700fc07
--- /dev/null
+++ b/xen/arch/ppc/include/asm/setup.h
@@ -0,0 +1,123 @@ 
+#ifndef __ASM_PPC_SETUP_H__
+#define __ASM_PPC_SETUP_H__
+
+#define max_init_domid (0)
+
+#include <public/version.h>
+#include <asm/p2m.h>
+#include <xen/device_tree.h>
+
+#define MIN_FDT_ALIGN 8
+#define MAX_FDT_SIZE SZ_2M
+
+#define NR_MEM_BANKS 256
+
+#define MAX_MODULES 32 /* Current maximum useful modules */
+
+typedef enum {
+    BOOTMOD_XEN,
+    BOOTMOD_FDT,
+    BOOTMOD_KERNEL,
+    BOOTMOD_RAMDISK,
+    BOOTMOD_XSM,
+    BOOTMOD_GUEST_DTB,
+    BOOTMOD_UNKNOWN
+}  bootmodule_kind;
+
+enum membank_type {
+    /*
+     * The MEMBANK_DEFAULT type refers to either reserved memory for the
+     * device/firmware (when the bank is in 'reserved_mem') or any RAM (when
+     * the bank is in 'mem').
+     */
+    MEMBANK_DEFAULT,
+    /*
+     * The MEMBANK_STATIC_DOMAIN type is used to indicate whether the memory
+     * bank is bound to a static Xen domain. It is only valid when the bank
+     * is in reserved_mem.
+     */
+    MEMBANK_STATIC_DOMAIN,
+    /*
+     * The MEMBANK_STATIC_HEAP type is used to indicate whether the memory
+     * bank is reserved as static heap. It is only valid when the bank is
+     * in reserved_mem.
+     */
+    MEMBANK_STATIC_HEAP,
+};
+
+/* Indicates the maximum number of characters(\0 included) for shm_id */
+#define MAX_SHM_ID_LENGTH 16
+
+struct membank {
+    paddr_t start;
+    paddr_t size;
+    enum membank_type type;
+};
+
+struct meminfo {
+    unsigned int nr_banks;
+    struct membank bank[NR_MEM_BANKS];
+};
+
+/*
+ * The domU flag is set for kernels and ramdisks of "xen,domain" nodes.
+ * The purpose of the domU flag is to avoid getting confused in
+ * kernel_probe, where we try to guess which is the dom0 kernel and
+ * initrd to be compatible with all versions of the multiboot spec.
+ */
+#define BOOTMOD_MAX_CMDLINE 1024
+struct bootmodule {
+    bootmodule_kind kind;
+    bool domU;
+    paddr_t start;
+    paddr_t size;
+};
+
+/* DT_MAX_NAME is the node name max length according the DT spec */
+#define DT_MAX_NAME 41
+struct bootcmdline {
+    bootmodule_kind kind;
+    bool domU;
+    paddr_t start;
+    char dt_name[DT_MAX_NAME];
+    char cmdline[BOOTMOD_MAX_CMDLINE];
+};
+
+struct bootmodules {
+    int nr_mods;
+    struct bootmodule module[MAX_MODULES];
+};
+
+struct bootcmdlines {
+    unsigned int nr_mods;
+    struct bootcmdline cmdline[MAX_MODULES];
+};
+
+struct bootinfo {
+    struct meminfo mem;
+    struct meminfo reserved_mem;
+    struct bootmodules modules;
+    struct bootcmdlines cmdlines;
+    bool static_heap;
+};
+
+extern struct bootinfo bootinfo;
+
+/*
+ * setup.c
+ */
+
+bool check_reserved_regions_overlap(paddr_t region_start, paddr_t region_size);
+struct bootmodule *add_boot_module(bootmodule_kind kind,
+                                   paddr_t start, paddr_t size, bool domU);
+void add_boot_cmdline(const char *name, const char *cmdline,
+                      bootmodule_kind kind, paddr_t start, bool domU);
+const char *boot_module_kind_as_string(bootmodule_kind kind);
+struct bootcmdline * __init boot_cmdline_find_by_kind(bootmodule_kind kind);
+
+/*
+ * bootfdt.c
+ */
+size_t boot_fdt_info(const void *fdt, paddr_t paddr);
+
+#endif /* __ASM_PPC_SETUP_H__ */
diff --git a/xen/arch/ppc/setup.c b/xen/arch/ppc/setup.c
index 101bdd8bb6..143f2e449e 100644
--- a/xen/arch/ppc/setup.c
+++ b/xen/arch/ppc/setup.c
@@ -1,16 +1,296 @@ 
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #include <xen/init.h>
 #include <xen/lib.h>
+#include <xen/libfdt/libfdt.h>
 #include <xen/mm.h>
 #include <public/version.h>
 #include <asm/boot.h>
 #include <asm/early_printk.h>
 #include <asm/mm.h>
 #include <asm/processor.h>
+#include <asm/setup.h>
 
 /* Xen stack for bringing up the first CPU. */
 unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE);
 
+struct bootinfo __initdata bootinfo;
+
+void __init add_boot_cmdline(const char *name, const char *cmdline,
+                             bootmodule_kind kind, paddr_t start, bool domU)
+{
+    struct bootcmdlines *cmds = &bootinfo.cmdlines;
+    struct bootcmdline *cmd;
+
+    if ( cmds->nr_mods == MAX_MODULES )
+    {
+        printk("Ignoring %s cmdline (too many)\n", name);
+        return;
+    }
+
+    cmd = &cmds->cmdline[cmds->nr_mods++];
+    cmd->kind = kind;
+    cmd->domU = domU;
+    cmd->start = start;
+
+    ASSERT(strlen(name) <= DT_MAX_NAME);
+    safe_strcpy(cmd->dt_name, name);
+
+    if ( strlen(cmdline) > BOOTMOD_MAX_CMDLINE )
+        panic("module %s command line too long\n", name);
+    safe_strcpy(cmd->cmdline, cmdline);
+}
+
+struct bootmodule __init *add_boot_module(bootmodule_kind kind,
+                                          paddr_t start, paddr_t size,
+                                          bool domU)
+{
+    struct bootmodules *mods = &bootinfo.modules;
+    struct bootmodule *mod;
+    unsigned int i;
+
+    if ( mods->nr_mods == MAX_MODULES )
+    {
+        printk("Ignoring %s boot module at %"PRIpaddr"-%"PRIpaddr" (too many)\n",
+               boot_module_kind_as_string(kind), start, start + size);
+        return NULL;
+    }
+
+    if ( check_reserved_regions_overlap(start, size) )
+        return NULL;
+
+    for ( i = 0 ; i < mods->nr_mods ; i++ )
+    {
+        mod = &mods->module[i];
+        if ( mod->kind == kind && mod->start == start )
+        {
+            if ( !domU )
+                mod->domU = false;
+            return mod;
+        }
+    }
+
+    mod = &mods->module[mods->nr_mods++];
+    mod->kind = kind;
+    mod->start = start;
+    mod->size = size;
+    mod->domU = domU;
+
+    return mod;
+}
+
+const char * __init boot_module_kind_as_string(bootmodule_kind kind)
+{
+    switch ( kind )
+    {
+    case BOOTMOD_XEN:     return "Xen";
+    case BOOTMOD_FDT:     return "Device Tree";
+    case BOOTMOD_KERNEL:  return "Kernel";
+    default: BUG();
+    }
+}
+
+/*
+ * TODO: '*_end' could be 0 if the module/region is at the end of the physical
+ * address space. This is for now not handled as it requires more rework.
+ */
+static bool __init bootmodules_overlap_check(struct bootmodules *bootmodules,
+                                             paddr_t region_start,
+                                             paddr_t region_size)
+{
+    paddr_t mod_start = INVALID_PADDR, mod_end = 0;
+    paddr_t region_end = region_start + region_size;
+    unsigned int i, mod_num = bootmodules->nr_mods;
+
+    for ( i = 0; i < mod_num; i++ )
+    {
+        mod_start = bootmodules->module[i].start;
+        mod_end = mod_start + bootmodules->module[i].size;
+
+        if ( region_end <= mod_start || region_start >= mod_end )
+            continue;
+        else
+        {
+            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping with"
+                   " mod[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n", region_start,
+                   region_end, i, mod_start, mod_end);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/*
+ * TODO: '*_end' could be 0 if the bank/region is at the end of the physical
+ * address space. This is for now not handled as it requires more rework.
+ */
+static bool __init meminfo_overlap_check(struct meminfo *meminfo,
+                                         paddr_t region_start,
+                                         paddr_t region_size)
+{
+    paddr_t bank_start = INVALID_PADDR, bank_end = 0;
+    paddr_t region_end = region_start + region_size;
+    unsigned int i, bank_num = meminfo->nr_banks;
+
+    for ( i = 0; i < bank_num; i++ )
+    {
+        bank_start = meminfo->bank[i].start;
+        bank_end = bank_start + meminfo->bank[i].size;
+
+        if ( region_end <= bank_start || region_start >= bank_end )
+            continue;
+        else
+        {
+            printk("Region: [%#"PRIpaddr", %#"PRIpaddr") overlapping with"
+                   " bank[%u]: [%#"PRIpaddr", %#"PRIpaddr")\n", region_start,
+                   region_end, i, bank_start, bank_end);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/*
+ * Given an input physical address range, check if this range is overlapping
+ * with the existing reserved memory regions defined in bootinfo.
+ * Return true if the input physical address range is overlapping with any
+ * existing reserved memory regions, otherwise false.
+ */
+bool __init check_reserved_regions_overlap(paddr_t region_start,
+                                           paddr_t region_size)
+{
+    /* Check if input region is overlapping with bootinfo.reserved_mem banks */
+    if ( meminfo_overlap_check(&bootinfo.reserved_mem,
+                               region_start, region_size) )
+        return true;
+
+    /* Check if input region is overlapping with bootmodules */
+    if ( bootmodules_overlap_check(&bootinfo.modules,
+                                   region_start, region_size) )
+        return true;
+
+    return false;
+}
+
+/*
+ * Return the end of the non-module region starting at s. In other
+ * words return s the start of the next modules after s.
+ *
+ * On input *end is the end of the region which should be considered
+ * and it is updated to reflect the end of the module, clipped to the
+ * end of the region if it would run over.
+ */
+static paddr_t __init next_module(paddr_t s, paddr_t *end)
+{
+    struct bootmodules *mi = &bootinfo.modules;
+    paddr_t lowest = ~(paddr_t)0;
+    int i;
+
+    for ( i = 0; i < mi->nr_mods; i++ )
+    {
+        paddr_t mod_s = mi->module[i].start;
+        paddr_t mod_e = mod_s + mi->module[i].size;
+
+        if ( !mi->module[i].size )
+            continue;
+
+        if ( mod_s < s )
+            continue;
+        if ( mod_s > lowest )
+            continue;
+        if ( mod_s > *end )
+            continue;
+        lowest = mod_s;
+        *end = min(*end, mod_e);
+    }
+    return lowest;
+}
+
+static void __init dt_unreserved_regions(paddr_t s, paddr_t e,
+                                         void (*cb)(paddr_t ps, paddr_t pe),
+                                         unsigned int first)
+{
+    unsigned int i;
+
+    for ( i = 0 ; i < bootinfo.reserved_mem.nr_banks; i++ )
+    {
+        paddr_t r_s = bootinfo.reserved_mem.bank[i].start;
+        paddr_t r_e = r_s + bootinfo.reserved_mem.bank[i].size;
+
+        if ( s < r_e && r_s < e )
+        {
+            dt_unreserved_regions(r_e, e, cb, i + 1);
+            dt_unreserved_regions(s, r_s, cb, i + 1);
+            return;
+        }
+    }
+
+    cb(s, e);
+}
+
+/*
+ * boot_cmdline_find_by_kind can only be used to return Xen modules (e.g
+ * XSM, DTB) or Dom0 modules. This is not suitable for looking up guest
+ * modules.
+ */
+struct bootcmdline * __init boot_cmdline_find_by_kind(bootmodule_kind kind)
+{
+    struct bootcmdlines *cmds = &bootinfo.cmdlines;
+    struct bootcmdline *cmd;
+    int i;
+
+    for ( i = 0 ; i < cmds->nr_mods ; i++ )
+    {
+        cmd = &cmds->cmdline[i];
+        if ( cmd->kind == kind && !cmd->domU )
+            return cmd;
+    }
+    return NULL;
+}
+
+/*
+ * Populate the boot allocator. Based on arch/arm/setup.c's
+ * populate_boot_allocator.
+ * All RAM but the following regions will be added to the boot allocator:
+ *  - Modules (e.g., Xen, Kernel)
+ *  - Reserved regions
+ */
+static void __init populate_boot_allocator(void)
+{
+    unsigned int i;
+    const struct meminfo *banks = &bootinfo.mem;
+    paddr_t s, e;
+
+    for ( i = 0; i < banks->nr_banks; i++ )
+    {
+        const struct membank *bank = &banks->bank[i];
+        paddr_t bank_end = bank->start + bank->size;
+
+        s = bank->start;
+        while ( s < bank_end )
+        {
+            paddr_t n = bank_end;
+
+            e = next_module(s, &n);
+
+            if ( e == ~(paddr_t)0 )
+                e = n = bank_end;
+
+            /*
+             * Module in a RAM bank other than the one which we are
+             * not dealing with here.
+             */
+            if ( e > bank_end )
+                e = bank_end;
+
+            dt_unreserved_regions(s, e, init_boot_pages, 0);
+
+            s = n;
+        }
+    }
+}
+
 void setup_exceptions(void)
 {
     unsigned long lpcr;
@@ -24,6 +304,8 @@  void __init noreturn start_xen(unsigned long r3, unsigned long r4,
                                unsigned long r5, unsigned long r6,
                                unsigned long r7)
 {
+    void *boot_fdt;
+
     if ( r5 )
     {
         /* Unsupported OpenFirmware boot protocol */
@@ -32,11 +314,16 @@  void __init noreturn start_xen(unsigned long r3, unsigned long r4,
     else
     {
         /* kexec boot protocol */
-        boot_opal_init((void *)r3);
+        boot_fdt = (void *)r3;
+        boot_opal_init(boot_fdt);
     }
 
     setup_exceptions();
 
+    boot_fdt_info(boot_fdt, r3);
+
+    populate_boot_allocator();
+
     setup_initial_pagetables();
 
     early_printk("Hello, ppc64le!\n");
diff --git a/xen/common/device-tree/bootfdt.c b/xen/common/device-tree/bootfdt.c
index 796ac01c18..7ddfcc7e2a 100644
--- a/xen/common/device-tree/bootfdt.c
+++ b/xen/common/device-tree/bootfdt.c
@@ -543,12 +543,33 @@  size_t __init boot_fdt_info(const void *fdt, paddr_t paddr)
     if ( ret < 0 )
         panic("No valid device tree\n");
 
-    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
-
     ret = device_tree_for_each_node((void *)fdt, 0, early_scan_node, NULL);
     if ( ret )
         panic("Early FDT parsing failed (%d)\n", ret);
 
+    /*
+     * Add module for the FDT itself after the device tree has been parsed. This
+     * is required on ppc64le where the device tree passed to Xen may have been
+     * allocated by skiboot, in which case it will exist within a reserved
+     * region and this call will fail. This is fine, however, since either way
+     * the allocator will know not to step on the device tree.
+     */
+    add_boot_module(BOOTMOD_FDT, paddr, fdt_totalsize(fdt), false);
+
+    /*
+     * Xen relocates itself at the ppc64 entrypoint, so we need to manually mark
+     * the kernel module.
+     */
+    if ( IS_ENABLED(CONFIG_PPC64) ) {
+        paddr_t xen_start, xen_end;
+
+        xen_start = __pa(_start);
+        xen_end = PAGE_ALIGN(__pa(_end));
+        if ( !add_boot_module(BOOTMOD_XEN, xen_start, xen_end, false) )
+            panic("Xen overlaps reserved memory! %016lx - %016lx\n", xen_start,
+                xen_end);
+    }
+
     /*
      * On Arm64 setup_directmap_mappings() expects to be called with the lowest
      * bank in memory first. There is no requirement that the DT will provide