diff mbox

[v2,18/30] xen/x86: setup PVHv2 Dom0 ACPI tables

Message ID 1474991845-27962-19-git-send-email-roger.pau@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roger Pau Monné Sept. 27, 2016, 3:57 p.m. UTC
This maps all the regions in the e820 marked as E820_ACPI or E820_NVS and
the top-level ACPI tables discovered by Xen to Dom0 1:1. It also shadows the
page(s) where the native MADT is placed by mapping a RAM page over it,
copying the original data and modifying it afterwards in order to represent
the real CPU topology exposed to Dom0.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
FWIW, I think that the current approach that I've used in order to craft the
MADT is not the best one, IMHO it would be better to place the MADT at the
end of the E820_ACPI region (expanding it's size one page), and modify the
XSDT/RSDT in order to point to it, that way we avoid shadowing any other
ACPI data that might be at the same page as the native MADT (and that needs
to be modified by Dom0).
---
 xen/arch/x86/domain_build.c | 274 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 274 insertions(+)

Comments

Jan Beulich Oct. 6, 2016, 3:40 p.m. UTC | #1
>>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
> FWIW, I think that the current approach that I've used in order to craft the
> MADT is not the best one, IMHO it would be better to place the MADT at the
> end of the E820_ACPI region (expanding it's size one page), and modify the
> XSDT/RSDT in order to point to it, that way we avoid shadowing any other
> ACPI data that might be at the same page as the native MADT (and that needs
> to be modified by Dom0).

I agree with the idea of placing MADT elsewhere, but I don't think you
can rely on E820_ACPI to have room to grow into right after its end.

> @@ -50,6 +53,8 @@ static long __initdata dom0_max_nrpages = LONG_MAX;
>  #define HVM_VM86_TSS_SIZE   128
>  
>  static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
> +static unsigned int __initdata acpi_intr_overrrides = 0;
> +static struct acpi_madt_interrupt_override __initdata *intsrcovr = NULL;

Pointless initializers.

> +static void __init acpi_zap_table_signature(char *name)
> +{
> +    struct acpi_table_header *table;
> +    acpi_status status;
> +    union {
> +        char str[ACPI_NAME_SIZE];
> +        uint32_t bits;
> +    } signature;
> +    char tmp;
> +    int i;
> +
> +    status = acpi_get_table(name, 0, &table);
> +    if ( ACPI_SUCCESS(status) )
> +    {
> +        memcpy(&signature.str[0], &table->signature[0], ACPI_NAME_SIZE);
> +        for ( i = 0; i < ACPI_NAME_SIZE / 2; i++ )
> +        {
> +            tmp = signature.str[ACPI_NAME_SIZE - i - 1];
> +            signature.str[ACPI_NAME_SIZE - i - 1] = signature.str[i];
> +            signature.str[i] = tmp;
> +        }
> +        write_atomic((uint32_t*)&table->signature[0], signature.bits);
> +    }
> +}

Together with moving MADT elsewhere we should also move
XSDT/RSDT, at which point we can simply avoid copying the
pointers for tables we don't want Dom0 to see (and down the
road we also have the option of adding tables).

The downside to both approaches is that this once again is a
black-listing model, i.e. new structure types we may need to
also suppress will remain visible to Dom0 until a patched
hypervisor would become available.

> +static int __init hvm_setup_acpi(struct domain *d)
> +{
> +    struct vcpu *saved_current, *v = d->vcpu[0];
> +    unsigned long pfn, nr_pages;
> +    uint64_t size, start_addr, end_addr;
> +    uint64_t madt_addr[2] = { 0, 0 };
> +    struct acpi_table_header *table;
> +    struct acpi_table_madt *madt;
> +    struct acpi_madt_io_apic *io_apic;
> +    struct acpi_madt_local_apic *local_apic;
> +    acpi_status status;
> +    int rc, i;
> +
> +    printk("** Setup ACPI tables **\n");
> +
> +    /* ZAP the HPET, SLIT, SRAT, MPST and PMTT tables. */
> +    acpi_zap_table_signature(ACPI_SIG_HPET);
> +    acpi_zap_table_signature(ACPI_SIG_SLIT);
> +    acpi_zap_table_signature(ACPI_SIG_SRAT);
> +    acpi_zap_table_signature(ACPI_SIG_MPST);
> +    acpi_zap_table_signature(ACPI_SIG_PMTT);
> +
> +    /* Map ACPI tables 1:1 */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        if ( d->arch.e820[i].type != E820_ACPI &&
> +             d->arch.e820[i].type != E820_NVS )
> +            continue;
> +
> +        pfn = PFN_DOWN(d->arch.e820[i].addr);
> +        nr_pages = DIV_ROUND_UP(d->arch.e820[i].size, PAGE_SIZE);
> +
> +        rc = modify_mmio_11(d, pfn, nr_pages, true);
> +        if ( rc )
> +        {
> +            printk(
> +                "Failed to map ACPI region %#lx - %#lx into Dom0 memory map\n",

Please keep the format string on the printk line.

> +                   pfn, pfn + nr_pages);
> +            return rc;
> +        }
> +    }
> +    /*
> +     * Since most memory maps provided by hardware are wrong, make sure each
> +     * top-level table is properly mapped into Dom0.
> +     */

Please be more specific here - wrong in which way? Not all ACPI tables
living in one of the two E820 type regions? But see also below.

In any event you need to be more careful here: Mapping ordinary RAM
1:1 into Dom0 can't end well. Also, once working with a cloned XSDT you
won't need to cover tables here which have got hidden from Dom0.

> +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
> +        nr_pages = DIV_ROUND_UP(acpi_gbl_root_table_list.tables[i].length,
> +                                PAGE_SIZE);
> +        rc = modify_mmio_11(d, pfn, nr_pages, true);
> +        if ( rc )
> +        {
> +            printk(
> +                "Failed to map ACPI region %#lx - %#lx into Dom0 memory map\n",
> +                   pfn, pfn + nr_pages);
> +            return rc;
> +        }
> +    }
> +
> +    /*
> +     * Special treatment for memory < 1MB:
> +     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).

Copy? What if some of it needs to get modified to interact with
firmware? I think you'd be better off mapping everything Xen
doesn't use into Dom0, and only mapping fresh RAM pages
over regions Xen does use (namely the trampoline).

> +     *  - Map any reserved regions as 1:1.

Why would reserved regions need such treatment only when below
1Mb?

> +    acpi_get_table_phys(ACPI_SIG_MADT, 0, &madt_addr[0], &size);
> +    if ( !madt_addr[0] )
> +    {
> +        printk("Unable to find ACPI MADT table\n");
> +        return -EINVAL;
> +    }
> +    if ( size > PAGE_SIZE )
> +    {
> +        printk("MADT table is bigger than PAGE_SIZE, aborting\n");
> +        return -EINVAL;
> +    }

This restriction for sure needs to go away.

> +    acpi_get_table_phys(ACPI_SIG_MADT, 2, &madt_addr[1], &size);

Why 2 (and not 1) when the first invocation used 0? But this is not
going to be needed anyway with the alternative model.

> +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
> +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
> +    io_apic->header.length = sizeof(*io_apic);
> +    io_apic->id = 1;
> +    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;

I think you need to make as many IO-APICs available to Dom0 as
there are hardware ones.

> +    if ( dom0_max_vcpus() > num_online_cpus() )
> +    {
> +        printk("CPU overcommit is not supported for Dom0\n");

???

> +        xfree(madt);

I don't think such cleanup is needed here - boot won't succeed
anyway.

Jan
Andrew Cooper Oct. 6, 2016, 3:48 p.m. UTC | #2
On 06/10/16 16:40, Jan Beulich wrote:
>>>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
>> FWIW, I think that the current approach that I've used in order to craft the
>> MADT is not the best one, IMHO it would be better to place the MADT at the
>> end of the E820_ACPI region (expanding it's size one page), and modify the
>> XSDT/RSDT in order to point to it, that way we avoid shadowing any other
>> ACPI data that might be at the same page as the native MADT (and that needs
>> to be modified by Dom0).
> I agree with the idea of placing MADT elsewhere, but I don't think you
> can rely on E820_ACPI to have room to grow into right after its end.
>
>> @@ -50,6 +53,8 @@ static long __initdata dom0_max_nrpages = LONG_MAX;
>>  #define HVM_VM86_TSS_SIZE   128
>>  
>>  static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
>> +static unsigned int __initdata acpi_intr_overrrides = 0;
>> +static struct acpi_madt_interrupt_override __initdata *intsrcovr = NULL;
> Pointless initializers.
>
>> +static void __init acpi_zap_table_signature(char *name)
>> +{
>> +    struct acpi_table_header *table;
>> +    acpi_status status;
>> +    union {
>> +        char str[ACPI_NAME_SIZE];
>> +        uint32_t bits;
>> +    } signature;
>> +    char tmp;
>> +    int i;
>> +
>> +    status = acpi_get_table(name, 0, &table);
>> +    if ( ACPI_SUCCESS(status) )
>> +    {
>> +        memcpy(&signature.str[0], &table->signature[0], ACPI_NAME_SIZE);
>> +        for ( i = 0; i < ACPI_NAME_SIZE / 2; i++ )
>> +        {
>> +            tmp = signature.str[ACPI_NAME_SIZE - i - 1];
>> +            signature.str[ACPI_NAME_SIZE - i - 1] = signature.str[i];
>> +            signature.str[i] = tmp;
>> +        }
>> +        write_atomic((uint32_t*)&table->signature[0], signature.bits);
>> +    }
>> +}
> Together with moving MADT elsewhere we should also move
> XSDT/RSDT, at which point we can simply avoid copying the
> pointers for tables we don't want Dom0 to see (and down the
> road we also have the option of adding tables).
>
> The downside to both approaches is that this once again is a
> black-listing model, i.e. new structure types we may need to
> also suppress will remain visible to Dom0 until a patched
> hypervisor would become available.

If we are providing a new XSDT/RSDT, we can have full control over the
tables dom0 sees.  Pointers to existing tables should only be entered
into the new XSDT/RSDT if Xen explicitly knows the table & version.

We will have to be diligent at keeping on top of new versions of the
ACPI spec, but everything like this should be whitelist based.  (This is
also the same model I trying to move the CPUID/MSR infrastructure towards).

~Andrew
Roger Pau Monné Oct. 12, 2016, 3:35 p.m. UTC | #3
On Thu, Oct 06, 2016 at 09:40:50AM -0600, Jan Beulich wrote:
> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
> > FWIW, I think that the current approach that I've used in order to craft the
> > MADT is not the best one, IMHO it would be better to place the MADT at the
> > end of the E820_ACPI region (expanding it's size one page), and modify the
> > XSDT/RSDT in order to point to it, that way we avoid shadowing any other
> > ACPI data that might be at the same page as the native MADT (and that needs
> > to be modified by Dom0).
> 
> I agree with the idea of placing MADT elsewhere, but I don't think you
> can rely on E820_ACPI to have room to grow into right after its end.

Right, I think picking some memory from a RAM region and marking it as 
E820_ACPI is the best approach.

> > @@ -50,6 +53,8 @@ static long __initdata dom0_max_nrpages = LONG_MAX;
> >  #define HVM_VM86_TSS_SIZE   128
> >  
> >  static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
> > +static unsigned int __initdata acpi_intr_overrrides = 0;
> > +static struct acpi_madt_interrupt_override __initdata *intsrcovr = NULL;
> 
> Pointless initializers.

Removed.

> > +static void __init acpi_zap_table_signature(char *name)
> > +{
> > +    struct acpi_table_header *table;
> > +    acpi_status status;
> > +    union {
> > +        char str[ACPI_NAME_SIZE];
> > +        uint32_t bits;
> > +    } signature;
> > +    char tmp;
> > +    int i;
> > +
> > +    status = acpi_get_table(name, 0, &table);
> > +    if ( ACPI_SUCCESS(status) )
> > +    {
> > +        memcpy(&signature.str[0], &table->signature[0], ACPI_NAME_SIZE);
> > +        for ( i = 0; i < ACPI_NAME_SIZE / 2; i++ )
> > +        {
> > +            tmp = signature.str[ACPI_NAME_SIZE - i - 1];
> > +            signature.str[ACPI_NAME_SIZE - i - 1] = signature.str[i];
> > +            signature.str[i] = tmp;
> > +        }
> > +        write_atomic((uint32_t*)&table->signature[0], signature.bits);
> > +    }
> > +}
> 
> Together with moving MADT elsewhere we should also move
> XSDT/RSDT, at which point we can simply avoid copying the
> pointers for tables we don't want Dom0 to see (and down the
> road we also have the option of adding tables).
> 
> The downside to both approaches is that this once again is a
> black-listing model, i.e. new structure types we may need to
> also suppress will remain visible to Dom0 until a patched
> hypervisor would become available.

Maybe we could do whitelisting instead? I can see that at least the 
following tables should be available to Dom0 if present, but TBH, it's hard 
to tell:

MADT, DSDT, FADT, SSDT, FACS, SBST, NFIT, MCFG, TCPA. Then ECDT, CPEP and 
RASF also seem fine to make available to Dom0, but I'm dubious.

But I agree that crafting a custom XSDT/RSDT is the best option here.
 
> > +                   pfn, pfn + nr_pages);
> > +            return rc;
> > +        }
> > +    }
> > +    /*
> > +     * Since most memory maps provided by hardware are wrong, make sure each
> > +     * top-level table is properly mapped into Dom0.
> > +     */
> 
> Please be more specific here - wrong in which way? Not all ACPI tables
> living in one of the two E820 type regions? But see also below.
> 
> In any event you need to be more careful here: Mapping ordinary RAM
> 1:1 into Dom0 can't end well. Also, once working with a cloned XSDT you
> won't need to cover tables here which have got hidden from Dom0.

I've found systems where some ACPI tables reside in memory holes or reserved 
regions. I don't think I've found a system where an ACPI table would reside 
in a RAM region, but I agree that it's worth adding a check here to make 
sure.

> > +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> > +    {
> > +        pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
> > +        nr_pages = DIV_ROUND_UP(acpi_gbl_root_table_list.tables[i].length,
> > +                                PAGE_SIZE);
> > +        rc = modify_mmio_11(d, pfn, nr_pages, true);
> > +        if ( rc )
> > +        {
> > +            printk(
> > +                "Failed to map ACPI region %#lx - %#lx into Dom0 memory map\n",
> > +                   pfn, pfn + nr_pages);
> > +            return rc;
> > +        }
> > +    }
> > +
> > +    /*
> > +     * Special treatment for memory < 1MB:
> > +     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).
> 
> Copy? What if some of it needs to get modified to interact with
> firmware? I think you'd be better off mapping everything Xen
> doesn't use into Dom0, and only mapping fresh RAM pages
> over regions Xen does use (namely the trampoline).

Hm, that was my first approach (mapping the whole first MB into Dom0), but 
sadly it doesn't seem to work because FreeBSD at least places the AP boot 
trampoline there, and since APs are launched in 16b mode, the emulator 
cannot handle executing memory from MMIO regions and crashes the domain. 
That's why I had to map and copy data from RAM regions below 1MB. The BDA 
it's all static data AFAICT, and the EBDA should reside in a reserved 
region (or at least does on my systems).

Might it be possible to solve this by identity mapping the first 1MB, and 
marking the RAM regions there as p2m_ram_rw? Or would that create even 
further problems?

> > +     *  - Map any reserved regions as 1:1.
> 
> Why would reserved regions need such treatment only when below
> 1Mb?

The video RAM for the VGA display needs to be mapped into Dom0, and that's 
in the reserved region below 1MB (usually starting at 0xA0000).

I would like to avoid mapping all the reserved regions into Dom0 by default 
(and the IOMMU), but this might be needed on some systems as a workaround 
(specially those not providing or with wrong RMRR tables).

> > +    acpi_get_table_phys(ACPI_SIG_MADT, 0, &madt_addr[0], &size);
> > +    if ( !madt_addr[0] )
> > +    {
> > +        printk("Unable to find ACPI MADT table\n");
> > +        return -EINVAL;
> > +    }
> > +    if ( size > PAGE_SIZE )
> > +    {
> > +        printk("MADT table is bigger than PAGE_SIZE, aborting\n");
> > +        return -EINVAL;
> > +    }
> 
> This restriction for sure needs to go away.

Sure.

> > +    acpi_get_table_phys(ACPI_SIG_MADT, 2, &madt_addr[1], &size);
> 
> Why 2 (and not 1) when the first invocation used 0? But this is not
> going to be needed anyway with the alternative model.
> 
> > +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
> > +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
> > +    io_apic->header.length = sizeof(*io_apic);
> > +    io_apic->id = 1;
> > +    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
> 
> I think you need to make as many IO-APICs available to Dom0 as
> there are hardware ones.

Right, I've been wondering about this, and then I need to expand the IO APIC 
emulation code so that Xen is able to emulate two IO-APICs.

Can I ask why do you think this is needed? If the number of pins in the 
multiple IO APIC case doesn't exceed 48 (which is what the emulated IO APIC 
provides), shouldn't this be enough?

> > +    if ( dom0_max_vcpus() > num_online_cpus() )
> > +    {
> > +        printk("CPU overcommit is not supported for Dom0\n");
> 
> ???

This is going away with the new MADT approach.

Thanks, Roger.
Jan Beulich Oct. 12, 2016, 3:55 p.m. UTC | #4
>>> On 12.10.16 at 17:35, <roger.pau@citrix.com> wrote:
> On Thu, Oct 06, 2016 at 09:40:50AM -0600, Jan Beulich wrote:
>> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
>> > +static void __init acpi_zap_table_signature(char *name)
>> > +{
>> > +    struct acpi_table_header *table;
>> > +    acpi_status status;
>> > +    union {
>> > +        char str[ACPI_NAME_SIZE];
>> > +        uint32_t bits;
>> > +    } signature;
>> > +    char tmp;
>> > +    int i;
>> > +
>> > +    status = acpi_get_table(name, 0, &table);
>> > +    if ( ACPI_SUCCESS(status) )
>> > +    {
>> > +        memcpy(&signature.str[0], &table->signature[0], ACPI_NAME_SIZE);
>> > +        for ( i = 0; i < ACPI_NAME_SIZE / 2; i++ )
>> > +        {
>> > +            tmp = signature.str[ACPI_NAME_SIZE - i - 1];
>> > +            signature.str[ACPI_NAME_SIZE - i - 1] = signature.str[i];
>> > +            signature.str[i] = tmp;
>> > +        }
>> > +        write_atomic((uint32_t*)&table->signature[0], signature.bits);
>> > +    }
>> > +}
>> 
>> Together with moving MADT elsewhere we should also move
>> XSDT/RSDT, at which point we can simply avoid copying the
>> pointers for tables we don't want Dom0 to see (and down the
>> road we also have the option of adding tables).
>> 
>> The downside to both approaches is that this once again is a
>> black-listing model, i.e. new structure types we may need to
>> also suppress will remain visible to Dom0 until a patched
>> hypervisor would become available.
> 
> Maybe we could do whitelisting instead? I can see that at least the 
> following tables should be available to Dom0 if present, but TBH, it's hard 
> to tell:

Taking an abstract perspective I agree with Andrew that we should
be whitelisting here. However, as you already see from the list you
provided (which afaict is far from complete wrt ACPI 6.1), this may
become cumbersome already initially, not to speak of down the road.

>> > +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
>> > +    {
>> > +        pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
>> > +        nr_pages = DIV_ROUND_UP(acpi_gbl_root_table_list.tables[i].length,
>> > +                                PAGE_SIZE);
>> > +        rc = modify_mmio_11(d, pfn, nr_pages, true);
>> > +        if ( rc )
>> > +        {
>> > +            printk(
>> > +                "Failed to map ACPI region %#lx - %#lx into Dom0 memory map\n",
>> > +                   pfn, pfn + nr_pages);
>> > +            return rc;
>> > +        }
>> > +    }
>> > +
>> > +    /*
>> > +     * Special treatment for memory < 1MB:
>> > +     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).
>> 
>> Copy? What if some of it needs to get modified to interact with
>> firmware? I think you'd be better off mapping everything Xen
>> doesn't use into Dom0, and only mapping fresh RAM pages
>> over regions Xen does use (namely the trampoline).
> 
> Hm, that was my first approach (mapping the whole first MB into Dom0), but 
> sadly it doesn't seem to work because FreeBSD at least places the AP boot 
> trampoline there, and since APs are launched in 16b mode, the emulator 
> cannot handle executing memory from MMIO regions and crashes the domain. 
> That's why I had to map and copy data from RAM regions below 1MB. The BDA 
> it's all static data AFAICT, and the EBDA should reside in a reserved 
> region (or at least does on my systems).

I'm afraid there are systems where the EBDA is not marked reserved.
But maybe there are no current (64-bit capable) ones of that sort.

> Might it be possible to solve this by identity mapping the first 1MB, and 
> marking the RAM regions there as p2m_ram_rw? Or would that create even 
> further problems?

Hmm, not sure - the range below 1Mb is marked as MMIO in
frame_table[], so there would be a (benign?) conflict at least there.

>> > +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
>> > +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
>> > +    io_apic->header.length = sizeof(*io_apic);
>> > +    io_apic->id = 1;
>> > +    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
>> 
>> I think you need to make as many IO-APICs available to Dom0 as
>> there are hardware ones.
> 
> Right, I've been wondering about this, and then I need to expand the IO APIC 
> emulation code so that Xen is able to emulate two IO-APICs.
> 
> Can I ask why do you think this is needed? If the number of pins in the 
> multiple IO APIC case doesn't exceed 48 (which is what the emulated IO APIC 
> provides), shouldn't this be enough?

The number of pins easily can be larger. And I think the mapping
code would end up simpler if there was a 1:1 relationship between
physical and virtual IO-APICs. I've just not checked one of my
larger (but older) systems - it has 5 IO-APICs with a total of 120 pins.

Jan
Roger Pau Monné Oct. 26, 2016, 11:35 a.m. UTC | #5
On Wed, Oct 12, 2016 at 09:55:44AM -0600, Jan Beulich wrote:
> >>> On 12.10.16 at 17:35, <roger.pau@citrix.com> wrote:
> > On Thu, Oct 06, 2016 at 09:40:50AM -0600, Jan Beulich wrote:
> >> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
> >> > +static void __init acpi_zap_table_signature(char *name)
> >> > +{
> >> > +    struct acpi_table_header *table;
> >> > +    acpi_status status;
> >> > +    union {
> >> > +        char str[ACPI_NAME_SIZE];
> >> > +        uint32_t bits;
> >> > +    } signature;
> >> > +    char tmp;
> >> > +    int i;
> >> > +
> >> > +    status = acpi_get_table(name, 0, &table);
> >> > +    if ( ACPI_SUCCESS(status) )
> >> > +    {
> >> > +        memcpy(&signature.str[0], &table->signature[0], ACPI_NAME_SIZE);
> >> > +        for ( i = 0; i < ACPI_NAME_SIZE / 2; i++ )
> >> > +        {
> >> > +            tmp = signature.str[ACPI_NAME_SIZE - i - 1];
> >> > +            signature.str[ACPI_NAME_SIZE - i - 1] = signature.str[i];
> >> > +            signature.str[i] = tmp;
> >> > +        }
> >> > +        write_atomic((uint32_t*)&table->signature[0], signature.bits);
> >> > +    }
> >> > +}
> >> 
> >> Together with moving MADT elsewhere we should also move
> >> XSDT/RSDT, at which point we can simply avoid copying the
> >> pointers for tables we don't want Dom0 to see (and down the
> >> road we also have the option of adding tables).
> >> 
> >> The downside to both approaches is that this once again is a
> >> black-listing model, i.e. new structure types we may need to
> >> also suppress will remain visible to Dom0 until a patched
> >> hypervisor would become available.
> > 
> > Maybe we could do whitelisting instead? I can see that at least the 
> > following tables should be available to Dom0 if present, but TBH, it's hard 
> > to tell:
> 
> Taking an abstract perspective I agree with Andrew that we should
> be whitelisting here. However, as you already see from the list you
> provided (which afaict is far from complete wrt ACPI 6.1), this may
> become cumbersome already initially, not to speak of down the road.

I've initially used a back-listing approach. We can always change this later 
on.

So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
from the guest memory map and marked as E820_ACPI, which means that the new 
RSDP no longer resides below 1MB, and that the Dom0 kernel _must_ use the 
rsdp_paddr provided in the start info, or else it's going to access the 
native RSDP.

> >> > +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> >> > +    {
> >> > +        pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
> >> > +        nr_pages = DIV_ROUND_UP(acpi_gbl_root_table_list.tables[i].length,
> >> > +                                PAGE_SIZE);
> >> > +        rc = modify_mmio_11(d, pfn, nr_pages, true);
> >> > +        if ( rc )
> >> > +        {
> >> > +            printk(
> >> > +                "Failed to map ACPI region %#lx - %#lx into Dom0 memory map\n",
> >> > +                   pfn, pfn + nr_pages);
> >> > +            return rc;
> >> > +        }
> >> > +    }
> >> > +
> >> > +    /*
> >> > +     * Special treatment for memory < 1MB:
> >> > +     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).
> >> 
> >> Copy? What if some of it needs to get modified to interact with
> >> firmware? I think you'd be better off mapping everything Xen
> >> doesn't use into Dom0, and only mapping fresh RAM pages
> >> over regions Xen does use (namely the trampoline).
> > 
> > Hm, that was my first approach (mapping the whole first MB into Dom0), but 
> > sadly it doesn't seem to work because FreeBSD at least places the AP boot 
> > trampoline there, and since APs are launched in 16b mode, the emulator 
> > cannot handle executing memory from MMIO regions and crashes the domain. 
> > That's why I had to map and copy data from RAM regions below 1MB. The BDA 
> > it's all static data AFAICT, and the EBDA should reside in a reserved 
> > region (or at least does on my systems).
> 
> I'm afraid there are systems where the EBDA is not marked reserved.
> But maybe there are no current (64-bit capable) ones of that sort.

Hm, I would say that we leave this as it is currently, and then we can 
always play more tricks later on if we found any of such systems.
 
> > Might it be possible to solve this by identity mapping the first 1MB, and 
> > marking the RAM regions there as p2m_ram_rw? Or would that create even 
> > further problems?
> 
> Hmm, not sure - the range below 1Mb is marked as MMIO in
> frame_table[], so there would be a (benign?) conflict at least there.

As said before, I would leave the current implementation and look into that 
option if needed.

> >> > +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
> >> > +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
> >> > +    io_apic->header.length = sizeof(*io_apic);
> >> > +    io_apic->id = 1;
> >> > +    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
> >> 
> >> I think you need to make as many IO-APICs available to Dom0 as
> >> there are hardware ones.
> > 
> > Right, I've been wondering about this, and then I need to expand the IO APIC 
> > emulation code so that Xen is able to emulate two IO-APICs.
> > 
> > Can I ask why do you think this is needed? If the number of pins in the 
> > multiple IO APIC case doesn't exceed 48 (which is what the emulated IO APIC 
> > provides), shouldn't this be enough?
> 
> The number of pins easily can be larger. And I think the mapping
> code would end up simpler if there was a 1:1 relationship between
> physical and virtual IO-APICs. I've just not checked one of my
> larger (but older) systems - it has 5 IO-APICs with a total of 120 pins.

Yes, I agree that having a 1:1 relation between physical and virtual IO 
APICs is the best option. I've added a warning printk if the native hardware 
has more than one IO APIC, and I plan to expand the current IO APIC 
emulation code when I'm done with this initial PVHv2 Dom0 implementation, so 
that it can support emulating multiple IO APICs with varying number of pins.

Roger.
Jan Beulich Oct. 26, 2016, 2:10 p.m. UTC | #6
>>> On 26.10.16 at 13:35, <roger.pau@citrix.com> wrote:
> On Wed, Oct 12, 2016 at 09:55:44AM -0600, Jan Beulich wrote:
>> Taking an abstract perspective I agree with Andrew that we should
>> be whitelisting here. However, as you already see from the list you
>> provided (which afaict is far from complete wrt ACPI 6.1), this may
>> become cumbersome already initially, not to speak of down the road.
> 
> I've initially used a back-listing approach. We can always change this later 
> on.
> 
> So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
> crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
> 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
> from the guest memory map and marked as E820_ACPI, which means that the new 
> RSDP no longer resides below 1MB, and that the Dom0 kernel _must_ use the 
> rsdp_paddr provided in the start info, or else it's going to access the 
> native RSDP.

Hmm, for the RSDP I'm not sure. It might be better if we put it at the
same spot where the host one is, mapping a RAM page there with a
copy of the respective host page data. Otoh your approach allows
Dom0 to still find the real tables if need be, which has both up and
down sides.

>> I'm afraid there are systems where the EBDA is not marked reserved.
>> But maybe there are no current (64-bit capable) ones of that sort.
> 
> Hm, I would say that we leave this as it is currently, and then we can 
> always play more tricks later on if we found any of such systems.

As long as the code is experimental, and there's a note to that effect
which can be easily found (and has to be gone for the code to become
non-experimental), I'm fine with that.

>> > Might it be possible to solve this by identity mapping the first 1MB, and 
>> > marking the RAM regions there as p2m_ram_rw? Or would that create even 
>> > further problems?
>> 
>> Hmm, not sure - the range below 1Mb is marked as MMIO in
>> frame_table[], so there would be a (benign?) conflict at least there.
> 
> As said before, I would leave the current implementation and look into that 
> option if needed.

Same thing here.

Jan
Roger Pau Monné Oct. 26, 2016, 3:08 p.m. UTC | #7
On Wed, Oct 26, 2016 at 08:10:50AM -0600, Jan Beulich wrote:
> >>> On 26.10.16 at 13:35, <roger.pau@citrix.com> wrote:
> > On Wed, Oct 12, 2016 at 09:55:44AM -0600, Jan Beulich wrote:
> >> Taking an abstract perspective I agree with Andrew that we should
> >> be whitelisting here. However, as you already see from the list you
> >> provided (which afaict is far from complete wrt ACPI 6.1), this may
> >> become cumbersome already initially, not to speak of down the road.
> > 
> > I've initially used a back-listing approach. We can always change this later 
> > on.
> > 
> > So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
> > crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
> > 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
> > from the guest memory map and marked as E820_ACPI, which means that the new 
> > RSDP no longer resides below 1MB, and that the Dom0 kernel _must_ use the 
> > rsdp_paddr provided in the start info, or else it's going to access the 
> > native RSDP.
> 
> Hmm, for the RSDP I'm not sure. It might be better if we put it at the
> same spot where the host one is, mapping a RAM page there with a
> copy of the respective host page data. Otoh your approach allows
> Dom0 to still find the real tables if need be, which has both up and
> down sides.

The problem with putting it at the same page is that AFAICT there's a big 
chance that other things (like EBDA or ROM) being at the same page, and 
we would be shadowing them by mapping a RAM page over it, even if the 
original data is copied. This hole area below 1MB is just a mess to deal 
with TBH.

> >> I'm afraid there are systems where the EBDA is not marked reserved.
> >> But maybe there are no current (64-bit capable) ones of that sort.
> > 
> > Hm, I would say that we leave this as it is currently, and then we can 
> > always play more tricks later on if we found any of such systems.
> 
> As long as the code is experimental, and there's a note to that effect
> which can be easily found (and has to be gone for the code to become
> non-experimental), I'm fine with that.
> 
> >> > Might it be possible to solve this by identity mapping the first 1MB, and 
> >> > marking the RAM regions there as p2m_ram_rw? Or would that create even 
> >> > further problems?
> >> 
> >> Hmm, not sure - the range below 1Mb is marked as MMIO in
> >> frame_table[], so there would be a (benign?) conflict at least there.
> > 
> > As said before, I would leave the current implementation and look into that 
> > option if needed.

Right, I've added to following note as a comment:

NB2: regions marked as RAM in the memory map are backed by RAM pages
in the p2m, and the original data is copied over. This is done because
at least FreeBSD places the AP boot trampoline in a RAM region found
below the first MB, and the real-mode emulator found in Xen cannot
deal with code that resides in guest pages marked as MMIO. This can
cause problems if the memory map is not correct, and for example the
EBDA or the video ROM region is marked as RAM.

Roger.
Jan Beulich Oct. 26, 2016, 3:16 p.m. UTC | #8
>>> On 26.10.16 at 17:08, <roger.pau@citrix.com> wrote:
> On Wed, Oct 26, 2016 at 08:10:50AM -0600, Jan Beulich wrote:
>> >>> On 26.10.16 at 13:35, <roger.pau@citrix.com> wrote:
>> > On Wed, Oct 12, 2016 at 09:55:44AM -0600, Jan Beulich wrote:
>> >> Taking an abstract perspective I agree with Andrew that we should
>> >> be whitelisting here. However, as you already see from the list you
>> >> provided (which afaict is far from complete wrt ACPI 6.1), this may
>> >> become cumbersome already initially, not to speak of down the road.
>> > 
>> > I've initially used a back-listing approach. We can always change this later 
>> > on.
>> > 
>> > So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
>> > crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
>> > 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
>> > from the guest memory map and marked as E820_ACPI, which means that the new 
>> > RSDP no longer resides below 1MB, and that the Dom0 kernel _must_ use the 
>> > rsdp_paddr provided in the start info, or else it's going to access the 
>> > native RSDP.
>> 
>> Hmm, for the RSDP I'm not sure. It might be better if we put it at the
>> same spot where the host one is, mapping a RAM page there with a
>> copy of the respective host page data. Otoh your approach allows
>> Dom0 to still find the real tables if need be, which has both up and
>> down sides.
> 
> The problem with putting it at the same page is that AFAICT there's a big 
> chance that other things (like EBDA or ROM) being at the same page, and 
> we would be shadowing them by mapping a RAM page over it, even if the 
> original data is copied. This hole area below 1MB is just a mess to deal 
> with TBH.

Unless it's page zero, what bad could such shadowing do?

Jan
Roger Pau Monné Oct. 26, 2016, 4:03 p.m. UTC | #9
On Wed, Oct 26, 2016 at 09:16:55AM -0600, Jan Beulich wrote:
> >>> On 26.10.16 at 17:08, <roger.pau@citrix.com> wrote:
> > On Wed, Oct 26, 2016 at 08:10:50AM -0600, Jan Beulich wrote:
> >> >>> On 26.10.16 at 13:35, <roger.pau@citrix.com> wrote:
> >> > On Wed, Oct 12, 2016 at 09:55:44AM -0600, Jan Beulich wrote:
> >> >> Taking an abstract perspective I agree with Andrew that we should
> >> >> be whitelisting here. However, as you already see from the list you
> >> >> provided (which afaict is far from complete wrt ACPI 6.1), this may
> >> >> become cumbersome already initially, not to speak of down the road.
> >> > 
> >> > I've initially used a back-listing approach. We can always change this later 
> >> > on.
> >> > 
> >> > So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
> >> > crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
> >> > 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
> >> > from the guest memory map and marked as E820_ACPI, which means that the new 
> >> > RSDP no longer resides below 1MB, and that the Dom0 kernel _must_ use the 
> >> > rsdp_paddr provided in the start info, or else it's going to access the 
> >> > native RSDP.
> >> 
> >> Hmm, for the RSDP I'm not sure. It might be better if we put it at the
> >> same spot where the host one is, mapping a RAM page there with a
> >> copy of the respective host page data. Otoh your approach allows
> >> Dom0 to still find the real tables if need be, which has both up and
> >> down sides.
> > 
> > The problem with putting it at the same page is that AFAICT there's a big 
> > chance that other things (like EBDA or ROM) being at the same page, and 
> > we would be shadowing them by mapping a RAM page over it, even if the 
> > original data is copied. This hole area below 1MB is just a mess to deal 
> > with TBH.
> 
> Unless it's page zero, what bad could such shadowing do?

You where the one that warmed me of this in 
http://marc.info/?l=xen-devel&m=147576851115855 that shadowing regions 
below 1MB could prevent the OS from properly interacting with the firmware, 
or at least this was my understanding. The RSDP is usually located in the 
same page as the EBDA, so we would be shadowing the EBDA area.

Roger.
Boris Ostrovsky Oct. 26, 2016, 5:14 p.m. UTC | #10
>>>> I've initially used a back-listing approach. We can always change this later 
>>>> on.
>>>>
>>>> So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
>>>> crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
>>>> 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
>>>> from the guest memory map and marked as E820_ACPI, which means that the new 
>>>> RSDP no longer resides below 1MB, 


As I mentioned in the other thread I am not sure this would be in
compliance with the ACPI spec.

-boris



>>>> and that the Dom0 kernel _must_ use the 
>>>> rsdp_paddr provided in the start info, or else it's going to access the 
>>>> native RSDP.
>>>
Jan Beulich Oct. 27, 2016, 7:25 a.m. UTC | #11
>>> On 26.10.16 at 18:03, <roger.pau@citrix.com> wrote:
> On Wed, Oct 26, 2016 at 09:16:55AM -0600, Jan Beulich wrote:
>> >>> On 26.10.16 at 17:08, <roger.pau@citrix.com> wrote:
>> > On Wed, Oct 26, 2016 at 08:10:50AM -0600, Jan Beulich wrote:
>> >> >>> On 26.10.16 at 13:35, <roger.pau@citrix.com> wrote:
>> >> > On Wed, Oct 12, 2016 at 09:55:44AM -0600, Jan Beulich wrote:
>> >> >> Taking an abstract perspective I agree with Andrew that we should
>> >> >> be whitelisting here. However, as you already see from the list you
>> >> >> provided (which afaict is far from complete wrt ACPI 6.1), this may
>> >> >> become cumbersome already initially, not to speak of down the road.
>> >> > 
>> >> > I've initially used a back-listing approach. We can always change this later 
>> >> > on.
>> >> > 
>> >> > So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
>> >> > crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
>> >> > 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
>> >> > from the guest memory map and marked as E820_ACPI, which means that the new 
>> >> > RSDP no longer resides below 1MB, and that the Dom0 kernel _must_ use the 
>> >> > rsdp_paddr provided in the start info, or else it's going to access the 
>> >> > native RSDP.
>> >> 
>> >> Hmm, for the RSDP I'm not sure. It might be better if we put it at the
>> >> same spot where the host one is, mapping a RAM page there with a
>> >> copy of the respective host page data. Otoh your approach allows
>> >> Dom0 to still find the real tables if need be, which has both up and
>> >> down sides.
>> > 
>> > The problem with putting it at the same page is that AFAICT there's a big 
>> > chance that other things (like EBDA or ROM) being at the same page, and 
>> > we would be shadowing them by mapping a RAM page over it, even if the 
>> > original data is copied. This hole area below 1MB is just a mess to deal 
>> > with TBH.
>> 
>> Unless it's page zero, what bad could such shadowing do?
> 
> You where the one that warmed me of this in 
> http://marc.info/?l=xen-devel&m=147576851115855 that shadowing regions 
> below 1MB could prevent the OS from properly interacting with the firmware, 
> or at least this was my understanding. The RSDP is usually located in the 
> same page as the EBDA, so we would be shadowing the EBDA area.

Oh, right, for a moment I did forget that the EBDA is a permissible
place for the RSDPTR to live in. Only mapping a page over a ROM
one (E0000...FFFFF) would be reasonable. So as said - there are
positive aspects to keeping the original pointer visible; the main
issue I foresee is that once again user mode tools will (unless made
Xen-aware) have a different view of the system than the kernel.

Jan
Jan Beulich Oct. 27, 2016, 7:27 a.m. UTC | #12
>>> On 26.10.16 at 19:14, <boris.ostrovsky@oracle.com> wrote:
>>>>> I've initially used a back-listing approach. We can always change this later 
>>>>> on.
>>>>>
>>>>> So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
>>>>> crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
>>>>> 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
>>>>> from the guest memory map and marked as E820_ACPI, which means that the new 
>>>>> RSDP no longer resides below 1MB, 
> 
> As I mentioned in the other thread I am not sure this would be in
> compliance with the ACPI spec.

While that's true, Roger had already pointed out that the normal
way of finding RSDPTR needs to be avoided anyway.

Jan
Roger Pau Monné Oct. 27, 2016, 11:08 a.m. UTC | #13
On Thu, Oct 27, 2016 at 01:25:40AM -0600, Jan Beulich wrote:
> >>> On 26.10.16 at 18:03, <roger.pau@citrix.com> wrote:
> > On Wed, Oct 26, 2016 at 09:16:55AM -0600, Jan Beulich wrote:
> >> >>> On 26.10.16 at 17:08, <roger.pau@citrix.com> wrote:
> >> > On Wed, Oct 26, 2016 at 08:10:50AM -0600, Jan Beulich wrote:
> >> >> >>> On 26.10.16 at 13:35, <roger.pau@citrix.com> wrote:
> >> >> > On Wed, Oct 12, 2016 at 09:55:44AM -0600, Jan Beulich wrote:
> >> >> >> Taking an abstract perspective I agree with Andrew that we should
> >> >> >> be whitelisting here. However, as you already see from the list you
> >> >> >> provided (which afaict is far from complete wrt ACPI 6.1), this may
> >> >> >> become cumbersome already initially, not to speak of down the road.
> >> >> > 
> >> >> > I've initially used a back-listing approach. We can always change this later 
> >> >> > on.
> >> >> > 
> >> >> > So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
> >> >> > crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
> >> >> > 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
> >> >> > from the guest memory map and marked as E820_ACPI, which means that the new 
> >> >> > RSDP no longer resides below 1MB, and that the Dom0 kernel _must_ use the 
> >> >> > rsdp_paddr provided in the start info, or else it's going to access the 
> >> >> > native RSDP.
> >> >> 
> >> >> Hmm, for the RSDP I'm not sure. It might be better if we put it at the
> >> >> same spot where the host one is, mapping a RAM page there with a
> >> >> copy of the respective host page data. Otoh your approach allows
> >> >> Dom0 to still find the real tables if need be, which has both up and
> >> >> down sides.
> >> > 
> >> > The problem with putting it at the same page is that AFAICT there's a big 
> >> > chance that other things (like EBDA or ROM) being at the same page, and 
> >> > we would be shadowing them by mapping a RAM page over it, even if the 
> >> > original data is copied. This hole area below 1MB is just a mess to deal 
> >> > with TBH.
> >> 
> >> Unless it's page zero, what bad could such shadowing do?
> > 
> > You where the one that warmed me of this in 
> > http://marc.info/?l=xen-devel&m=147576851115855 that shadowing regions 
> > below 1MB could prevent the OS from properly interacting with the firmware, 
> > or at least this was my understanding. The RSDP is usually located in the 
> > same page as the EBDA, so we would be shadowing the EBDA area.
> 
> Oh, right, for a moment I did forget that the EBDA is a permissible
> place for the RSDPTR to live in. Only mapping a page over a ROM
> one (E0000...FFFFF) would be reasonable. So as said - there are
> positive aspects to keeping the original pointer visible; the main
> issue I foresee is that once again user mode tools will (unless made
> Xen-aware) have a different view of the system than the kernel.

Yes, that's certainly possible. On FreeBSD acpidump will try to fetch the 
address of the RSDP from sysctl, and that's going to be right (because it's 
set by the kernel based on the RSDP that's using). I guess Linux is using 
some similar functionality, or else tools like acpidump won't work on UEFI 
environments (where AFAIK the RSDP can be anywhere in memory).

Roger.
Roger Pau Monné Oct. 27, 2016, 11:13 a.m. UTC | #14
On Wed, Oct 26, 2016 at 01:14:16PM -0400, Boris Ostrovsky wrote:
> 
> >>>> I've initially used a back-listing approach. We can always change this later 
> >>>> on.
> >>>>
> >>>> So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
> >>>> crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
> >>>> 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
> >>>> from the guest memory map and marked as E820_ACPI, which means that the new 
> >>>> RSDP no longer resides below 1MB, 
> 
> 
> As I mentioned in the other thread I am not sure this would be in
> compliance with the ACPI spec.

Right, the ACPI spec mandates that the RSDP must reside in the first 1 KB of 
the EBDA, or in the ROM regions between 0xE0000 0xFFFFF, but that's only 
when booted from a legacy BIOS, when booted from UEFI the RSDP can reside 
anywhere in memory, and the pointer must be fetched from a UEFI specific 
table. I would consider Xen to be similar to UEFI boot in that regard.

Roger.
Jan Beulich Oct. 27, 2016, 11:25 a.m. UTC | #15
>>> On 27.10.16 at 13:13, <roger.pau@citrix.com> wrote:
> On Wed, Oct 26, 2016 at 01:14:16PM -0400, Boris Ostrovsky wrote:
>> >>>> I've initially used a back-listing approach. We can always change this later 
>> >>>> on.
>> >>>>
>> >>>> So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not 
>> >>>> crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address = 
>> >>>> 0 in the RSDP together with revision = 2). This is all placed in RAM stolen 
>> >>>> from the guest memory map and marked as E820_ACPI, which means that the new 
>> >>>> RSDP no longer resides below 1MB, 
>> 
>> As I mentioned in the other thread I am not sure this would be in
>> compliance with the ACPI spec.
> 
> Right, the ACPI spec mandates that the RSDP must reside in the first 1 KB of 
> the EBDA, or in the ROM regions between 0xE0000 0xFFFFF, but that's only 
> when booted from a legacy BIOS, when booted from UEFI the RSDP can reside 
> anywhere in memory, and the pointer must be fetched from a UEFI specific 
> table. I would consider Xen to be similar to UEFI boot in that regard.

+1

Jan
Boris Ostrovsky Oct. 27, 2016, 1:51 p.m. UTC | #16
On 10/27/2016 07:13 AM, Roger Pau Monne wrote:
> On Wed, Oct 26, 2016 at 01:14:16PM -0400, Boris Ostrovsky wrote:
>>
>>>>>> I've initially used a back-listing approach. We can always change this later
>>>>>> on.
>>>>>>
>>>>>> So I've ended up crafting a new MADT, XSDT and RSDP. Note that I'm not
>>>>>> crafting a new custom RSDT (and in fact I'm setting rsdt_physical_address =
>>>>>> 0 in the RSDP together with revision = 2). This is all placed in RAM stolen
>>>>>> from the guest memory map and marked as E820_ACPI, which means that the new
>>>>>> RSDP no longer resides below 1MB,
>>
>>
>> As I mentioned in the other thread I am not sure this would be in
>> compliance with the ACPI spec.
>
> Right, the ACPI spec mandates that the RSDP must reside in the first 1 KB of
> the EBDA, or in the ROM regions between 0xE0000 0xFFFFF, but that's only
> when booted from a legacy BIOS, when booted from UEFI the RSDP can reside
> anywhere in memory, and the pointer must be fetched from a UEFI specific
> table. I would consider Xen to be similar to UEFI boot in that regard.


It is similar but not a true UEFI boot so we are violating the spec. 
Which, for example, means that standard Linux ACPI root discovery won't 
work.

I re-read this thread and I am not sure I understand why we can't keep 
host's RSDP descriptor. We are not mapping dom0 memory 1:1 (right?) so 
we can place our versions of RSDT/XSDT at the address that the 
descriptor points to.

Unless that address is beyond dom0 memory allocation so that could be a 
problem I guess.

-boris
Jan Beulich Oct. 27, 2016, 2:02 p.m. UTC | #17
>>> On 27.10.16 at 15:51, <boris.ostrovsky@oracle.com> wrote:
> I re-read this thread and I am not sure I understand why we can't keep 
> host's RSDP descriptor. We are not mapping dom0 memory 1:1 (right?) so 
> we can place our versions of RSDT/XSDT at the address that the 
> descriptor points to.
> 
> Unless that address is beyond dom0 memory allocation so that could be a 
> problem I guess.

Also eventually we may want to add tables, not just remove some,
and then there may not be enough space at the original location.
Plus I think nothing precludes the XSDT living below 1Mb, and then
we're back into the same problems discussed in another branch of
this thread.

Jan
Boris Ostrovsky Oct. 27, 2016, 2:15 p.m. UTC | #18
On 10/27/2016 10:02 AM, Jan Beulich wrote:
>>>> On 27.10.16 at 15:51, <boris.ostrovsky@oracle.com> wrote:
>> I re-read this thread and I am not sure I understand why we can't keep
>> host's RSDP descriptor. We are not mapping dom0 memory 1:1 (right?) so
>> we can place our versions of RSDT/XSDT at the address that the
>> descriptor points to.
>>
>> Unless that address is beyond dom0 memory allocation so that could be a
>> problem I guess.
>
> Also eventually we may want to add tables, not just remove some,
> and then there may not be enough space at the original location.
> Plus I think nothing precludes the XSDT living below 1Mb, and then
> we're back into the same problems discussed in another branch of
> this thread.

Yes, that is a problem.

I guess we will need to do something in Linux to override descriptor 
search. There is already some cruft for this but it appears to be 
kexec-specific.


-boris
Jan Beulich Oct. 27, 2016, 2:30 p.m. UTC | #19
>>> On 27.10.16 at 16:15, <boris.ostrovsky@oracle.com> wrote:
> On 10/27/2016 10:02 AM, Jan Beulich wrote:
>>>>> On 27.10.16 at 15:51, <boris.ostrovsky@oracle.com> wrote:
>>> I re-read this thread and I am not sure I understand why we can't keep
>>> host's RSDP descriptor. We are not mapping dom0 memory 1:1 (right?) so
>>> we can place our versions of RSDT/XSDT at the address that the
>>> descriptor points to.
>>>
>>> Unless that address is beyond dom0 memory allocation so that could be a
>>> problem I guess.
>>
>> Also eventually we may want to add tables, not just remove some,
>> and then there may not be enough space at the original location.
>> Plus I think nothing precludes the XSDT living below 1Mb, and then
>> we're back into the same problems discussed in another branch of
>> this thread.
> 
> Yes, that is a problem.
> 
> I guess we will need to do something in Linux to override descriptor 
> search. There is already some cruft for this but it appears to be 
> kexec-specific.

As Roger pointed out - there should be kexec-independent logic
to avoid that lookup on EFI systems (by finding the pointer earlier
another way).

Jan
Boris Ostrovsky Oct. 27, 2016, 2:40 p.m. UTC | #20
On 10/27/2016 10:30 AM, Jan Beulich wrote:
>>>> On 27.10.16 at 16:15, <boris.ostrovsky@oracle.com> wrote:
>> On 10/27/2016 10:02 AM, Jan Beulich wrote:
>>>>>> On 27.10.16 at 15:51, <boris.ostrovsky@oracle.com> wrote:
>>>> I re-read this thread and I am not sure I understand why we can't keep
>>>> host's RSDP descriptor. We are not mapping dom0 memory 1:1 (right?) so
>>>> we can place our versions of RSDT/XSDT at the address that the
>>>> descriptor points to.
>>>>
>>>> Unless that address is beyond dom0 memory allocation so that could be a
>>>> problem I guess.
>>>
>>> Also eventually we may want to add tables, not just remove some,
>>> and then there may not be enough space at the original location.
>>> Plus I think nothing precludes the XSDT living below 1Mb, and then
>>> we're back into the same problems discussed in another branch of
>>> this thread.
>>
>> Yes, that is a problem.
>>
>> I guess we will need to do something in Linux to override descriptor
>> search. There is already some cruft for this but it appears to be
>> kexec-specific.
>
> As Roger pointed out - there should be kexec-independent logic
> to avoid that lookup on EFI systems (by finding the pointer earlier
> another way).


Yes, that's exactly what Linux does (acpi_os_get_root_pointer()) but 
that would imply that we are running on a UEFI system, which we are not. 
And trying to fake just this feature may cause other components to get 
confused.


-boris
Roger Pau Monné Oct. 27, 2016, 3:04 p.m. UTC | #21
On Thu, Oct 27, 2016 at 10:40:52AM -0400, Boris Ostrovsky wrote:
> 
> 
> On 10/27/2016 10:30 AM, Jan Beulich wrote:
> > > > > On 27.10.16 at 16:15, <boris.ostrovsky@oracle.com> wrote:
> > > On 10/27/2016 10:02 AM, Jan Beulich wrote:
> > > > > > > On 27.10.16 at 15:51, <boris.ostrovsky@oracle.com> wrote:
> > > > > I re-read this thread and I am not sure I understand why we can't keep
> > > > > host's RSDP descriptor. We are not mapping dom0 memory 1:1 (right?) so
> > > > > we can place our versions of RSDT/XSDT at the address that the
> > > > > descriptor points to.
> > > > > 
> > > > > Unless that address is beyond dom0 memory allocation so that could be a
> > > > > problem I guess.
> > > > 
> > > > Also eventually we may want to add tables, not just remove some,
> > > > and then there may not be enough space at the original location.
> > > > Plus I think nothing precludes the XSDT living below 1Mb, and then
> > > > we're back into the same problems discussed in another branch of
> > > > this thread.
> > > 
> > > Yes, that is a problem.
> > > 
> > > I guess we will need to do something in Linux to override descriptor
> > > search. There is already some cruft for this but it appears to be
> > > kexec-specific.
> > 
> > As Roger pointed out - there should be kexec-independent logic
> > to avoid that lookup on EFI systems (by finding the pointer earlier
> > another way).
> 
> 
> Yes, that's exactly what Linux does (acpi_os_get_root_pointer()) but that
> would imply that we are running on a UEFI system, which we are not. And
> trying to fake just this feature may cause other components to get confused.

TBH, I would just make acpi_rsdp not dependent on KEXEC, but is this going 
to be propagated to user-space tools somehow (ie: so that acpidump will 
report the right tables)?

I prefer this solution because it seems simpler and less likely to cause 
future issues, but if this seems like too much fuss I can always try to 
place the RSDP on top of the original one and shadow that page. From the 
information I've been able to find on the Internet, the EBDA just seems to 
contain the position of the pointing device (PS/2 mouse) [0], which I doubt 
is used by any modern OS.

Roger.

[0] http://www.msc-technologies.eu/fileadmin/documentpool/Support-Center/Archive/70-PISA/PISA-P/02-Manual/BIOS%20Programmer's%20Guide%20v10.pdf?file=&ref=
Jan Beulich Oct. 27, 2016, 3:20 p.m. UTC | #22
>>> On 27.10.16 at 17:04, <roger.pau@citrix.com> wrote:
> I prefer this solution because it seems simpler and less likely to cause 
> future issues, but if this seems like too much fuss I can always try to 
> place the RSDP on top of the original one and shadow that page. From the 
> information I've been able to find on the Internet, the EBDA just seems to 
> contain the position of the pointing device (PS/2 mouse) [0], which I doubt 
> is used by any modern OS.

Well, that's the (half way) documented aspect of what it gets used
for. As far as I recall from my DOS support / engineering days,
there's quite a bit more use of it without being documented (among
other things for remote boot purposes). And then I'm surprised you
found only that part documented. My primary source of information
back in those days (Ralph Brown's "Interrupt List") tells me:

Format of Extended BIOS Data Area (IBM):
Offset	Size	Description	(Table M0001)
 00h	BYTE	length of EBDA in kilobytes
 01h 15 BYTEs	reserved
 17h	BYTE	number of entries in POST error log (0-5)
 18h  5 WORDs	POST error log (each word is a POST error number)
 22h	DWORD	Pointing Device Driver entry point
 26h	BYTE	Pointing Device Flags 1 (see #M0002)
 27h	BYTE	Pointing Device Flags 2 (see #M0003)
 28h  8 BYTEs	Pointing Device Auxiliary Device Data
 30h	DWORD	Vector for INT 07h stored here during 80387 interrupt
 34h	DWORD	Vector for INT 01h stored here during INT 07h emulation
 38h	BYTE	Scratchpad for 80287/80387 interrupt code
 39h	WORD	Timer3: Watchdog timer initial count
 3Bh	BYTE	??? seen non-zero on Model 30
 3Ch	BYTE	???
 3Dh 16 BYTEs	Fixed Disk parameter table for drive 0 (for older machines
		  which don't directly support the installed drive)
 4Dh 16 BYTEs	Fixed Disk parameter table for drive 1 (for older machines
		  which don't directly support the installed drive)
 5Dh-67h	???
 68h	BYTE	cache control
		bits 7-2 unused (0)
		bit 1: CPU cache failed test
		bit 0: CPU cache disabled
 69h-6Bh	???
 6Ch	BYTE	Fixed disk: (=FFh on ESDI systems)
		    bits 7-4: Channel number 00-0Fh
		    bits 3-0: DMA arbitration level 00-0Eh
 6Dh	BYTE	???
 6Eh	WORD	current typematic setting (see INT 16/AH=03h)
 70h	BYTE	number of attached hard drives
 71h	BYTE	hard disk 16-bit DMA channel
 72h	BYTE	interrupt status for hard disk controller (1Fh on timeout)
 73h	BYTE	hard disk operation flags
		bit 7: controller issued operation-complete INT 76h
		bit 6: controller has been reset
		bits 5-0: unused (0)
 74h	DWORD	old INT 76h vector
 78h	BYTE	hard disk DMA type
		typically 44h for reads and 4Ch for writes
 79h	BYTE	status of last hard disk operation
 7Ah	BYTE	hard disk timeout counter
 7Bh-7Dh
 7Eh  8 WORDs	storage for hard disk controller status
 8Eh-E6h
 E7h	BYTE	floppy drive type
		bit 7: drive(s) present
		bits 6-2: unused (0)
		bit 1: drive 1 is 5.25" instead of 3.5"
		bit 0: drive 0 is 5.25"
 E8h  4 BYTEs	???
 ECh	BYTE	hard disk parameters flag
		bit 7: parameters loaded into EBDA
		bits 6-0: unused (0)
 EDh	BYTE	???
 EEh	BYTE	CPU family ID (03h = 386, 04h = 486, etc.) (see INT 15/AH=C9h)
 EFh	BYTE	CPU stepping (see INT 15/AH=C9h)
 F0h 39 BYTEs	???
117h	WORD	keyboard ID (see INT 16/AH=0Ah)
		(most commonly 41ABh)
119h	BYTE	???
11Ah	BYTE	non-BIOS INT 18h flag
		bits 7-1: unused (0)
		bit 0: set by BIOS before calling user INT 18h at offset 11Dh
11Bh  2 BYTE	???
11Dh	DWORD	user INT 18h vector if BIOS has re-hooked INT 18h
121h and up:	??? seen non-zero on Model 60
3F0h	BYTE	Fixed disk buffer (???)

Many question marks there ... There's a second layout documented
in there, too, but in the end the actual layout is of no interest to us.

Jan
Roger Pau Monné Oct. 27, 2016, 3:37 p.m. UTC | #23
On Thu, Oct 27, 2016 at 09:20:00AM -0600, Jan Beulich wrote:
> >>> On 27.10.16 at 17:04, <roger.pau@citrix.com> wrote:
> > I prefer this solution because it seems simpler and less likely to cause 
> > future issues, but if this seems like too much fuss I can always try to 
> > place the RSDP on top of the original one and shadow that page. From the 
> > information I've been able to find on the Internet, the EBDA just seems to 
> > contain the position of the pointing device (PS/2 mouse) [0], which I doubt 
> > is used by any modern OS.
> 
> Well, that's the (half way) documented aspect of what it gets used
> for. As far as I recall from my DOS support / engineering days,
> there's quite a bit more use of it without being documented (among
> other things for remote boot purposes). And then I'm surprised you
> found only that part documented. My primary source of information
> back in those days (Ralph Brown's "Interrupt List") tells me:
> 
> Format of Extended BIOS Data Area (IBM):
> Offset	Size	Description	(Table M0001)
>  00h	BYTE	length of EBDA in kilobytes
>  01h 15 BYTEs	reserved
>  17h	BYTE	number of entries in POST error log (0-5)
>  18h  5 WORDs	POST error log (each word is a POST error number)
>  22h	DWORD	Pointing Device Driver entry point
>  26h	BYTE	Pointing Device Flags 1 (see #M0002)
>  27h	BYTE	Pointing Device Flags 2 (see #M0003)
>  28h  8 BYTEs	Pointing Device Auxiliary Device Data
>  30h	DWORD	Vector for INT 07h stored here during 80387 interrupt
>  34h	DWORD	Vector for INT 01h stored here during INT 07h emulation
>  38h	BYTE	Scratchpad for 80287/80387 interrupt code
>  39h	WORD	Timer3: Watchdog timer initial count
>  3Bh	BYTE	??? seen non-zero on Model 30
>  3Ch	BYTE	???
>  3Dh 16 BYTEs	Fixed Disk parameter table for drive 0 (for older machines
> 		  which don't directly support the installed drive)
>  4Dh 16 BYTEs	Fixed Disk parameter table for drive 1 (for older machines
> 		  which don't directly support the installed drive)
>  5Dh-67h	???
>  68h	BYTE	cache control
> 		bits 7-2 unused (0)
> 		bit 1: CPU cache failed test
> 		bit 0: CPU cache disabled
>  69h-6Bh	???
>  6Ch	BYTE	Fixed disk: (=FFh on ESDI systems)
> 		    bits 7-4: Channel number 00-0Fh
> 		    bits 3-0: DMA arbitration level 00-0Eh
>  6Dh	BYTE	???
>  6Eh	WORD	current typematic setting (see INT 16/AH=03h)
>  70h	BYTE	number of attached hard drives
>  71h	BYTE	hard disk 16-bit DMA channel
>  72h	BYTE	interrupt status for hard disk controller (1Fh on timeout)
>  73h	BYTE	hard disk operation flags
> 		bit 7: controller issued operation-complete INT 76h
> 		bit 6: controller has been reset
> 		bits 5-0: unused (0)
>  74h	DWORD	old INT 76h vector
>  78h	BYTE	hard disk DMA type
> 		typically 44h for reads and 4Ch for writes
>  79h	BYTE	status of last hard disk operation
>  7Ah	BYTE	hard disk timeout counter
>  7Bh-7Dh
>  7Eh  8 WORDs	storage for hard disk controller status
>  8Eh-E6h
>  E7h	BYTE	floppy drive type
> 		bit 7: drive(s) present
> 		bits 6-2: unused (0)
> 		bit 1: drive 1 is 5.25" instead of 3.5"
> 		bit 0: drive 0 is 5.25"
>  E8h  4 BYTEs	???
>  ECh	BYTE	hard disk parameters flag
> 		bit 7: parameters loaded into EBDA
> 		bits 6-0: unused (0)
>  EDh	BYTE	???
>  EEh	BYTE	CPU family ID (03h = 386, 04h = 486, etc.) (see INT 15/AH=C9h)
>  EFh	BYTE	CPU stepping (see INT 15/AH=C9h)
>  F0h 39 BYTEs	???
> 117h	WORD	keyboard ID (see INT 16/AH=0Ah)
> 		(most commonly 41ABh)
> 119h	BYTE	???
> 11Ah	BYTE	non-BIOS INT 18h flag
> 		bits 7-1: unused (0)
> 		bit 0: set by BIOS before calling user INT 18h at offset 11Dh
> 11Bh  2 BYTE	???
> 11Dh	DWORD	user INT 18h vector if BIOS has re-hooked INT 18h
> 121h and up:	??? seen non-zero on Model 60
> 3F0h	BYTE	Fixed disk buffer (???)
> 
> Many question marks there ... There's a second layout documented
> in there, too, but in the end the actual layout is of no interest to us.

Thanks for the info, seeing that now I think we have no other option but to 
place the RSDP in another area.

Roger.
Boris Ostrovsky Oct. 28, 2016, 1:51 p.m. UTC | #24
On 10/27/2016 11:04 AM, Roger Pau Monne wrote:
> On Thu, Oct 27, 2016 at 10:40:52AM -0400, Boris Ostrovsky wrote:
>>
>> On 10/27/2016 10:30 AM, Jan Beulich wrote:
>>>>>> On 27.10.16 at 16:15, <boris.ostrovsky@oracle.com> wrote:
>>>> On 10/27/2016 10:02 AM, Jan Beulich wrote:
>>>>>>>> On 27.10.16 at 15:51, <boris.ostrovsky@oracle.com> wrote:
>>>>>> I re-read this thread and I am not sure I understand why we can't keep
>>>>>> host's RSDP descriptor. We are not mapping dom0 memory 1:1 (right?) so
>>>>>> we can place our versions of RSDT/XSDT at the address that the
>>>>>> descriptor points to.
>>>>>>
>>>>>> Unless that address is beyond dom0 memory allocation so that could be a
>>>>>> problem I guess.
>>>>> Also eventually we may want to add tables, not just remove some,
>>>>> and then there may not be enough space at the original location.
>>>>> Plus I think nothing precludes the XSDT living below 1Mb, and then
>>>>> we're back into the same problems discussed in another branch of
>>>>> this thread.
>>>> Yes, that is a problem.
>>>>
>>>> I guess we will need to do something in Linux to override descriptor
>>>> search. There is already some cruft for this but it appears to be
>>>> kexec-specific.
>>> As Roger pointed out - there should be kexec-independent logic
>>> to avoid that lookup on EFI systems (by finding the pointer earlier
>>> another way).
>>
>> Yes, that's exactly what Linux does (acpi_os_get_root_pointer()) but that
>> would imply that we are running on a UEFI system, which we are not. And
>> trying to fake just this feature may cause other components to get confused.
> TBH, I would just make acpi_rsdp not dependent on KEXEC, but is this going 
> to be propagated to user-space tools somehow (ie: so that acpidump will 
> report the right tables)?

(Sorry, I forgot to respond)

If a tool is reading /sys/firmware/acpi/tables then I think it will see
what the kernel is using. acpidump, for example, *is* reading this
directory.

-boris

>
> I prefer this solution because it seems simpler and less likely to cause 
> future issues, but if this seems like too much fuss I can always try to 
> place the RSDP on top of the original one and shadow that page. From the 
> information I've been able to find on the Internet, the EBDA just seems to 
> contain the position of the pointing device (PS/2 mouse) [0], which I doubt 
> is used by any modern OS.
>
> Roger.
>
> [0] http://www.msc-technologies.eu/fileadmin/documentpool/Support-Center/Archive/70-PISA/PISA-P/02-Manual/BIOS%20Programmer's%20Guide%20v10.pdf?file=&ref=
diff mbox

Patch

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 8ea54ae..407f742 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -23,6 +23,7 @@ 
 #include <xen/libelf.h>
 #include <xen/pfn.h>
 #include <xen/guest_access.h>
+#include <xen/acpi.h>
 #include <asm/regs.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -38,6 +39,8 @@ 
 #include <asm/io_apic.h>
 #include <asm/hpet.h>
 
+#include <acpi/actables.h>
+
 #include <public/version.h>
 #include <public/arch-x86/hvm/start_info.h>
 #include <public/hvm/hvm_vcpu.h>
@@ -50,6 +53,8 @@  static long __initdata dom0_max_nrpages = LONG_MAX;
 #define HVM_VM86_TSS_SIZE   128
 
 static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
+static unsigned int __initdata acpi_intr_overrrides = 0;
+static struct acpi_madt_interrupt_override __initdata *intsrcovr = NULL;
 
 /*
  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
@@ -1999,6 +2004,7 @@  static int __init hvm_load_kernel(struct domain *d, const module_t *image,
     last_addr += sizeof(mod);
     start_info.magic = XEN_HVM_START_MAGIC_VALUE;
     start_info.flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
+    start_info.rsdp_paddr = acpi_os_get_root_pointer();
     rc = hvm_copy_to_guest_phys(last_addr, &start_info, sizeof(start_info));
     if ( rc != HVMCOPY_okay )
     {
@@ -2111,6 +2117,267 @@  static int __init hvm_setup_cpus(struct domain *d, paddr_t entry,
     return 0;
 }
 
+static int __init acpi_count_intr_ov(struct acpi_subtable_header *header,
+                                     const unsigned long end)
+{
+
+    acpi_intr_overrrides++;
+    return 0;
+}
+
+static int __init acpi_set_intr_ov(struct acpi_subtable_header *header,
+                                   const unsigned long end)
+{
+    struct acpi_madt_interrupt_override *intr =
+        container_of(header, struct acpi_madt_interrupt_override, header);
+
+    ACPI_MEMCPY(intsrcovr, intr, sizeof(*intr));
+    intsrcovr++;
+
+    return 0;
+}
+
+static void __init acpi_zap_table_signature(char *name)
+{
+    struct acpi_table_header *table;
+    acpi_status status;
+    union {
+        char str[ACPI_NAME_SIZE];
+        uint32_t bits;
+    } signature;
+    char tmp;
+    int i;
+
+    status = acpi_get_table(name, 0, &table);
+    if ( ACPI_SUCCESS(status) )
+    {
+        memcpy(&signature.str[0], &table->signature[0], ACPI_NAME_SIZE);
+        for ( i = 0; i < ACPI_NAME_SIZE / 2; i++ )
+        {
+            tmp = signature.str[ACPI_NAME_SIZE - i - 1];
+            signature.str[ACPI_NAME_SIZE - i - 1] = signature.str[i];
+            signature.str[i] = tmp;
+        }
+        write_atomic((uint32_t*)&table->signature[0], signature.bits);
+    }
+}
+
+static int __init hvm_setup_acpi(struct domain *d)
+{
+    struct vcpu *saved_current, *v = d->vcpu[0];
+    unsigned long pfn, nr_pages;
+    uint64_t size, start_addr, end_addr;
+    uint64_t madt_addr[2] = { 0, 0 };
+    struct acpi_table_header *table;
+    struct acpi_table_madt *madt;
+    struct acpi_madt_io_apic *io_apic;
+    struct acpi_madt_local_apic *local_apic;
+    acpi_status status;
+    int rc, i;
+
+    printk("** Setup ACPI tables **\n");
+
+    /* ZAP the HPET, SLIT, SRAT, MPST and PMTT tables. */
+    acpi_zap_table_signature(ACPI_SIG_HPET);
+    acpi_zap_table_signature(ACPI_SIG_SLIT);
+    acpi_zap_table_signature(ACPI_SIG_SRAT);
+    acpi_zap_table_signature(ACPI_SIG_MPST);
+    acpi_zap_table_signature(ACPI_SIG_PMTT);
+
+    /* Map ACPI tables 1:1 */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        if ( d->arch.e820[i].type != E820_ACPI &&
+             d->arch.e820[i].type != E820_NVS )
+            continue;
+
+        pfn = PFN_DOWN(d->arch.e820[i].addr);
+        nr_pages = DIV_ROUND_UP(d->arch.e820[i].size, PAGE_SIZE);
+
+        rc = modify_mmio_11(d, pfn, nr_pages, true);
+        if ( rc )
+        {
+            printk(
+                "Failed to map ACPI region %#lx - %#lx into Dom0 memory map\n",
+                   pfn, pfn + nr_pages);
+            return rc;
+        }
+    }
+    /*
+     * Since most memory maps provided by hardware are wrong, make sure each
+     * top-level table is properly mapped into Dom0.
+     */
+    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
+    {
+        pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
+        nr_pages = DIV_ROUND_UP(acpi_gbl_root_table_list.tables[i].length,
+                                PAGE_SIZE);
+        rc = modify_mmio_11(d, pfn, nr_pages, true);
+        if ( rc )
+        {
+            printk(
+                "Failed to map ACPI region %#lx - %#lx into Dom0 memory map\n",
+                   pfn, pfn + nr_pages);
+            return rc;
+        }
+    }
+
+    /*
+     * Special treatment for memory < 1MB:
+     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).
+     *  - Map any reserved regions as 1:1.
+     * NB: all this only makes sense if booted from legacy BIOSes.
+     */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        unsigned long end = d->arch.e820[i].addr + d->arch.e820[i].size;
+        end = end > MB(1) ? MB(1) : end;
+
+        if ( d->arch.e820[i].type == E820_RAM )
+        {
+            saved_current = current;
+            set_current(v);
+            rc = hvm_copy_to_guest_phys(d->arch.e820[i].addr,
+                                        maddr_to_virt(d->arch.e820[i].addr),
+                                        end - d->arch.e820[i].addr);
+            set_current(saved_current);
+            if ( rc != HVMCOPY_okay )
+            {
+                printk("Unable to copy RAM region %#lx - %#lx\n",
+                       d->arch.e820[i].addr, end);
+                return -EFAULT;
+            }
+        }
+        else if ( d->arch.e820[i].type == E820_RESERVED )
+        {
+            pfn = PFN_DOWN(d->arch.e820[i].addr);
+            nr_pages = DIV_ROUND_UP(end - d->arch.e820[i].addr, PAGE_SIZE);
+            rc = modify_mmio_11(d, pfn, nr_pages, true);
+            if ( rc )
+            {
+                printk("Unable to map reserved region at %#lx - %#lx: %d\n",
+                       d->arch.e820[i].addr, end, rc);
+                return rc;
+            }
+        }
+        if ( end == MB(1) )
+            break;
+     }
+
+    acpi_get_table_phys(ACPI_SIG_MADT, 0, &madt_addr[0], &size);
+    if ( !madt_addr[0] )
+    {
+        printk("Unable to find ACPI MADT table\n");
+        return -EINVAL;
+    }
+    if ( size > PAGE_SIZE )
+    {
+        printk("MADT table is bigger than PAGE_SIZE, aborting\n");
+        return -EINVAL;
+    }
+
+    acpi_get_table_phys(ACPI_SIG_MADT, 2, &madt_addr[1], &size);
+    if ( madt_addr[1] != 0 && madt_addr[1] != madt_addr[0] )
+    {
+        printk("Multiple MADT tables found, aborting\n");
+        return -EINVAL;
+    }
+
+    /*
+     * Populate the guest physical memory were MADT resides with empty RAM
+     * pages. This will remove the 1:1 mapping in this area, so that Xen
+     * can modify it without any side-effects.
+     */
+    start_addr = madt_addr[0] & PAGE_MASK;
+    end_addr = PAGE_ALIGN(madt_addr[0] + size);
+    hvm_populate_memory_range(d, start_addr, end_addr - start_addr);
+
+    /* Get the address where the MADT is currently mapped. */
+    status = acpi_get_table(ACPI_SIG_MADT, 0, &table);
+    if ( !ACPI_SUCCESS(status) )
+    {
+        printk("Failed to get MADT ACPI table, aborting.\n");
+        return -EINVAL;
+    }
+
+    /*
+     * Copy the original MADT table (and whatever is around it) to the
+     * guest physmap.
+     */
+    saved_current = current;
+    set_current(v);
+    rc = hvm_copy_to_guest_phys(start_addr,
+                                (void *)((uintptr_t)table & PAGE_MASK),
+                                end_addr - start_addr);
+    set_current(saved_current);
+    if ( rc != HVMCOPY_okay )
+    {
+        printk("Unable to copy original MADT page(s)\n");
+        return -EFAULT;
+    }
+
+    /* Craft a new MADT for the guest */
+
+    /* Count number of interrupt overrides. */
+    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_count_intr_ov,
+                          MAX_IRQ_SOURCES);
+    size = sizeof(struct acpi_table_madt);
+    size += sizeof(struct acpi_madt_interrupt_override) * acpi_intr_overrrides;
+    size += sizeof(struct acpi_madt_io_apic);
+    size += sizeof(struct acpi_madt_local_apic) * dom0_max_vcpus();
+
+    madt = xzalloc_bytes(size);
+    ACPI_MEMCPY(madt, table, sizeof(*madt));
+    madt->address = APIC_DEFAULT_PHYS_BASE;
+    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
+    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
+    io_apic->header.length = sizeof(*io_apic);
+    io_apic->id = 1;
+    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
+
+    if ( dom0_max_vcpus() > num_online_cpus() )
+    {
+        printk("CPU overcommit is not supported for Dom0\n");
+        xfree(madt);
+        return -EINVAL;
+    }
+
+    local_apic = (struct acpi_madt_local_apic *)(io_apic + 1);
+    for ( i = 0; i < dom0_max_vcpus(); i++ )
+    {
+        local_apic->header.type = ACPI_MADT_TYPE_LOCAL_APIC;
+        local_apic->header.length = sizeof(*local_apic);
+        local_apic->processor_id = i;
+        local_apic->id = i * 2;
+        local_apic->lapic_flags = ACPI_MADT_ENABLED;
+        local_apic++;
+    }
+
+    intsrcovr = (struct acpi_madt_interrupt_override *)local_apic;
+    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_set_intr_ov,
+                          MAX_IRQ_SOURCES);
+    ASSERT(((unsigned char *)intsrcovr - (unsigned char *)madt) == size);
+    madt->header.length = size;
+    madt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, madt),
+                                              madt->header.length);
+
+    /* Copy the new MADT table to the guest physmap. */
+    saved_current = current;
+    set_current(v);
+    rc = hvm_copy_to_guest_phys(madt_addr[0], madt, size);
+    set_current(saved_current);
+    if ( rc != HVMCOPY_okay )
+    {
+        printk("Unable to copy modified MADT page(s)\n");
+        xfree(madt);
+        return -EFAULT;
+    }
+
+    xfree(madt);
+
+    return 0;
+}
+
 static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
                                      unsigned long image_headroom,
                                      module_t *initrd,
@@ -2152,6 +2419,13 @@  static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
         return rc;
     }
 
+    rc = hvm_setup_acpi(d);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 ACPI tables: %d\n", rc);
+        return rc;
+    }
+
     return 0;
 }