diff mbox

[RFC,XEN,v3,05/39] x86/mm: exclude PMEM regions from initial frametable

Message ID 20170911043820.14617-6-haozhong.zhang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Haozhong Zhang Sept. 11, 2017, 4:37 a.m. UTC
No specification defines that PMEM regions cannot appear in margins
between RAM regions. If that does happen, init_frametable() will need
to allocate RAM for the part of frametable of PMEM regions. However,
PMEM regions can be very large (several terabytes or more), so
init_frametable() may fail.

Because Xen does not use PMEM at the boot time, we can defer the
actual resource allocation of frametable of PMEM regions. At the boot
time, all pages of frametable of PMEM regions appearing between RAM
regions are mapped one RAM page filled with 0xff.

Any attempt, whichs write to those frametable pages before the their
actual resource is allocated, implies bugs in Xen. Therefore, the
read-only mapping is used here to make those bugs explicit.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c         | 117 +++++++++++++++++++++++++++++++++++++++++-----
 xen/arch/x86/setup.c      |   4 ++
 xen/drivers/acpi/Makefile |   2 +
 xen/drivers/acpi/nfit.c   | 116 +++++++++++++++++++++++++++++++++++++++++++++
 xen/include/acpi/actbl1.h |  43 +++++++++++++++++
 xen/include/xen/acpi.h    |   7 +++
 6 files changed, 278 insertions(+), 11 deletions(-)
 create mode 100644 xen/drivers/acpi/nfit.c

Comments

Chao Peng Nov. 3, 2017, 5:58 a.m. UTC | #1
> +#ifdef CONFIG_NVDIMM_PMEM
> +static void __init init_frametable_pmem_chunk(unsigned long s,
> unsigned long e)
> +{
> +    static unsigned long pmem_init_frametable_mfn;
> +
> +    ASSERT(!((s | e) & (PAGE_SIZE - 1)));
> +
> +    if ( !pmem_init_frametable_mfn )
> +    {
> +        pmem_init_frametable_mfn = alloc_boot_pages(1, 1);
> +        if ( !pmem_init_frametable_mfn )
> +            panic("Not enough memory for pmem initial frame table
> page");
> +        memset(mfn_to_virt(pmem_init_frametable_mfn), -1, PAGE_SIZE);
> +    }

Can zero_page be used instead?

> +
> +    while ( s < e )
> +    {
> +        /*
> +         * The real frame table entries of a pmem region will be
> +         * created when the pmem region is registered to hypervisor.
> +         * Any write attempt to the initial entries of that pmem
> +         * region implies potential hypervisor bugs. In order to make
> +         * those bugs explicit, map those initial entries as read-
> only.
> +         */
> +        map_pages_to_xen(s, pmem_init_frametable_mfn, 1,
> PAGE_HYPERVISOR_RO);
> +        s += PAGE_SIZE;

Don't know how much the impact of 4K mapping on boot time when pmem is
very large. Perhaps we need get such data on hardware.

Another question is do we really need to map it, e.g. can we just skip
the range here?

Chao
Haozhong Zhang Nov. 3, 2017, 6:39 a.m. UTC | #2
On 11/03/17 13:58 +0800, Chao Peng wrote:
> 
> > +#ifdef CONFIG_NVDIMM_PMEM
> > +static void __init init_frametable_pmem_chunk(unsigned long s,
> > unsigned long e)
> > +{
> > +    static unsigned long pmem_init_frametable_mfn;
> > +
> > +    ASSERT(!((s | e) & (PAGE_SIZE - 1)));
> > +
> > +    if ( !pmem_init_frametable_mfn )
> > +    {
> > +        pmem_init_frametable_mfn = alloc_boot_pages(1, 1);
> > +        if ( !pmem_init_frametable_mfn )
> > +            panic("Not enough memory for pmem initial frame table
> > page");
> > +        memset(mfn_to_virt(pmem_init_frametable_mfn), -1, PAGE_SIZE);
> > +    }
> 
> Can zero_page be used instead?

No. I intend to make the frametable entries for NVDIMM as invalid at
boot time, in order to avoid/detect accidental accesses to NVDIMM
pages before they are registered to Xen hypervisor later (by part 2
patches 14 - 25).

> 
> > +
> > +    while ( s < e )
> > +    {
> > +        /*
> > +         * The real frame table entries of a pmem region will be
> > +         * created when the pmem region is registered to hypervisor.
> > +         * Any write attempt to the initial entries of that pmem
> > +         * region implies potential hypervisor bugs. In order to make
> > +         * those bugs explicit, map those initial entries as read-
> > only.
> > +         */
> > +        map_pages_to_xen(s, pmem_init_frametable_mfn, 1,
> > PAGE_HYPERVISOR_RO);
> > +        s += PAGE_SIZE;
> 
> Don't know how much the impact of 4K mapping on boot time when pmem is
> very large. Perhaps we need get such data on hardware.
>

Well, it will be very slow because the size of NVDIMM is usually very
large (e.g. from hundreds of giga-bytes to several tera-bytes). I can
make it to use huge page if possible.

> Another question is do we really need to map it, e.g. can we just skip
> the range here?

Sadly, I cannot remind why I did this. Maybe I can just leave the
frametable of NVDIMM unmapped and accidental access to them would just
trigger page fault in hypervisor, which can makes bugs explicit as well.


Haozhong
diff mbox

Patch

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index e5a029c9be..2fdf609805 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -83,6 +83,9 @@ 
  * an application-supplied buffer).
  */
 
+#ifdef CONFIG_NVDIMM_PMEM
+#include <xen/acpi.h>
+#endif
 #include <xen/init.h>
 #include <xen/kernel.h>
 #include <xen/lib.h>
@@ -196,31 +199,123 @@  static int __init parse_mmio_relax(const char *s)
 }
 custom_param("mmio-relax", parse_mmio_relax);
 
-static void __init init_frametable_chunk(void *start, void *end)
+static void __init init_frametable_ram_chunk(unsigned long s, unsigned long e)
 {
-    unsigned long s = (unsigned long)start;
-    unsigned long e = (unsigned long)end;
-    unsigned long step, mfn;
+    unsigned long cur, step, mfn;
 
-    ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
-    for ( ; s < e; s += step << PAGE_SHIFT )
+    for ( cur = s; cur < e; cur += step << PAGE_SHIFT )
     {
         step = 1UL << (cpu_has_page1gb &&
-                       !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
+                       !(cur & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
                        L3_PAGETABLE_SHIFT - PAGE_SHIFT :
                        L2_PAGETABLE_SHIFT - PAGE_SHIFT);
         /*
          * The hardcoded 4 below is arbitrary - just pick whatever you think
          * is reasonable to waste as a trade-off for using a large page.
          */
-        while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
+        while ( step && cur + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
             step >>= PAGETABLE_ORDER;
         mfn = alloc_boot_pages(step, step);
-        map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR);
+        map_pages_to_xen(cur, mfn, step, PAGE_HYPERVISOR);
     }
 
-    memset(start, 0, end - start);
-    memset(end, -1, s - e);
+    memset((void *)s, 0, e - s);
+    memset((void *)e, -1, cur - e);
+}
+
+#ifdef CONFIG_NVDIMM_PMEM
+static void __init init_frametable_pmem_chunk(unsigned long s, unsigned long e)
+{
+    static unsigned long pmem_init_frametable_mfn;
+
+    ASSERT(!((s | e) & (PAGE_SIZE - 1)));
+
+    if ( !pmem_init_frametable_mfn )
+    {
+        pmem_init_frametable_mfn = alloc_boot_pages(1, 1);
+        if ( !pmem_init_frametable_mfn )
+            panic("Not enough memory for pmem initial frame table page");
+        memset(mfn_to_virt(pmem_init_frametable_mfn), -1, PAGE_SIZE);
+    }
+
+    while ( s < e )
+    {
+        /*
+         * The real frame table entries of a pmem region will be
+         * created when the pmem region is registered to hypervisor.
+         * Any write attempt to the initial entries of that pmem
+         * region implies potential hypervisor bugs. In order to make
+         * those bugs explicit, map those initial entries as read-only.
+         */
+        map_pages_to_xen(s, pmem_init_frametable_mfn, 1, PAGE_HYPERVISOR_RO);
+        s += PAGE_SIZE;
+    }
+}
+#endif /* CONFIG_NVDIMM_PMEM */
+
+static void __init init_frametable_chunk(void *start, void *end)
+{
+    unsigned long s = (unsigned long)start;
+    unsigned long e = (unsigned long)end;
+#ifdef CONFIG_NVDIMM_PMEM
+    unsigned long pmem_smfn, pmem_emfn;
+    unsigned long pmem_spage = s, pmem_epage = s;
+    unsigned long pmem_page_aligned;
+    bool found = false;
+#endif /* CONFIG_NVDIMM_PMEM */
+
+    ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
+
+#ifndef CONFIG_NVDIMM_PMEM
+    init_frametable_ram_chunk(s, e);
+#else
+    while ( s < e )
+    {
+        /* No previous found pmem region overlaps with s ~ e. */
+        if ( s >= (pmem_epage & PAGE_MASK) )
+        {
+            found = acpi_nfit_boot_search_pmem(
+                mfn_x(page_to_mfn((struct page_info *)s)),
+                mfn_x(page_to_mfn((struct page_info *)e)),
+                &pmem_smfn, &pmem_emfn);
+            if ( found )
+            {
+                pmem_spage = (unsigned long)mfn_to_page(_mfn(pmem_smfn));
+                pmem_epage = (unsigned long)mfn_to_page(_mfn(pmem_emfn));
+            }
+        }
+
+        /* No pmem region found in s ~ e. */
+        if ( s >= (pmem_epage & PAGE_MASK) )
+        {
+            init_frametable_ram_chunk(s, e);
+            break;
+        }
+
+        if ( s < pmem_spage )
+        {
+            init_frametable_ram_chunk(s, pmem_spage);
+            pmem_page_aligned = (pmem_spage + PAGE_SIZE - 1) & PAGE_MASK;
+            if ( pmem_page_aligned > pmem_epage )
+                memset((void *)pmem_epage, -1, pmem_page_aligned - pmem_epage);
+            s = pmem_page_aligned;
+        }
+        else
+        {
+            pmem_page_aligned = pmem_epage & PAGE_MASK;
+            if ( pmem_page_aligned > s )
+                init_frametable_pmem_chunk(s, pmem_page_aligned);
+            if ( pmem_page_aligned < pmem_epage )
+            {
+                init_frametable_ram_chunk(pmem_page_aligned,
+                                          min(pmem_page_aligned + PAGE_SIZE, e));
+                memset((void *)pmem_page_aligned, -1,
+                       pmem_epage - pmem_page_aligned);
+            }
+            s = (pmem_epage + PAGE_SIZE - 1) & PAGE_MASK;
+        }
+    }
+#endif
 }
 
 void __init init_frametable(void)
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 3cbe305202..b9ebda8f4e 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1358,6 +1358,10 @@  void __init noreturn __start_xen(unsigned long mbi_p)
     BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
     BUILD_BUG_ON(MACH2PHYS_VIRT_END   != RO_MPT_VIRT_END);
 
+#ifdef CONFIG_NVDIMM_PMEM
+    acpi_nfit_boot_init();
+#endif
+
     init_frametable();
 
     if ( !acpi_boot_table_init_done )
diff --git a/xen/drivers/acpi/Makefile b/xen/drivers/acpi/Makefile
index 444b11d583..c8bb869cb8 100644
--- a/xen/drivers/acpi/Makefile
+++ b/xen/drivers/acpi/Makefile
@@ -9,3 +9,5 @@  obj-$(CONFIG_HAS_CPUFREQ) += pmstat.o
 
 obj-$(CONFIG_X86) += hwregs.o
 obj-$(CONFIG_X86) += reboot.o
+
+obj-$(CONFIG_NVDIMM_PMEM) += nfit.o
diff --git a/xen/drivers/acpi/nfit.c b/xen/drivers/acpi/nfit.c
new file mode 100644
index 0000000000..e099378ee0
--- /dev/null
+++ b/xen/drivers/acpi/nfit.c
@@ -0,0 +1,116 @@ 
+/*
+ * xen/drivers/acpi/nfit.c
+ *
+ * Copyright (C) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/acpi.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+
+/*
+ * GUID of a byte addressable persistent memory region
+ * (ref. ACPI 6.2, Section 5.2.25.2)
+ */
+static const uint8_t nfit_spa_pmem_guid[] =
+{
+    0x79, 0xd3, 0xf0, 0x66, 0xf3, 0xb4, 0x74, 0x40,
+    0xac, 0x43, 0x0d, 0x33, 0x18, 0xb7, 0x8c, 0xdb,
+};
+
+struct acpi_nfit_desc {
+    struct acpi_table_nfit *acpi_table;
+};
+
+static struct acpi_nfit_desc nfit_desc;
+
+void __init acpi_nfit_boot_init(void)
+{
+    acpi_status status;
+    acpi_physical_address nfit_addr;
+    acpi_native_uint nfit_len;
+
+    status = acpi_get_table_phys(ACPI_SIG_NFIT, 0, &nfit_addr, &nfit_len);
+    if ( ACPI_FAILURE(status) )
+        return;
+
+    nfit_desc.acpi_table = (struct acpi_table_nfit *)__va(nfit_addr);
+    map_pages_to_xen((unsigned long)nfit_desc.acpi_table, PFN_DOWN(nfit_addr),
+                     PFN_UP(nfit_addr + nfit_len) - PFN_DOWN(nfit_addr),
+                     PAGE_HYPERVISOR);
+}
+
+/**
+ * Search pmem regions overlapped with the specified address range.
+ *
+ * Parameters:
+ *  @smfn, @emfn: the start and end MFN of address range to search
+ *  @ret_smfn, @ret_emfn: return the address range of the first pmem region
+ *                        in above range
+ *
+ * Return:
+ *  Return true if a pmem region is overlapped with @smfn - @emfn. The
+ *  start and end MFN of the lowest pmem region are returned via
+ *  @ret_smfn and @ret_emfn respectively.
+ *
+ *  Return false if no pmem region is overlapped with @smfn - @emfn.
+ */
+bool __init acpi_nfit_boot_search_pmem(unsigned long smfn, unsigned long emfn,
+                                       unsigned long *ret_smfn,
+                                       unsigned long *ret_emfn)
+{
+    struct acpi_table_nfit *nfit_table = nfit_desc.acpi_table;
+    uint32_t hdr_offset = sizeof(*nfit_table);
+    unsigned long saddr = pfn_to_paddr(smfn), eaddr = pfn_to_paddr(emfn);
+    unsigned long ret_saddr = 0, ret_eaddr = 0;
+
+    if ( !nfit_table )
+        return false;
+
+    while ( hdr_offset < nfit_table->header.length )
+    {
+        struct acpi_nfit_header *hdr = (void *)nfit_table + hdr_offset;
+        struct acpi_nfit_system_address *spa;
+        unsigned long pmem_saddr, pmem_eaddr;
+
+        hdr_offset += hdr->length;
+
+        if ( hdr->type != ACPI_NFIT_TYPE_SYSTEM_ADDRESS )
+            continue;
+
+        spa = (struct acpi_nfit_system_address *)hdr;
+        if ( memcmp(spa->range_guid, nfit_spa_pmem_guid, 16) )
+            continue;
+
+        pmem_saddr = spa->address;
+        pmem_eaddr = pmem_saddr + spa->length;
+        if ( pmem_saddr >= eaddr || pmem_eaddr <= saddr )
+            continue;
+
+        if ( ret_saddr < pmem_saddr )
+            continue;
+        ret_saddr = pmem_saddr;
+        ret_eaddr = pmem_eaddr;
+    }
+
+    if ( ret_saddr == ret_eaddr )
+        return false;
+
+    *ret_smfn = paddr_to_pfn(ret_saddr);
+    *ret_emfn = paddr_to_pfn(ret_eaddr);
+
+    return true;
+}
diff --git a/xen/include/acpi/actbl1.h b/xen/include/acpi/actbl1.h
index e1991362dc..94d8d7775c 100644
--- a/xen/include/acpi/actbl1.h
+++ b/xen/include/acpi/actbl1.h
@@ -71,6 +71,7 @@ 
 #define ACPI_SIG_SBST           "SBST"	/* Smart Battery Specification Table */
 #define ACPI_SIG_SLIT           "SLIT"	/* System Locality Distance Information Table */
 #define ACPI_SIG_SRAT           "SRAT"	/* System Resource Affinity Table */
+#define ACPI_SIG_NFIT           "NFIT"	/* NVDIMM Firmware Interface Table */
 
 /*
  * All tables must be byte-packed to match the ACPI specification, since
@@ -903,6 +904,48 @@  struct acpi_msct_proximity {
 	u64 memory_capacity;	/* In bytes */
 };
 
+/*******************************************************************************
+ *
+ * NFIT - NVDIMM Interface Table (ACPI 6.0+)
+ *		  Version 1
+ *
+ ******************************************************************************/
+
+struct acpi_table_nfit {
+	struct acpi_table_header header;	/* Common ACPI table header */
+	u32 reserved;						/* Reserved, must be zero */
+};
+
+/* Subtable header for NFIT */
+
+struct acpi_nfit_header {
+	u16 type;
+	u16 length;
+};
+
+/* Values for subtable type in struct acpi_nfit_header */
+enum acpi_nfit_type {
+	ACPI_NFIT_TYPE_SYSTEM_ADDRESS = 0,
+	ACPI_NFIT_TYPE_MEMORY_MAP = 1,
+};
+
+/*
+ * NFIT Subtables
+ */
+
+/* 0: System Physical Address Range Structure */
+struct acpi_nfit_system_address {
+	struct acpi_nfit_header header;
+	u16 range_index;
+	u16 flags;
+	u32 reserved;		/* Reseved, must be zero */
+	u32 proximity_domain;
+	u8	range_guid[16];
+	u64 address;
+	u64 length;
+	u64 memory_mapping;
+};
+
 /*******************************************************************************
  *
  * SBST - Smart Battery Specification Table
diff --git a/xen/include/xen/acpi.h b/xen/include/xen/acpi.h
index 9409350f05..1bd8f9f4e4 100644
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -180,4 +180,11 @@  void acpi_reboot(void);
 void acpi_dmar_zap(void);
 void acpi_dmar_reinstate(void);
 
+#ifdef CONFIG_NVDIMM_PMEM
+void acpi_nfit_boot_init(void);
+bool acpi_nfit_boot_search_pmem(unsigned long smfn, unsigned long emfn,
+                                unsigned long *ret_smfn,
+                                unsigned long *ret_emfn);
+#endif /* CONFIG_NVDIMM_PMEM */
+
 #endif /*_LINUX_ACPI_H*/