@@ -83,6 +83,9 @@
* an application-supplied buffer).
*/
+#ifdef CONFIG_NVDIMM_PMEM
+#include <xen/acpi.h>
+#endif
#include <xen/init.h>
#include <xen/kernel.h>
#include <xen/lib.h>
@@ -196,31 +199,123 @@ static int __init parse_mmio_relax(const char *s)
}
custom_param("mmio-relax", parse_mmio_relax);
-static void __init init_frametable_chunk(void *start, void *end)
+static void __init init_frametable_ram_chunk(unsigned long s, unsigned long e)
{
- unsigned long s = (unsigned long)start;
- unsigned long e = (unsigned long)end;
- unsigned long step, mfn;
+ unsigned long cur, step, mfn;
- ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
- for ( ; s < e; s += step << PAGE_SHIFT )
+ for ( cur = s; cur < e; cur += step << PAGE_SHIFT )
{
step = 1UL << (cpu_has_page1gb &&
- !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
+ !(cur & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
L3_PAGETABLE_SHIFT - PAGE_SHIFT :
L2_PAGETABLE_SHIFT - PAGE_SHIFT);
/*
* The hardcoded 4 below is arbitrary - just pick whatever you think
* is reasonable to waste as a trade-off for using a large page.
*/
- while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
+ while ( step && cur + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
step >>= PAGETABLE_ORDER;
mfn = alloc_boot_pages(step, step);
- map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR);
+ map_pages_to_xen(cur, mfn, step, PAGE_HYPERVISOR);
}
- memset(start, 0, end - start);
- memset(end, -1, s - e);
+ memset((void *)s, 0, e - s);
+ memset((void *)e, -1, cur - e);
+}
+
+#ifdef CONFIG_NVDIMM_PMEM
+static void __init init_frametable_pmem_chunk(unsigned long s, unsigned long e)
+{
+ static unsigned long pmem_init_frametable_mfn;
+
+ ASSERT(!((s | e) & (PAGE_SIZE - 1)));
+
+ if ( !pmem_init_frametable_mfn )
+ {
+ pmem_init_frametable_mfn = alloc_boot_pages(1, 1);
+ if ( !pmem_init_frametable_mfn )
+ panic("Not enough memory for pmem initial frame table page");
+ memset(mfn_to_virt(pmem_init_frametable_mfn), -1, PAGE_SIZE);
+ }
+
+ while ( s < e )
+ {
+ /*
+ * The real frame table entries of a pmem region will be
+ * created when the pmem region is registered to hypervisor.
+ * Any write attempt to the initial entries of that pmem
+ * region implies potential hypervisor bugs. In order to make
+ * those bugs explicit, map those initial entries as read-only.
+ */
+ map_pages_to_xen(s, pmem_init_frametable_mfn, 1, PAGE_HYPERVISOR_RO);
+ s += PAGE_SIZE;
+ }
+}
+#endif /* CONFIG_NVDIMM_PMEM */
+
+static void __init init_frametable_chunk(void *start, void *end)
+{
+ unsigned long s = (unsigned long)start;
+ unsigned long e = (unsigned long)end;
+#ifdef CONFIG_NVDIMM_PMEM
+ unsigned long pmem_smfn, pmem_emfn;
+ unsigned long pmem_spage = s, pmem_epage = s;
+ unsigned long pmem_page_aligned;
+ bool found = false;
+#endif /* CONFIG_NVDIMM_PMEM */
+
+ ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
+
+#ifndef CONFIG_NVDIMM_PMEM
+ init_frametable_ram_chunk(s, e);
+#else
+ while ( s < e )
+ {
+ /* No previous found pmem region overlaps with s ~ e. */
+ if ( s >= (pmem_epage & PAGE_MASK) )
+ {
+ found = acpi_nfit_boot_search_pmem(
+ mfn_x(page_to_mfn((struct page_info *)s)),
+ mfn_x(page_to_mfn((struct page_info *)e)),
+ &pmem_smfn, &pmem_emfn);
+ if ( found )
+ {
+ pmem_spage = (unsigned long)mfn_to_page(_mfn(pmem_smfn));
+ pmem_epage = (unsigned long)mfn_to_page(_mfn(pmem_emfn));
+ }
+ }
+
+ /* No pmem region found in s ~ e. */
+ if ( s >= (pmem_epage & PAGE_MASK) )
+ {
+ init_frametable_ram_chunk(s, e);
+ break;
+ }
+
+ if ( s < pmem_spage )
+ {
+ init_frametable_ram_chunk(s, pmem_spage);
+ pmem_page_aligned = (pmem_spage + PAGE_SIZE - 1) & PAGE_MASK;
+ if ( pmem_page_aligned > pmem_epage )
+ memset((void *)pmem_epage, -1, pmem_page_aligned - pmem_epage);
+ s = pmem_page_aligned;
+ }
+ else
+ {
+ pmem_page_aligned = pmem_epage & PAGE_MASK;
+ if ( pmem_page_aligned > s )
+ init_frametable_pmem_chunk(s, pmem_page_aligned);
+ if ( pmem_page_aligned < pmem_epage )
+ {
+ init_frametable_ram_chunk(pmem_page_aligned,
+ min(pmem_page_aligned + PAGE_SIZE, e));
+ memset((void *)pmem_page_aligned, -1,
+ pmem_epage - pmem_page_aligned);
+ }
+ s = (pmem_epage + PAGE_SIZE - 1) & PAGE_MASK;
+ }
+ }
+#endif
}
void __init init_frametable(void)
@@ -1358,6 +1358,10 @@ void __init noreturn __start_xen(unsigned long mbi_p)
BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);
+#ifdef CONFIG_NVDIMM_PMEM
+ acpi_nfit_boot_init();
+#endif
+
init_frametable();
if ( !acpi_boot_table_init_done )
@@ -9,3 +9,5 @@ obj-$(CONFIG_HAS_CPUFREQ) += pmstat.o
obj-$(CONFIG_X86) += hwregs.o
obj-$(CONFIG_X86) += reboot.o
+
+obj-$(CONFIG_NVDIMM_PMEM) += nfit.o
new file mode 100644
@@ -0,0 +1,116 @@
+/*
+ * xen/drivers/acpi/nfit.c
+ *
+ * Copyright (C) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/acpi.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+
+/*
+ * GUID of a byte addressable persistent memory region
+ * (ref. ACPI 6.2, Section 5.2.25.2)
+ */
+static const uint8_t nfit_spa_pmem_guid[] =
+{
+ 0x79, 0xd3, 0xf0, 0x66, 0xf3, 0xb4, 0x74, 0x40,
+ 0xac, 0x43, 0x0d, 0x33, 0x18, 0xb7, 0x8c, 0xdb,
+};
+
+struct acpi_nfit_desc {
+ struct acpi_table_nfit *acpi_table;
+};
+
+static struct acpi_nfit_desc nfit_desc;
+
+void __init acpi_nfit_boot_init(void)
+{
+ acpi_status status;
+ acpi_physical_address nfit_addr;
+ acpi_native_uint nfit_len;
+
+ status = acpi_get_table_phys(ACPI_SIG_NFIT, 0, &nfit_addr, &nfit_len);
+ if ( ACPI_FAILURE(status) )
+ return;
+
+ nfit_desc.acpi_table = (struct acpi_table_nfit *)__va(nfit_addr);
+ map_pages_to_xen((unsigned long)nfit_desc.acpi_table, PFN_DOWN(nfit_addr),
+ PFN_UP(nfit_addr + nfit_len) - PFN_DOWN(nfit_addr),
+ PAGE_HYPERVISOR);
+}
+
+/**
+ * Search pmem regions overlapped with the specified address range.
+ *
+ * Parameters:
+ * @smfn, @emfn: the start and end MFN of address range to search
+ * @ret_smfn, @ret_emfn: return the address range of the first pmem region
+ * in above range
+ *
+ * Return:
+ * Return true if a pmem region is overlapped with @smfn - @emfn. The
+ * start and end MFN of the lowest pmem region are returned via
+ * @ret_smfn and @ret_emfn respectively.
+ *
+ * Return false if no pmem region is overlapped with @smfn - @emfn.
+ */
+bool __init acpi_nfit_boot_search_pmem(unsigned long smfn, unsigned long emfn,
+ unsigned long *ret_smfn,
+ unsigned long *ret_emfn)
+{
+ struct acpi_table_nfit *nfit_table = nfit_desc.acpi_table;
+ uint32_t hdr_offset = sizeof(*nfit_table);
+ unsigned long saddr = pfn_to_paddr(smfn), eaddr = pfn_to_paddr(emfn);
+ unsigned long ret_saddr = 0, ret_eaddr = 0;
+
+ if ( !nfit_table )
+ return false;
+
+ while ( hdr_offset < nfit_table->header.length )
+ {
+ struct acpi_nfit_header *hdr = (void *)nfit_table + hdr_offset;
+ struct acpi_nfit_system_address *spa;
+ unsigned long pmem_saddr, pmem_eaddr;
+
+ hdr_offset += hdr->length;
+
+ if ( hdr->type != ACPI_NFIT_TYPE_SYSTEM_ADDRESS )
+ continue;
+
+ spa = (struct acpi_nfit_system_address *)hdr;
+ if ( memcmp(spa->range_guid, nfit_spa_pmem_guid, 16) )
+ continue;
+
+ pmem_saddr = spa->address;
+ pmem_eaddr = pmem_saddr + spa->length;
+ if ( pmem_saddr >= eaddr || pmem_eaddr <= saddr )
+ continue;
+
+ if ( ret_saddr < pmem_saddr )
+ continue;
+ ret_saddr = pmem_saddr;
+ ret_eaddr = pmem_eaddr;
+ }
+
+ if ( ret_saddr == ret_eaddr )
+ return false;
+
+ *ret_smfn = paddr_to_pfn(ret_saddr);
+ *ret_emfn = paddr_to_pfn(ret_eaddr);
+
+ return true;
+}
@@ -71,6 +71,7 @@
#define ACPI_SIG_SBST "SBST" /* Smart Battery Specification Table */
#define ACPI_SIG_SLIT "SLIT" /* System Locality Distance Information Table */
#define ACPI_SIG_SRAT "SRAT" /* System Resource Affinity Table */
+#define ACPI_SIG_NFIT "NFIT" /* NVDIMM Firmware Interface Table */
/*
* All tables must be byte-packed to match the ACPI specification, since
@@ -903,6 +904,48 @@ struct acpi_msct_proximity {
u64 memory_capacity; /* In bytes */
};
+/*******************************************************************************
+ *
+ * NFIT - NVDIMM Interface Table (ACPI 6.0+)
+ * Version 1
+ *
+ ******************************************************************************/
+
+struct acpi_table_nfit {
+ struct acpi_table_header header; /* Common ACPI table header */
+ u32 reserved; /* Reserved, must be zero */
+};
+
+/* Subtable header for NFIT */
+
+struct acpi_nfit_header {
+ u16 type;
+ u16 length;
+};
+
+/* Values for subtable type in struct acpi_nfit_header */
+enum acpi_nfit_type {
+ ACPI_NFIT_TYPE_SYSTEM_ADDRESS = 0,
+ ACPI_NFIT_TYPE_MEMORY_MAP = 1,
+};
+
+/*
+ * NFIT Subtables
+ */
+
+/* 0: System Physical Address Range Structure */
+struct acpi_nfit_system_address {
+ struct acpi_nfit_header header;
+ u16 range_index;
+ u16 flags;
+ u32 reserved; /* Reseved, must be zero */
+ u32 proximity_domain;
+ u8 range_guid[16];
+ u64 address;
+ u64 length;
+ u64 memory_mapping;
+};
+
/*******************************************************************************
*
* SBST - Smart Battery Specification Table
@@ -180,4 +180,11 @@ void acpi_reboot(void);
void acpi_dmar_zap(void);
void acpi_dmar_reinstate(void);
+#ifdef CONFIG_NVDIMM_PMEM
+void acpi_nfit_boot_init(void);
+bool acpi_nfit_boot_search_pmem(unsigned long smfn, unsigned long emfn,
+ unsigned long *ret_smfn,
+ unsigned long *ret_emfn);
+#endif /* CONFIG_NVDIMM_PMEM */
+
#endif /*_LINUX_ACPI_H*/
No specification defines that PMEM regions cannot appear in margins between RAM regions. If that does happen, init_frametable() will need to allocate RAM for the part of frametable of PMEM regions. However, PMEM regions can be very large (several terabytes or more), so init_frametable() may fail. Because Xen does not use PMEM at the boot time, we can defer the actual resource allocation of frametable of PMEM regions. At the boot time, all pages of frametable of PMEM regions appearing between RAM regions are mapped one RAM page filled with 0xff. Any attempt, whichs write to those frametable pages before the their actual resource is allocated, implies bugs in Xen. Therefore, the read-only mapping is used here to make those bugs explicit. Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> --- Cc: Andrew Cooper <andrew.cooper3@citrix.com> Cc: George Dunlap <George.Dunlap@eu.citrix.com> Cc: Jan Beulich <jbeulich@suse.com> --- xen/arch/x86/mm.c | 117 +++++++++++++++++++++++++++++++++++++++++----- xen/arch/x86/setup.c | 4 ++ xen/drivers/acpi/Makefile | 2 + xen/drivers/acpi/nfit.c | 116 +++++++++++++++++++++++++++++++++++++++++++++ xen/include/acpi/actbl1.h | 43 +++++++++++++++++ xen/include/xen/acpi.h | 7 +++ 6 files changed, 278 insertions(+), 11 deletions(-) create mode 100644 xen/drivers/acpi/nfit.c