diff mbox

[RFC,XEN,v3,16/39] xen/pmem: add XEN_SYSCTL_nvdimm_pmem_setup to setup management PMEM region

Message ID 20170911043820.14617-17-haozhong.zhang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Haozhong Zhang Sept. 11, 2017, 4:37 a.m. UTC
Add a command XEN_SYSCTL_nvdimm_pmem_setup to the hypercall
XEN_SYSCTL_nvdimm_op to set up the frame table and M2P table of a PMEM
region. This command is currently used to set up the management PMEM
region, which stores the frame table and M2P table of other PMEM
regions and of itself. The management PMEM region should not be
mapped to guests.

PMEM pages are not added to any Xen or domain heap. A new flag
PGC_pmem_page indicates whether a page is from PMEM, so that PMEM
pages are never returned to the heaps.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
---
 tools/libxc/include/xenctrl.h |  16 +++++
 tools/libxc/xc_misc.c         |  34 ++++++++++
 xen/arch/x86/mm.c             |   3 +-
 xen/arch/x86/x86_64/mm.c      |  72 +++++++++++++++++++++
 xen/common/pmem.c             | 142 ++++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/mm.h      |  10 ++-
 xen/include/public/sysctl.h   |  18 ++++++
 xen/include/xen/pmem.h        |   8 +++
 8 files changed, 301 insertions(+), 2 deletions(-)
diff mbox

Patch

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index d750e67460..7c5707fe11 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2605,6 +2605,22 @@  int xc_nvdimm_pmem_get_regions_nr(xc_interface *xch,
 int xc_nvdimm_pmem_get_regions(xc_interface *xch, uint8_t type,
                                void *buffer, uint32_t *nr);
 
+/*
+ * Set up the specified PMEM pages for management usage. On success,
+ * these PMEM pages can be used to store the frame table and M2P table
+ * of themselves and of other PMEM pages. These management PMEM pages
+ * will never be mapped to guests.
+ *
+ * Parameters:
+ *  xch:        xc interface handle
+ *  smfn, emfn: the start and (exclusive) end MFN of the PMEM region
+ *
+ * Return:
+ *  On success, return 0. Otherwise, return a non-zero error code.
+ */
+int xc_nvdimm_pmem_setup_mgmt(xc_interface *xch,
+                              unsigned long smfn, unsigned long emfn);
+
 /* Compat shims */
 #include "xenctrl_compat.h"
 
diff --git a/tools/libxc/xc_misc.c b/tools/libxc/xc_misc.c
index f9ce802eda..bebe6d04c8 100644
--- a/tools/libxc/xc_misc.c
+++ b/tools/libxc/xc_misc.c
@@ -975,6 +975,40 @@  out:
     return rc;
 }
 
+static void xc_nvdimm_pmem_setup_common(struct xen_sysctl *sysctl,
+                                        unsigned long smfn, unsigned long emfn,
+                                        unsigned long mgmt_smfn,
+                                        unsigned long mgmt_emfn)
+{
+    /* Fill in the request; the region type is left for the caller to set. */
+    xen_sysctl_nvdimm_op_t *op = &sysctl->u.nvdimm;
+    xen_sysctl_nvdimm_pmem_setup_t *req = &op->u.pmem_setup;
+
+    sysctl->cmd = XEN_SYSCTL_nvdimm_op;
+    op->cmd = XEN_SYSCTL_nvdimm_pmem_setup;
+    op->pad = op->err = 0;
+    req->smfn = smfn;
+    req->emfn = emfn;
+    req->mgmt_smfn = mgmt_smfn;
+    req->mgmt_emfn = mgmt_emfn;
+}
+
+int xc_nvdimm_pmem_setup_mgmt(xc_interface *xch,
+                              unsigned long smfn, unsigned long emfn)
+{
+    DECLARE_SYSCTL;
+    int rc;
+
+    /* A management region manages itself: mgmt_{s,e}mfn == {s,e}mfn. */
+    xc_nvdimm_pmem_setup_common(&sysctl, smfn, emfn, smfn, emfn);
+    sysctl.u.nvdimm.u.pmem_setup.type = PMEM_REGION_TYPE_MGMT;
+    rc = do_sysctl(xch, &sysctl);
+    /* On failure, prefer the error code stored in the nvdimm op, if any. */
+    if ( rc == 0 || sysctl.u.nvdimm.err == 0 )
+        return rc;
+    return -sysctl.u.nvdimm.err;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 2fdf609805..93ccf198c9 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2341,7 +2341,8 @@  void put_page(struct page_info *page)
 
     if ( unlikely((nx & PGC_count_mask) == 0) )
     {
-        if ( cleanup_page_cacheattr(page) == 0 )
+        if ( !is_pmem_page(page) /* PMEM page is not allocated from Xen heap. */
+             && cleanup_page_cacheattr(page) == 0 )
             free_domheap_page(page);
         else
             gdprintk(XENLOG_WARNING,
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index d92307ca0b..7dbc5e966c 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -1535,6 +1535,78 @@  int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
     return ret;
 }
 
+#ifdef CONFIG_NVDIMM_PMEM
+
+static void pmem_init_frame_table(unsigned long smfn, unsigned long emfn)
+{
+    struct page_info *pg, *end = mfn_to_page(emfn);
+
+    /*
+     * Mark every frame in [smfn, emfn) as a free page that comes from PMEM.
+     */
+    for ( pg = mfn_to_page(smfn); pg < end; pg++ )
+        pg->count_info = PGC_state_free | PGC_pmem_page;
+}
+
+/**
+ * Initialize frametable and M2P for the specified PMEM region.
+ *
+ * Parameters:
+ *  smfn, emfn: the start and (exclusive) end MFN of the PMEM region
+ *  pxm:        the proximity domain of the PMEM region
+ *  mgmt_smfn,
+ *  mgmt_emfn:  the start and end MFN of the PMEM region used to store
+ *              the frame table and M2P table of above PMEM region. If
+ *              @smfn - @emfn is going to be mapped to guest, it should
+ *              not overlap with @mgmt_smfn - @mgmt_emfn. If @smfn - @emfn
+ *              is going to be used for management purpose, it should
+ *              be identical to @mgmt_smfn - @mgmt_emfn.
+ *  used_mgmt_mfns: if not NULL, return the # of pages used in the MGMT region
+ *
+ * Return:
+ *  On success, return 0. Otherwise, return a non-zero error code.
+ */
+int pmem_arch_setup(unsigned long smfn, unsigned long emfn, unsigned int pxm,
+                    unsigned long mgmt_smfn, unsigned long mgmt_emfn,
+                    unsigned long *used_mgmt_mfns)
+{
+    struct mem_hotadd_info info =
+        { .spfn = smfn, .epfn = emfn, .cur = smfn };
+    struct mem_hotadd_info mgmt_info =
+        { .spfn = mgmt_smfn, .epfn = mgmt_emfn, .cur = mgmt_smfn };
+    struct mem_hotadd_alloc alloc =
+    {
+        .alloc_mfns = alloc_hotadd_mfn,
+        .opaque     = &mgmt_info
+    };
+    bool is_mgmt = (mgmt_smfn == smfn && mgmt_emfn == emfn);
+    int rc;
+
+    if ( mgmt_smfn == mfn_x(INVALID_MFN) || mgmt_emfn == mfn_x(INVALID_MFN) ||
+         mgmt_smfn >= mgmt_emfn || smfn >= emfn ) /* no empty/inverted ranges */
+        return -EINVAL;
+
+    /* Half-open ranges overlap iff each one starts before the other ends. */
+    if ( !is_mgmt && smfn < mgmt_emfn && mgmt_smfn < emfn )
+        return -EINVAL;
+
+    rc = memory_add_common(&info, pxm, false, &alloc);
+    if ( rc )
+        return rc;
+
+    pmem_init_frame_table(smfn, emfn);
+
+    if ( !is_mgmt )
+        share_hotadd_m2p_table(&info);
+
+    if ( used_mgmt_mfns )
+        *used_mgmt_mfns = mgmt_info.cur - mgmt_info.spfn;
+
+    return 0;
+}
+
+#endif /* CONFIG_NVDIMM_PMEM */
+
 #include "compat/mm.c"
 
 /*
diff --git a/xen/common/pmem.c b/xen/common/pmem.c
index a737e7dc71..7a081c2879 100644
--- a/xen/common/pmem.c
+++ b/xen/common/pmem.c
@@ -31,6 +31,15 @@ 
 static LIST_HEAD(pmem_raw_regions);
 static unsigned int nr_raw_regions;
 
+/*
+ * All PMEM regions reserved for management purposes are linked to this
+ * list. Each of them must be covered by one or more PMEM regions in
+ * the list pmem_raw_regions.
+ */
+static LIST_HEAD(pmem_mgmt_regions);
+static DEFINE_SPINLOCK(pmem_mgmt_lock); /* held while updating the list/count */
+static unsigned int nr_mgmt_regions; /* # of successfully set up MGMT regions */
+
 struct pmem {
     struct list_head link; /* link to one of PMEM region list */
     unsigned long smfn;    /* start MFN of the PMEM region */
@@ -40,6 +49,10 @@  struct pmem {
         struct {
             unsigned int pxm; /* proximity domain of the PMEM region */
         } raw;
+
+        struct {
+            unsigned long used; /* # of used pages in MGMT PMEM region */
+        } mgmt;
     } u;
 };
 
@@ -107,6 +120,18 @@  static int pmem_list_add(struct list_head *list,
     return rc;
 }
 
+/**
+ * Unlink the specified entry from the list it is linked to, then free it.
+ *
+ * Parameters:
+ *  entry: the entry to be unlinked and freed; must not be used afterwards
+ */
+static void pmem_list_del(struct pmem *entry)
+{
+    list_del(&entry->link);
+    xfree(entry);
+}
+
 static int pmem_get_regions_nr(xen_sysctl_nvdimm_pmem_regions_nr_t *regions_nr)
 {
     int rc = 0;
@@ -185,6 +210,114 @@  static int pmem_get_regions(xen_sysctl_nvdimm_pmem_regions_t *regions)
     return rc;
 }
 
+static bool check_mgmt_size(unsigned long mgmt_mfns, unsigned long total_mfns)
+{
+    return mgmt_mfns >= /* frame table + M2P, each rounded UP to whole pages */
+        ((sizeof(struct page_info) * total_mfns + PAGE_SIZE - 1) >> PAGE_SHIFT) +
+        ((sizeof(*machine_to_phys_mapping) * total_mfns + PAGE_SIZE - 1) >> PAGE_SHIFT);
+}
+
+static bool check_address_and_pxm(unsigned long smfn, unsigned long emfn,
+                                  unsigned int *ret_pxm)
+{   /* Is [smfn, emfn) fully covered by raw regions that share one pxm? */
+    struct list_head *cur;
+    long pxm = -1; /* -1: no overlapping raw region seen yet */
+
+    list_for_each(cur, &pmem_raw_regions) /* NOTE(review): presumably sorted by MFN - confirm */
+    {
+        struct pmem *raw = list_entry(cur, struct pmem, link);
+        unsigned long raw_smfn = raw->smfn, raw_emfn = raw->emfn;
+
+        if ( !check_overlap(smfn, emfn, raw_smfn, raw_emfn) )
+            continue; /* this raw region does not touch the remaining range */
+
+        if ( smfn < raw_smfn )
+            return false; /* remaining range starts before this raw region */
+
+        if ( pxm != -1 && pxm != raw->u.raw.pxm )
+            return false; /* range spans raw regions with different pxm */
+        pxm = raw->u.raw.pxm;
+
+        smfn = min(emfn, raw_emfn); /* drop the part covered by this region */
+        if ( smfn == emfn )
+            break; /* nothing left to cover */
+    }
+
+    *ret_pxm = pxm; /* written on this path even if the result is false */
+
+    return smfn == emfn; /* true iff the whole range was consumed */
+}
+
+static int pmem_setup_mgmt(unsigned long smfn, unsigned long emfn)
+{   /* Register [smfn, emfn) as a management region that stores its own tables. */
+    struct pmem *mgmt;
+    unsigned long used_mgmt_mfns;
+    unsigned int pxm;
+    int rc;
+
+    if ( smfn == mfn_x(INVALID_MFN) || emfn == mfn_x(INVALID_MFN) ||
+         smfn >= emfn )
+        return -EINVAL;
+
+    /*
+     * Require the PMEM region to be in a single proximity domain, so
+     * that pmem_arch_setup() is called only once: it cannot be reverted,
+     * so an error after several calls could not be recovered from.
+     */
+    if ( !check_address_and_pxm(smfn, emfn, &pxm) )
+        return -EINVAL;
+
+    if ( !check_mgmt_size(emfn - smfn, emfn - smfn) ) /* must hold its own tables */
+        return -ENOSPC;
+
+    spin_lock(&pmem_mgmt_lock);
+
+    rc = pmem_list_add(&pmem_mgmt_regions, smfn, emfn, &mgmt);
+    if ( rc )
+        goto out;
+
+    rc = pmem_arch_setup(smfn, emfn, pxm, smfn, emfn, &used_mgmt_mfns);
+    if ( rc )
+    {
+        pmem_list_del(mgmt); /* undo pmem_list_add() above */
+        goto out;
+    }
+
+    mgmt->u.mgmt.used = used_mgmt_mfns; /* pages consumed by the tables */
+    nr_mgmt_regions++;
+
+ out:
+    spin_unlock(&pmem_mgmt_lock);
+
+    return rc;
+}
+
+/**
+ * Set up a PMEM region according to its type.
+ *
+ * Parameters:
+ *  smfn, emfn:  the start and end MFN of the PMEM region
+ *  mgmt_smfn,
+ *  mgmt_emfn:   the start and end MFN of the management PMEM region
+ *  type:        PMEM_REGION_TYPE_*; only PMEM_REGION_TYPE_MGMT is accepted
+ *
+ * Return:
+ *  On success, return 0. Otherwise, return a non-zero error code.
+ */
+static int pmem_setup(unsigned long smfn, unsigned long emfn,
+                      unsigned long mgmt_smfn, unsigned long mgmt_emfn,
+                      unsigned int type)
+{
+    if ( type != PMEM_REGION_TYPE_MGMT )
+        return -EINVAL;
+
+    /* A management region manages itself, so both ranges must be identical. */
+    if ( smfn != mgmt_smfn || emfn != mgmt_emfn )
+        return -EINVAL;
+
+    return pmem_setup_mgmt(smfn, emfn);
+}
+
 /**
  * Register a pmem region to Xen.
  *
@@ -234,6 +367,15 @@  int pmem_do_sysctl(struct xen_sysctl_nvdimm_op *nvdimm)
         rc = pmem_get_regions(&nvdimm->u.pmem_regions);
         break;
 
+    case XEN_SYSCTL_nvdimm_pmem_setup:
+    {
+        struct xen_sysctl_nvdimm_pmem_setup *setup = &nvdimm->u.pmem_setup;
+        rc = pmem_setup(setup->smfn, setup->emfn,
+                        setup->mgmt_smfn, setup->mgmt_emfn,
+                        setup->type);
+        break;
+    }
+
     default:
         rc = -ENOSYS;
     }
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index bef45e8e9f..33a732846f 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -245,9 +245,11 @@  struct page_info
 #define PGC_state_offlined PG_mask(2, 9)
 #define PGC_state_free    PG_mask(3, 9)
 #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
+/* Page is from PMEM? */
+#define PGC_pmem_page     PG_mask(1, 10)
 
  /* Count of references to this frame. */
-#define PGC_count_width   PG_shift(9)
+#define PGC_count_width   PG_shift(10)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
 /*
@@ -264,6 +266,12 @@  struct page_info
     ((((mfn) << PAGE_SHIFT) >= __pa(&_stext)) &&  \
      (((mfn) << PAGE_SHIFT) <= __pa(&__2M_rwdata_end)))
 
+#ifdef CONFIG_NVDIMM_PMEM /* yields 0/1, so the result survives narrowing to int */
+#define is_pmem_page(page) (!!((page)->count_info & PGC_pmem_page))
+#else /* !CONFIG_NVDIMM_PMEM: no page can come from PMEM */
+#define is_pmem_page(page) false
+#endif /* CONFIG_NVDIMM_PMEM */
+
 #define PRtype_info "016lx"/* should only be used for printk's */
 
 /* The number of out-of-sync shadows we allow per vcpu (prime, please) */
diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
index 2635b1c911..5d208033a0 100644
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -1120,6 +1120,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_sysctl_set_parameter_t);
 
 /* Types of PMEM regions */
 #define PMEM_REGION_TYPE_RAW        0 /* PMEM regions detected by Xen */
+#define PMEM_REGION_TYPE_MGMT       1 /* PMEM regions for management usage */
 
 /* PMEM_REGION_TYPE_RAW */
 struct xen_sysctl_nvdimm_pmem_raw_region {
@@ -1154,14 +1155,31 @@  struct xen_sysctl_nvdimm_pmem_regions {
 typedef struct xen_sysctl_nvdimm_pmem_regions xen_sysctl_nvdimm_pmem_regions_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_nvdimm_pmem_regions_t);
 
+/* XEN_SYSCTL_nvdimm_pmem_setup */
+struct xen_sysctl_nvdimm_pmem_setup {
+    /* IN variables */
+    uint64_t smfn;      /* start MFN of the PMEM region */
+    uint64_t emfn;      /* end MFN (exclusive) of the PMEM region */
+    uint64_t mgmt_smfn;
+    uint64_t mgmt_emfn; /* start and end MFN of the PMEM pages used to manage */
+                        /* the above PMEM region. If the above PMEM region is */
+                        /* a management region, mgmt_{s,e}mfn are required */
+                        /* to be identical to {s,e}mfn. */
+    uint8_t  type;      /* PMEM_REGION_TYPE_*; only _MGMT is supported now */
+}; /* NOTE(review): 'type' is followed by implicit tail padding - confirm ABI */
+typedef struct xen_sysctl_nvdimm_pmem_setup xen_sysctl_nvdimm_pmem_setup_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_nvdimm_pmem_setup_t);
+
 struct xen_sysctl_nvdimm_op {
     uint32_t cmd; /* IN: XEN_SYSCTL_nvdimm_*. */
 #define XEN_SYSCTL_nvdimm_pmem_get_regions_nr     0
 #define XEN_SYSCTL_nvdimm_pmem_get_regions        1
+#define XEN_SYSCTL_nvdimm_pmem_setup              2 /* uses u.pmem_setup */
     uint32_t pad; /* IN: Always zero. */
     union {
         xen_sysctl_nvdimm_pmem_regions_nr_t pmem_regions_nr;
         xen_sysctl_nvdimm_pmem_regions_t pmem_regions;
+        xen_sysctl_nvdimm_pmem_setup_t pmem_setup; /* IN: region to set up */
     } u;
     uint32_t err; /* OUT: error code */
 };
diff --git a/xen/include/xen/pmem.h b/xen/include/xen/pmem.h
index 922b12f570..9323d679a6 100644
--- a/xen/include/xen/pmem.h
+++ b/xen/include/xen/pmem.h
@@ -29,6 +29,9 @@  int pmem_do_sysctl(struct xen_sysctl_nvdimm_op *nvdimm);
 #ifdef CONFIG_X86
 
 int pmem_dom0_setup_permission(struct domain *d);
+int pmem_arch_setup(unsigned long smfn, unsigned long emfn, unsigned int pxm,
+                    unsigned long mgmt_smfn, unsigned long mgmt_emfn,
+                    unsigned long *used_mgmt_mfns); /* may be NULL */
 
 #else /* !CONFIG_X86 */
 
@@ -37,6 +40,11 @@  static inline int pmem_dom0_setup_permission(...)
     return -ENOSYS;
 }
 
+static inline int pmem_arch_setup(...)
+{ /* NOTE(review): "(...)" with no named parameter is only valid from C23 on - confirm */
+    return -ENOSYS; /* stub for builds without CONFIG_X86 */
+}
+
 #endif /* CONFIG_X86 */
 
 #endif /* CONFIG_NVDIMM_PMEM */