@@ -2605,6 +2605,22 @@ int xc_nvdimm_pmem_get_regions_nr(xc_interface *xch,
int xc_nvdimm_pmem_get_regions(xc_interface *xch, uint8_t type,
void *buffer, uint32_t *nr);
+/*
+ * Set up the specified PMEM pages for management usage. On success,
+ * these PMEM pages can be used to store the frame table and M2P table
+ * of themselves and other PMEM pages. These management PMEM pages will
+ * never be mapped to a guest.
+ *
+ * Parameters:
+ * xch: xc interface handle
+ * smfn, emfn: the start and end MFN of the PMEM region
+ *
+ * Return:
+ * On success, return 0. Otherwise, return a non-zero error code.
+ */
+int xc_nvdimm_pmem_setup_mgmt(xc_interface *xch,
+ unsigned long smfn, unsigned long emfn);
+
/* Compat shims */
#include "xenctrl_compat.h"
@@ -975,6 +975,40 @@ out:
return rc;
}
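+/*
+ * Fill in the sysctl fields shared by all XEN_SYSCTL_nvdimm_pmem_setup
+ * requests; the caller sets the region type afterwards.
+ */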
+static void xc_nvdimm_pmem_setup_common(struct xen_sysctl *sysctl,
+ unsigned long smfn, unsigned long emfn,
+ unsigned long mgmt_smfn,
+ unsigned long mgmt_emfn)
+{
+ xen_sysctl_nvdimm_op_t *nvdimm = &sysctl->u.nvdimm;
+ xen_sysctl_nvdimm_pmem_setup_t *setup = &nvdimm->u.pmem_setup;
+
+ sysctl->cmd = XEN_SYSCTL_nvdimm_op;
+ nvdimm->cmd = XEN_SYSCTL_nvdimm_pmem_setup;
+ nvdimm->pad = 0;
+ nvdimm->err = 0;
+ setup->smfn = smfn;
+ setup->emfn = emfn;
+ setup->mgmt_smfn = mgmt_smfn;
+ setup->mgmt_emfn = mgmt_emfn;
+}
+
+int xc_nvdimm_pmem_setup_mgmt(xc_interface *xch,
+ unsigned long smfn, unsigned long emfn)
+{
+ DECLARE_SYSCTL;
+ int rc;
+
+ xc_nvdimm_pmem_setup_common(&sysctl, smfn, emfn, smfn, emfn);
+ sysctl.u.nvdimm.u.pmem_setup.type = PMEM_REGION_TYPE_MGMT;
+
+ rc = do_sysctl(xch, &sysctl);
+ if ( rc && sysctl.u.nvdimm.err )
+ rc = -sysctl.u.nvdimm.err;
+
+ return rc;
+}
+
/*
* Local variables:
* mode: C
@@ -2341,7 +2341,8 @@ void put_page(struct page_info *page)
if ( unlikely((nx & PGC_count_mask) == 0) )
{
- if ( cleanup_page_cacheattr(page) == 0 )
+        if ( !is_pmem_page(page) /* PMEM pages are not allocated from the Xen heap. */
+ && cleanup_page_cacheattr(page) == 0 )
free_domheap_page(page);
else
gdprintk(XENLOG_WARNING,
@@ -1535,6 +1535,78 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
return ret;
}
+#ifdef CONFIG_NVDIMM_PMEM
+
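+/* Mark all pages in [smfn, emfn) as free PMEM pages in the frame table. */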
+static void pmem_init_frame_table(unsigned long smfn, unsigned long emfn)
+{
+ struct page_info *page = mfn_to_page(smfn), *epage = mfn_to_page(emfn);
+
+ while ( page < epage )
+ {
+ page->count_info = PGC_state_free | PGC_pmem_page;
+ page++;
+ }
+}
+
+/**
+ * Initialize the frame table and M2P table for the specified PMEM region.
+ *
+ * Parameters:
+ *  smfn, emfn: the start and end MFN of the PMEM region
+ *  pxm:        the proximity domain of the PMEM region
+ *  mgmt_smfn,
+ *  mgmt_emfn:  the start and end MFN of the PMEM region used to store
+ *              the frame table and M2P table of the above PMEM region.
+ *              If @smfn - @emfn is going to be mapped to a guest, it must
+ *              not overlap with @mgmt_smfn - @mgmt_emfn. If @smfn - @emfn
+ *              is going to be used for management purposes, it must be
+ *              identical to @mgmt_smfn - @mgmt_emfn.
+ *  used_mgmt_mfns: return the number of pages used in @mgmt_smfn - @mgmt_emfn
+ *
+ * Return:
+ * On success, return 0. Otherwise, return a non-zero error code.
+ */
+int pmem_arch_setup(unsigned long smfn, unsigned long emfn, unsigned int pxm,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn,
+ unsigned long *used_mgmt_mfns)
+{
+ struct mem_hotadd_info info =
+ { .spfn = smfn, .epfn = emfn, .cur = smfn };
+ struct mem_hotadd_info mgmt_info =
+ { .spfn = mgmt_smfn, .epfn = mgmt_emfn, .cur = mgmt_smfn };
+ struct mem_hotadd_alloc alloc =
+ {
+ .alloc_mfns = alloc_hotadd_mfn,
+ .opaque = &mgmt_info
+ };
+ bool is_mgmt = (mgmt_smfn == smfn && mgmt_emfn == emfn);
+ int rc;
+
+ if ( mgmt_smfn == mfn_x(INVALID_MFN) || mgmt_emfn == mfn_x(INVALID_MFN) ||
+ mgmt_smfn >= mgmt_emfn )
+ return -EINVAL;
+
+    /* Reject any overlap (including full containment) with the mgmt range. */
+    if ( !is_mgmt && smfn < mgmt_emfn && emfn > mgmt_smfn )
+ return -EINVAL;
+
+ rc = memory_add_common(&info, pxm, false, &alloc);
+ if ( rc )
+ return rc;
+
+ pmem_init_frame_table(smfn, emfn);
+
+ if ( !is_mgmt )
+ share_hotadd_m2p_table(&info);
+
+ if ( used_mgmt_mfns )
+ *used_mgmt_mfns = mgmt_info.cur - mgmt_info.spfn;
+
+ return 0;
+}
+
+#endif /* CONFIG_NVDIMM_PMEM */
+
#include "compat/mm.c"
/*
@@ -31,6 +31,15 @@
static LIST_HEAD(pmem_raw_regions);
static unsigned int nr_raw_regions;
+/*
+ * All PMEM regions reserved for management purposes are linked in this
+ * list. All of them must be covered by one or more PMEM regions in the
+ * pmem_raw_regions list.
+ */
+static LIST_HEAD(pmem_mgmt_regions);
+static DEFINE_SPINLOCK(pmem_mgmt_lock);
+static unsigned int nr_mgmt_regions;
+
struct pmem {
struct list_head link; /* link to one of PMEM region list */
unsigned long smfn; /* start MFN of the PMEM region */
@@ -40,6 +49,10 @@ struct pmem {
struct {
unsigned int pxm; /* proximity domain of the PMEM region */
} raw;
+
+ struct {
+ unsigned long used; /* # of used pages in MGMT PMEM region */
+ } mgmt;
} u;
};
@@ -107,6 +120,18 @@ static int pmem_list_add(struct list_head *list,
return rc;
}
+/**
+ * Delete the specified entry from the list to which it is currently
+ * linked, and free it.
+ *
+ * Parameters:
+ * entry: the entry to be deleted
+ */
+static void pmem_list_del(struct pmem *entry)
+{
+ list_del(&entry->link);
+ xfree(entry);
+}
+
static int pmem_get_regions_nr(xen_sysctl_nvdimm_pmem_regions_nr_t *regions_nr)
{
int rc = 0;
@@ -185,6 +210,114 @@ static int pmem_get_regions(xen_sysctl_nvdimm_pmem_regions_t *regions)
return rc;
}
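+/*
+ * Check whether a management region of @mgmt_mfns pages is large enough
+ * to hold the frame table and M2P table entries of @total_mfns pages.
+ */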
+static bool check_mgmt_size(unsigned long mgmt_mfns, unsigned long total_mfns)
+{
+ return mgmt_mfns >=
+ ((sizeof(struct page_info) * total_mfns) >> PAGE_SHIFT) +
+ ((sizeof(*machine_to_phys_mapping) * total_mfns) >> PAGE_SHIFT);
+}
+
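+/*
+ * Check that the range [@smfn, @emfn) is fully covered by already
+ * registered raw PMEM regions which all belong to the same proximity
+ * domain, and return that proximity domain via @ret_pxm.
+ */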
+static bool check_address_and_pxm(unsigned long smfn, unsigned long emfn,
+ unsigned int *ret_pxm)
+{
+ struct list_head *cur;
+ long pxm = -1;
+
+ list_for_each(cur, &pmem_raw_regions)
+ {
+ struct pmem *raw = list_entry(cur, struct pmem, link);
+ unsigned long raw_smfn = raw->smfn, raw_emfn = raw->emfn;
+
+ if ( !check_overlap(smfn, emfn, raw_smfn, raw_emfn) )
+ continue;
+
+ if ( smfn < raw_smfn )
+ return false;
+
+ if ( pxm != -1 && pxm != raw->u.raw.pxm )
+ return false;
+ pxm = raw->u.raw.pxm;
+
+ smfn = min(emfn, raw_emfn);
+ if ( smfn == emfn )
+ break;
+ }
+
+ *ret_pxm = pxm;
+
+ return smfn == emfn;
+}
+
+static int pmem_setup_mgmt(unsigned long smfn, unsigned long emfn)
+{
+ struct pmem *mgmt;
+ unsigned long used_mgmt_mfns;
+ unsigned int pxm;
+ int rc;
+
+ if ( smfn == mfn_x(INVALID_MFN) || emfn == mfn_x(INVALID_MFN) ||
+ smfn >= emfn )
+ return -EINVAL;
+
+ /*
+     * Require the PMEM region to be within a single proximity domain, in
+     * order to avoid error recovery from multiple calls to
+     * pmem_arch_setup(), which is not revertible.
+ */
+ if ( !check_address_and_pxm(smfn, emfn, &pxm) )
+ return -EINVAL;
+
+ if ( !check_mgmt_size(emfn - smfn, emfn - smfn) )
+ return -ENOSPC;
+
+ spin_lock(&pmem_mgmt_lock);
+
+ rc = pmem_list_add(&pmem_mgmt_regions, smfn, emfn, &mgmt);
+ if ( rc )
+ goto out;
+
+ rc = pmem_arch_setup(smfn, emfn, pxm, smfn, emfn, &used_mgmt_mfns);
+ if ( rc )
+ {
+ pmem_list_del(mgmt);
+ goto out;
+ }
+
+ mgmt->u.mgmt.used = used_mgmt_mfns;
+ nr_mgmt_regions++;
+
+ out:
+ spin_unlock(&pmem_mgmt_lock);
+
+ return rc;
+}
+
+static int pmem_setup(unsigned long smfn, unsigned long emfn,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn,
+ unsigned int type)
+{
+ int rc;
+
+ switch ( type )
+ {
+ case PMEM_REGION_TYPE_MGMT:
+ if ( smfn != mgmt_smfn || emfn != mgmt_emfn )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ rc = pmem_setup_mgmt(smfn, emfn);
+
+ break;
+
+ default:
+ rc = -EINVAL;
+ }
+
+ return rc;
+}
+
/**
* Register a pmem region to Xen.
*
@@ -234,6 +367,15 @@ int pmem_do_sysctl(struct xen_sysctl_nvdimm_op *nvdimm)
rc = pmem_get_regions(&nvdimm->u.pmem_regions);
break;
+ case XEN_SYSCTL_nvdimm_pmem_setup:
+ {
+ struct xen_sysctl_nvdimm_pmem_setup *setup = &nvdimm->u.pmem_setup;
+ rc = pmem_setup(setup->smfn, setup->emfn,
+ setup->mgmt_smfn, setup->mgmt_emfn,
+ setup->type);
+ break;
+ }
+
default:
rc = -ENOSYS;
}
@@ -245,9 +245,11 @@ struct page_info
#define PGC_state_offlined PG_mask(2, 9)
#define PGC_state_free PG_mask(3, 9)
#define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
+/* Page is from PMEM? */
+#define PGC_pmem_page PG_mask(1, 10)
/* Count of references to this frame. */
-#define PGC_count_width PG_shift(9)
+#define PGC_count_width PG_shift(10)
#define PGC_count_mask ((1UL<<PGC_count_width)-1)
/*
@@ -264,6 +266,12 @@ struct page_info
((((mfn) << PAGE_SHIFT) >= __pa(&_stext)) && \
(((mfn) << PAGE_SHIFT) <= __pa(&__2M_rwdata_end)))
+#ifdef CONFIG_NVDIMM_PMEM
+#define is_pmem_page(page) ((page)->count_info & PGC_pmem_page)
+#else
+#define is_pmem_page(page) false
+#endif
+
#define PRtype_info "016lx"/* should only be used for printk's */
/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
@@ -1120,6 +1120,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_sysctl_set_parameter_t);
/* Types of PMEM regions */
#define PMEM_REGION_TYPE_RAW 0 /* PMEM regions detected by Xen */
+#define PMEM_REGION_TYPE_MGMT 1 /* PMEM regions for management usage */
/* PMEM_REGION_TYPE_RAW */
struct xen_sysctl_nvdimm_pmem_raw_region {
@@ -1154,14 +1155,31 @@ struct xen_sysctl_nvdimm_pmem_regions {
typedef struct xen_sysctl_nvdimm_pmem_regions xen_sysctl_nvdimm_pmem_regions_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_nvdimm_pmem_regions_t);
+/* XEN_SYSCTL_nvdimm_pmem_setup */
+struct xen_sysctl_nvdimm_pmem_setup {
+ /* IN variables */
+ uint64_t smfn; /* start MFN of the PMEM region */
+ uint64_t emfn; /* end MFN of the PMEM region */
+ uint64_t mgmt_smfn;
+    uint64_t mgmt_emfn; /* start and end MFN of PMEM pages used to manage */
+                        /* the above PMEM region. If that PMEM region is  */
+                        /* a management region, mgmt_{s,e}mfn is required */
+                        /* to be identical to {s,e}mfn.                    */
+ uint8_t type; /* Only PMEM_REGION_TYPE_MGMT is supported now */
+};
+typedef struct xen_sysctl_nvdimm_pmem_setup xen_sysctl_nvdimm_pmem_setup_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_nvdimm_pmem_setup_t);
+
struct xen_sysctl_nvdimm_op {
uint32_t cmd; /* IN: XEN_SYSCTL_nvdimm_*. */
#define XEN_SYSCTL_nvdimm_pmem_get_regions_nr 0
#define XEN_SYSCTL_nvdimm_pmem_get_regions 1
+#define XEN_SYSCTL_nvdimm_pmem_setup 2
uint32_t pad; /* IN: Always zero. */
union {
xen_sysctl_nvdimm_pmem_regions_nr_t pmem_regions_nr;
xen_sysctl_nvdimm_pmem_regions_t pmem_regions;
+ xen_sysctl_nvdimm_pmem_setup_t pmem_setup;
} u;
uint32_t err; /* OUT: error code */
};
@@ -29,6 +29,9 @@ int pmem_do_sysctl(struct xen_sysctl_nvdimm_op *nvdimm);
#ifdef CONFIG_X86
int pmem_dom0_setup_permission(struct domain *d);
+int pmem_arch_setup(unsigned long smfn, unsigned long emfn, unsigned int pxm,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn,
+ unsigned long *used_mgmt_mfns);
#else /* !CONFIG_X86 */
@@ -37,6 +40,11 @@ static inline int pmem_dom0_setup_permission(...)
return -ENOSYS;
}
+static inline int pmem_arch_setup(...)
+{
+ return -ENOSYS;
+}
+
#endif /* CONFIG_X86 */
#endif /* CONFIG_NVDIMM_PMEM */
Add a command XEN_SYSCTL_nvdimm_pmem_setup to hypercall
XEN_SYSCTL_nvdimm_op to set up the frame table and M2P table of a PMEM
region. This command is currently used to set up the management PMEM
region, which is used to store the frame table and M2P table of other
PMEM regions and itself. The management PMEM region should not be
mapped to a guest.

PMEM pages are not added to any Xen or domain heap. A new flag,
PGC_pmem_page, is used to indicate whether a page is from PMEM and to
avoid returning PMEM pages to the heaps.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
---
 tools/libxc/include/xenctrl.h |  16 +++++
 tools/libxc/xc_misc.c         |  34 ++++++++++
 xen/arch/x86/mm.c             |   3 +-
 xen/arch/x86/x86_64/mm.c      |  72 +++++++++++++++++++++
 xen/common/pmem.c             | 142 ++++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/mm.h      |  10 ++-
 xen/include/public/sysctl.h   |  18 ++++++
 xen/include/xen/pmem.h        |   8 +++
 8 files changed, 301 insertions(+), 2 deletions(-)
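
As an illustration only (not part of this patch), below is a minimal sketch
of how a toolstack might invoke the new libxc call. The MFN range is
hypothetical; in practice it would be derived from the raw regions reported
by xc_nvdimm_pmem_get_regions().

#include <stdio.h>
#include <xenctrl.h>

int main(void)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    /* Hypothetical MFN range; must lie within a registered raw PMEM region. */
    unsigned long smfn = 0x480000, emfn = 0x4c0000;
    int rc;

    if ( !xch )
        return 1;

    /* Reserve [smfn, emfn) for storing frame table and M2P entries. */
    rc = xc_nvdimm_pmem_setup_mgmt(xch, smfn, emfn);
    if ( rc )
        fprintf(stderr, "PMEM mgmt setup failed: %d\n", rc);

    xc_interface_close(xch);
    return rc ? 1 : 0;
}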