@@ -16,7 +16,7 @@ allow dom0_t xen_t:xen {
allow dom0_t xen_t:xen2 {
resource_op psr_cmt_op psr_cat_op pmu_ctrl get_symbol
get_cpu_levelling_caps get_cpu_featureset livepatch_op
- gcov_op
+ gcov_op nvdimm_op
};
# Allow dom0 to use all XENVER_ subops that have checks.
@@ -2534,6 +2534,25 @@ int xc_livepatch_replace(xc_interface *xch, char *name, uint32_t timeout);
int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
xen_pfn_t start_pfn, xen_pfn_t nr_pfns);
+/*
+ * Query Xen hypervisor to prepare for mapping host pmem pages.
+ *
+ * Parameters:
+ * xch: xc interface handler
+ * smfn: the start MFN of the host pmem pages to be mapped
+ * emfn: the end MFN of the host pmem pages to be mapped
+ * mgmt_smfn: If not INVALID_MFN, the start MFN of host pmem pages for managing
+ * above pmem pages
+ * mgmt_emfn: If not INVALID_MFN, the end MFN of host pmem pages for managing
+ * above pmem pages
+ *
+ * Return:
+ * 0 on success; non-zero error code on failures.
+ */
+int xc_nvdimm_pmem_setup(xc_interface *xch,
+ unsigned long smfn, unsigned long emfn,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn);
+
/* Compat shims */
#include "xenctrl_compat.h"
@@ -817,6 +817,23 @@ int xc_livepatch_replace(xc_interface *xch, char *name, uint32_t timeout)
return _xc_livepatch_action(xch, name, LIVEPATCH_ACTION_REPLACE, timeout);
}
+int xc_nvdimm_pmem_setup(xc_interface *xch,
+ unsigned long smfn, unsigned long emfn,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn)
+{
+ DECLARE_SYSCTL;
+
+ sysctl.cmd = XEN_SYSCTL_nvdimm_op;
+ sysctl.u.nvdimm.cmd = XEN_SYSCTL_nvdimm_pmem_setup;
+ sysctl.u.nvdimm.pad = 0;
+ sysctl.u.nvdimm.u.setup.smfn = smfn;
+ sysctl.u.nvdimm.u.setup.emfn = emfn;
+ sysctl.u.nvdimm.u.setup.mgmt_smfn = mgmt_smfn;
+ sysctl.u.nvdimm.u.setup.mgmt_emfn = mgmt_emfn;
+
+ return do_sysctl(xch, &sysctl);
+}
+
/*
* Local variables:
* mode: C
new file mode 100644
@@ -19,6 +19,7 @@
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/iocap.h>
+#include <xen/pmem.h>
#include <asm/irq.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
@@ -250,6 +251,27 @@ long arch_do_sysctl(
break;
}
+#ifdef CONFIG_PMEM
+ case XEN_SYSCTL_nvdimm_op:
+ {
+ xen_sysctl_nvdimm_pmem_setup_t *setup;
+
+ switch ( sysctl->u.nvdimm.cmd )
+ {
+ case XEN_SYSCTL_nvdimm_pmem_setup:
+ setup = &sysctl->u.nvdimm.u.setup;
+ ret = pmem_setup(setup->smfn, setup->emfn,
+ setup->mgmt_smfn, setup->mgmt_emfn);
+ break;
+
+ default:
+ ret = -ENOSYS;
+ }
+
+ break;
+ }
+#endif /* CONFIG_PMEM */
+
default:
ret = -ENOSYS;
break;
@@ -27,6 +27,7 @@ asm(".file \"" __FILE__ "\"");
#include <xen/guest_access.h>
#include <xen/hypercall.h>
#include <xen/mem_access.h>
+#include <xen/pmem.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
@@ -1522,6 +1523,116 @@ destroy_frametable:
return ret;
}
+#ifdef CONFIG_PMEM
+
+static unsigned long pmem_alloc_from_ram(struct mem_hotadd_info *unused)
+{
+ unsigned long mfn = mfn_x(INVALID_MFN);
+ struct page_info *page = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0);
+
+ if ( page )
+ mfn = page_to_mfn(page);
+
+ return mfn;
+}
+
+static int pmem_setup_m2p_table(const struct mem_hotadd_info *info,
+ mfns_alloc_fn_t alloc_fn,
+ struct mem_hotadd_info *alloc_info)
+{
+ unsigned long smfn = info->spfn;
+ unsigned long emfn = info->epfn;
+
+ if ( max_page < emfn )
+ {
+ max_page = emfn;
+ max_pdx = pfn_to_pdx(max_page - 1) + 1;
+ }
+ total_pages += emfn - smfn;
+
+ set_pdx_range(smfn, emfn);
+
+ return setup_m2p_table(info, alloc_fn, alloc_info);
+}
+
+int pmem_arch_setup(unsigned long data_smfn, unsigned long data_emfn,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn)
+{
+ int ret;
+    unsigned long old_max_mgmt = max_page, old_total_mgmt = total_pages;
+    unsigned long old_max_data, old_total_data;
+ bool mgmt_in_pmem = (mgmt_smfn != mfn_x(INVALID_MFN) &&
+ mgmt_emfn != mfn_x(INVALID_MFN));
+ mfns_alloc_fn_t alloc_fn = pmem_alloc_from_ram;
+ struct mem_hotadd_info *alloc_info = NULL;
+ struct mem_hotadd_info data_info =
+ { .spfn = data_smfn, .epfn = data_emfn, .cur = data_smfn };
+ struct mem_hotadd_info mgmt_info =
+ { .spfn = mgmt_smfn, .epfn = mgmt_emfn, .cur = mgmt_smfn };
+
+ if ( !mem_hotadd_check(data_smfn, data_emfn) )
+ return -EINVAL;
+
+ if ( mgmt_in_pmem )
+ {
+ if ( !mem_hotadd_check(mgmt_smfn, mgmt_emfn) )
+ return -EINVAL;
+
+ alloc_fn = alloc_hotadd_mfn;
+ alloc_info = &mgmt_info;
+
+ ret = extend_frame_table(&mgmt_info, alloc_fn, alloc_info);
+ if ( ret )
+ goto destroy_frametable_mgmt;
+
+ ret = pmem_setup_m2p_table(&mgmt_info, alloc_fn, alloc_info);
+ if ( ret )
+ goto destroy_m2p_mgmt;
+ }
+
+ ret = extend_frame_table(&data_info, alloc_fn, alloc_info);
+ if ( ret )
+ goto destroy_frametable_data;
+
+ old_max_data = max_page;
+ old_total_data = total_pages;
+ ret = pmem_setup_m2p_table(&data_info, alloc_fn, alloc_info);
+ if ( ret )
+ goto destroy_m2p_data;
+
+ share_hotadd_m2p_table(&data_info);
+ if ( mgmt_in_pmem )
+ share_hotadd_m2p_table(&mgmt_info);
+
+ return 0;
+
+destroy_m2p_data:
+ destroy_m2p_mapping(&data_info);
+ max_page = old_max_data;
+ total_pages = old_total_data;
+ max_pdx = pfn_to_pdx(max_page - 1) + 1;
+
+destroy_frametable_data:
+ cleanup_frame_table(&data_info);
+
+destroy_m2p_mgmt:
+ if ( mgmt_in_pmem )
+ {
+ destroy_m2p_mapping(&mgmt_info);
+ max_page = old_max_mgmt;
+ total_pages = old_total_mgmt;
+ max_pdx = pfn_to_pdx(max_page - 1) + 1;
+ }
+
+destroy_frametable_mgmt:
+ if ( mgmt_in_pmem )
+ cleanup_frame_table(&mgmt_info);
+
+ return ret;
+}
+
+#endif /* CONFIG_PMEM */
+
#include "compat/mm.c"
/*
@@ -18,6 +18,7 @@
#include <xen/errno.h>
#include <xen/list.h>
+#include <xen/mm.h>
#include <xen/pmem.h>
#include <xen/spinlock.h>
@@ -28,10 +29,33 @@
static DEFINE_SPINLOCK(pmem_regions_lock);
static LIST_HEAD(pmem_regions);
+/*
+ * Two types of pmem regions are linked in this list and are
+ * distinguished by their ready flags.
+ * - Data pmem regions that can be mapped to guest, and their ready
+ * flags are true.
+ * - Management pmem regions that are used to manage data regions
+ * and never mapped to guest, and their ready flags are false.
+ *
+ * All regions linked in this list must be covered by one or multiple
+ * regions in list pmem_regions as well.
+ */
+static DEFINE_SPINLOCK(pmem_gregions_lock);
+static LIST_HEAD(pmem_gregions);
+
struct pmem {
struct list_head link; /* link to pmem_list */
unsigned long smfn; /* start MFN of the whole pmem region */
unsigned long emfn; /* end MFN of the whole pmem region */
+
+ /*
+ * If frametable and M2P of this pmem region is stored in the
+ * regular RAM, mgmt will be NULL. Otherwise, it refers to another
+ * pmem region used for those management structures.
+ */
+ struct pmem *mgmt;
+
+ bool ready; /* indicate whether it can be mapped to guest */
};
static bool check_overlap(unsigned long smfn1, unsigned long emfn1,
@@ -76,6 +100,82 @@ static int pmem_list_add(struct list_head *list, struct pmem *entry)
return 0;
}
+static void pmem_list_remove(struct pmem *entry)
+{
+ list_del(&entry->link);
+}
+
+static struct pmem *get_first_overlap(const struct list_head *list,
+                                     unsigned long smfn, unsigned long emfn)
+{
+ struct list_head *cur;
+ struct pmem *overlap = NULL;
+
+ list_for_each(cur, list)
+ {
+ struct pmem *cur_pmem = list_entry(cur, struct pmem, link);
+ unsigned long cur_smfn = cur_pmem->smfn;
+ unsigned long cur_emfn = cur_pmem->emfn;
+
+ if ( emfn <= cur_smfn )
+ break;
+
+ if ( check_overlap(smfn, emfn, cur_smfn, cur_emfn) )
+ {
+ overlap = cur_pmem;
+ break;
+ }
+ }
+
+ return overlap;
+}
+
+static bool pmem_list_covered(const struct list_head *list,
+                              unsigned long smfn, unsigned long emfn)
+{
+ struct pmem *overlap;
+ bool covered = false;
+
+ do {
+ overlap = get_first_overlap(list, smfn, emfn);
+
+ if ( !overlap || smfn < overlap->smfn )
+ break;
+
+ if ( emfn <= overlap->emfn )
+ {
+ covered = true;
+ break;
+ }
+
+ smfn = overlap->emfn;
+ list = &overlap->link;
+ } while ( list );
+
+ return covered;
+}
+
+static bool check_mgmt_size(unsigned long mgmt_mfns, unsigned long total_mfns)
+{
+ return mgmt_mfns >=
+        ((sizeof(struct page_info) * total_mfns + PAGE_SIZE - 1) >> PAGE_SHIFT) +
+        ((sizeof(*machine_to_phys_mapping) * total_mfns + PAGE_SIZE - 1) >> PAGE_SHIFT);
+}
+
+static bool check_region(unsigned long smfn, unsigned long emfn)
+{
+ bool rc;
+
+ if ( smfn >= emfn )
+ return false;
+
+ spin_lock(&pmem_regions_lock);
+ rc = pmem_list_covered(&pmem_regions, smfn, emfn);
+ spin_unlock(&pmem_regions_lock);
+
+ return rc;
+}
+
/**
* Register a pmem region to Xen. It's used by Xen hypervisor to collect
* all pmem regions can be used later.
@@ -104,3 +204,100 @@ int pmem_register(unsigned long smfn, unsigned long emfn)
return rc;
}
+
+/**
+ * Setup a data pmem region that can be used by guest later. A
+ * separate pmem region, or the management region, can be specified to
+ * store the frametable and M2P tables of the data pmem region.
+ *
+ * Parameters:
+ * data_smfn/_emfn: start and end MFNs of the data pmem region
+ *  mgmt_smfn/_emfn: If not mfn_x(INVALID_MFN), then the pmem region from
+ *                   mgmt_smfn to mgmt_emfn will be used for the frametable
+ *                   and M2P of itself and the data pmem region. Otherwise, the
+ * regular RAM will be used.
+ *
+ * Return:
+ * On success, return 0. Otherwise, an error number will be returned.
+ */
+int pmem_setup(unsigned long data_smfn, unsigned long data_emfn,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn)
+{
+ int rc = 0;
+ bool mgmt_in_pmem = mgmt_smfn != mfn_x(INVALID_MFN) &&
+ mgmt_emfn != mfn_x(INVALID_MFN);
+ struct pmem *pmem, *mgmt = NULL;
+ unsigned long mgmt_mfns = mgmt_emfn - mgmt_smfn;
+ unsigned long total_mfns = data_emfn - data_smfn + mgmt_mfns;
+ unsigned long i;
+ struct page_info *pg;
+
+ if ( !check_region(data_smfn, data_emfn) )
+ return -EINVAL;
+
+ if ( mgmt_in_pmem &&
+ (!check_region(mgmt_smfn, mgmt_emfn) ||
+ !check_mgmt_size(mgmt_mfns, total_mfns)) )
+ return -EINVAL;
+
+ pmem = alloc_pmem_struct(data_smfn, data_emfn);
+ if ( !pmem )
+ return -ENOMEM;
+ if ( mgmt_in_pmem )
+ {
+ mgmt = alloc_pmem_struct(mgmt_smfn, mgmt_emfn);
+ if ( !mgmt )
+            { xfree(pmem); return -ENOMEM; }
+ }
+
+ spin_lock(&pmem_gregions_lock);
+ rc = pmem_list_add(&pmem_gregions, pmem);
+ if ( rc )
+ {
+ spin_unlock(&pmem_gregions_lock);
+ goto out;
+ }
+ if ( mgmt_in_pmem )
+ {
+ rc = pmem_list_add(&pmem_gregions, mgmt);
+ if ( rc )
+ {
+ spin_unlock(&pmem_gregions_lock);
+ goto out_remove_pmem;
+ }
+ }
+ spin_unlock(&pmem_gregions_lock);
+
+ rc = pmem_arch_setup(data_smfn, data_emfn, mgmt_smfn, mgmt_emfn);
+ if ( rc )
+ goto out_remove_mgmt;
+
+ for ( i = data_smfn; i < data_emfn; i++ )
+ {
+ pg = mfn_to_page(i);
+ pg->count_info = PGC_state_free;
+ }
+
+ if ( mgmt_in_pmem )
+ pmem->mgmt = mgmt->mgmt = mgmt;
+ /* As mgmt is never mapped to guest, we do not set its ready flag. */
+ pmem->ready = true;
+
+ return 0;
+
+ out_remove_mgmt:
+ if ( mgmt )
+ {
+ spin_lock(&pmem_gregions_lock);
+ pmem_list_remove(mgmt);
+ spin_unlock(&pmem_gregions_lock);
+ xfree(mgmt);
+ }
+ out_remove_pmem:
+ spin_lock(&pmem_gregions_lock);
+ pmem_list_remove(pmem);
+ spin_unlock(&pmem_gregions_lock);
+ xfree(pmem);
+ out:
+ return rc;
+}
@@ -36,7 +36,7 @@
#include "physdev.h"
#include "tmem.h"
-#define XEN_SYSCTL_INTERFACE_VERSION 0x0000000F
+#define XEN_SYSCTL_INTERFACE_VERSION 0x00000010
/*
* Read console content from Xen buffer ring.
@@ -1088,6 +1088,31 @@ struct xen_sysctl_livepatch_op {
typedef struct xen_sysctl_livepatch_op xen_sysctl_livepatch_op_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_livepatch_op_t);
+#define XEN_SYSCTL_nvdimm_pmem_setup 0
+struct xen_sysctl_nvdimm_pmem_setup {
+ /* IN variables */
+ uint64_t smfn; /* start MFN of the pmem region */
+ uint64_t emfn; /* end MFN of the pmem region */
+ uint64_t mgmt_smfn; /* If not INVALID_MFN, start MFN of another pmem */
+ /* region that will be used to manage above pmem */
+ /* region. */
+ uint64_t mgmt_emfn; /* If not INVALID_MFN, end MFN of another pmem */
+ /* region that will be used to manage above pmem */
+ /* region. */
+};
+typedef struct xen_sysctl_nvdimm_pmem_setup xen_sysctl_nvdimm_pmem_setup_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_nvdimm_pmem_setup_t);
+
+struct xen_sysctl_nvdimm_op {
+ uint32_t cmd; /* IN: XEN_SYSCTL_NVDIMM_*. */
+ uint32_t pad; /* IN: Always zero. */
+ union {
+ xen_sysctl_nvdimm_pmem_setup_t setup;
+ } u;
+};
+typedef struct xen_sysctl_nvdimm_op xen_sysctl_nvdimm_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_nvdimm_op_t);
+
struct xen_sysctl {
uint32_t cmd;
#define XEN_SYSCTL_readconsole 1
@@ -1116,6 +1141,7 @@ struct xen_sysctl {
#define XEN_SYSCTL_get_cpu_levelling_caps 25
#define XEN_SYSCTL_get_cpu_featureset 26
#define XEN_SYSCTL_livepatch_op 27
+#define XEN_SYSCTL_nvdimm_op 28
uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
union {
struct xen_sysctl_readconsole readconsole;
@@ -1144,6 +1170,7 @@ struct xen_sysctl {
struct xen_sysctl_cpu_levelling_caps cpu_levelling_caps;
struct xen_sysctl_cpu_featureset cpu_featureset;
struct xen_sysctl_livepatch_op livepatch;
+ struct xen_sysctl_nvdimm_op nvdimm;
uint8_t pad[128];
} u;
};
@@ -23,6 +23,20 @@
#include <xen/types.h>
int pmem_register(unsigned long smfn, unsigned long emfn);
+int pmem_setup(unsigned long data_spfn, unsigned long data_emfn,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn);
+
+#ifdef CONFIG_X86
+int pmem_arch_setup(unsigned long data_smfn, unsigned long data_emfn,
+ unsigned long mgmt_smfn, unsigned long mgmt_emfn);
+#else /* !CONFIG_X86 */
+static inline int
+pmem_arch_setup(unsigned long data_smfn, unsigned long data_emfn,
+                unsigned long mgmt_smfn, unsigned long mgmt_emfn)
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_X86 */
#endif /* CONFIG_PMEM */
#endif /* __XEN_PMEM_H__ */
@@ -826,6 +826,10 @@ static int flask_sysctl(int cmd)
return avc_current_has_perm(SECINITSID_XEN, SECCLASS_XEN2,
XEN2__GCOV_OP, NULL);
+ case XEN_SYSCTL_nvdimm_op:
+ return avc_current_has_perm(SECINITSID_XEN, SECCLASS_XEN2,
+ XEN2__NVDIMM_OP, NULL);
+
default:
return avc_unknown_permission("sysctl", cmd);
}
@@ -101,6 +101,8 @@ class xen2
livepatch_op
# XEN_SYSCTL_gcov_op
gcov_op
+# XEN_SYSCTL_nvdimm_op
+ nvdimm_op
}
# Classes domain and domain2 consist of operations that a domain performs on
Xen hypervisor is not aware of which portions of pmem can be used to store the frame table and M2P table of pmem. Instead, it provides users or admins in Dom0 with a sysctl XEN_SYSCTL_nvdimm_pmem_setup to specify the location. XEN_SYSCTL_nvdimm_pmem_setup receives four arguments: data_smfn, data_emfn, mgmt_smfn and mgmt_emfn. - data_smfn and data_emfn specify the start and end MFN of a host pmem region that can be used by guest. - If neither mgmt_smfn nor mgmt_emfn is INVALID_MFN, the host pmem pages from mgmt_smfn to mgmt_emfn will be used to store the frametable and M2P table of the pmem region data_smfn ~ data_emfn and itself. data_smfn ~ data_emfn and mgmt_smfn ~ mgmt_emfn should not overlap with each other. - If either mgmt_smfn or mgmt_emfn is INVALID_MFN, Xen hypervisor will store the frametable and M2P table of the pmem region data_smfn ~ data_emfn in the regular RAM. XEN_SYSCTL_nvdimm_pmem_setup currently only works on x86, and returns -ENOSYS on other architectures. Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> --- Cc: Daniel De Graaf <dgdegra@tycho.nsa.gov> Cc: Ian Jackson <ian.jackson@eu.citrix.com> Cc: Wei Liu <wei.liu2@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Andrew Cooper <andrew.cooper3@citrix.com> Changes in v2: * Convert XENPF_pmem_add in v1 to XEN_SYSCTL_nvdimm_pmem_setup in v2. v2 patch series relies on users/admins in Dom0 instead of Dom0 driver to indicate the location to store the frametable and M2P of pmem. * Separate the architecture-independent and -dependent code to pmem_setup and pmem_arch_setup. Currently, only pmem_arch_setup on x86 is implemented, while it returns -ENOSYS on other architectures. * Add XSM check for XEN_SYSCTL_nvdimm_pmem_setup. 
--- tools/flask/policy/modules/dom0.te | 2 +- tools/libxc/include/xenctrl.h | 19 ++++ tools/libxc/xc_misc.c | 17 ++++ tools/misc/xen-ndctl.c | 0 xen/arch/x86/sysctl.c | 22 ++++ xen/arch/x86/x86_64/mm.c | 111 ++++++++++++++++++++ xen/common/pmem.c | 197 ++++++++++++++++++++++++++++++++++++ xen/include/public/sysctl.h | 29 +++++- xen/include/xen/pmem.h | 14 +++ xen/xsm/flask/hooks.c | 4 + xen/xsm/flask/policy/access_vectors | 2 + 11 files changed, 415 insertions(+), 2 deletions(-) create mode 100644 tools/misc/xen-ndctl.c