@@ -408,6 +408,13 @@
*/
#define LIBXL_HAVE_PHYSINFO_CAP_IOMMU_HAP_PT_SHARE 1
+/*
+ * LIBXL_HAVE_BUILDINFO_IOMMU_MEMKB indicates that libxl_domain_build_info
+ * has an iommu_memkb field, which should be set to the amount of memory
+ * overhead needed by the domain for populating IOMMU page tables.
+ */
+#define LIBXL_HAVE_BUILDINFO_IOMMU_MEMKB 1
+
/*
* libxl ABI compatibility
*
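A minimal consumer-side sketch (not part of this patch) of how the #define added above would typically be used: the helper name set_iommu_overhead() and the caller-supplied overhead value are hypothetical; only the field access depends on this patch.

    #include <stdint.h>
    #include <libxl.h>

    static void set_iommu_overhead(libxl_domain_build_info *b_info,
                                   uint64_t overhead_kb)
    {
    #ifdef LIBXL_HAVE_BUILDINFO_IOMMU_MEMKB
        /* New field: account for IOMMU page-table overhead explicitly. */
        b_info->iommu_memkb = overhead_kb;
    #else
        /* Older libxl: no dedicated field, so the overhead is not recorded. */
        (void)overhead_kb;
    #endif
    }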
@@ -461,15 +461,17 @@ int libxl_domain_need_memory(libxl_ctx *ctx,
if (rc) goto out;
*need_memkb = b_info->target_memkb;
+ *need_memkb += b_info->shadow_memkb + b_info->iommu_memkb;
+
switch (b_info->type) {
case LIBXL_DOMAIN_TYPE_PVH:
case LIBXL_DOMAIN_TYPE_HVM:
- *need_memkb += b_info->shadow_memkb + LIBXL_HVM_EXTRA_MEMORY;
+ *need_memkb += LIBXL_HVM_EXTRA_MEMORY;
if (libxl_defbool_val(b_info->device_model_stubdomain))
*need_memkb += 32 * 1024;
break;
case LIBXL_DOMAIN_TYPE_PV:
- *need_memkb += b_info->shadow_memkb + LIBXL_PV_EXTRA_MEMORY;
+ *need_memkb += LIBXL_PV_EXTRA_MEMORY;
break;
default:
rc = ERROR_INVAL;
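Worked example (illustrative numbers, not taken from the patch): for a PVH guest with target_memkb = max_memkb = 4 GiB (4194304 kB) and 4 vCPUs, libxl_get_required_shadow_memory() gives 4 * (256 * 4 + 2 * 4096) = 36864 kB and libxl_get_required_iommu_memory() gives 8216 kB (worked out after the hunk below), so the rearranged code computes

    *need_memkb = 4194304 + 36864 + 8216 + LIBXL_HVM_EXTRA_MEMORY
                = 4239384 kB + LIBXL_HVM_EXTRA_MEMORY

The behavioural point of the hunk is that the shadow and IOMMU reservations are now added for every domain type before the switch, leaving only the per-type extra (plus the 32 MiB stub-domain allowance for HVM) inside it.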
@@ -486,6 +486,7 @@ libxl_domain_build_info = Struct("domain_build_info",[
("target_memkb", MemKB),
("video_memkb", MemKB),
("shadow_memkb", MemKB),
+ ("iommu_memkb", MemKB),
("rtc_timeoffset", uint32),
("exec_ssidref", uint32),
("exec_ssid_label", string),
@@ -48,6 +48,21 @@ unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned
return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
}
+unsigned long libxl_get_required_iommu_memory(unsigned long maxmem_kb)
+{
+ unsigned long iommu_pages = 0, mem_pages = maxmem_kb / 4;
+ unsigned int level;
+
+    /* Assume a 4-level page table with 512 entries per level */
+ for (level = 0; level < 4; level++)
+ {
+ mem_pages = DIV_ROUNDUP(mem_pages, 512);
+ iommu_pages += mem_pages;
+ }
+
+ return iommu_pages * 4;
+}
+
char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid)
{
unsigned int len;
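To make the estimate above concrete, assume a guest with maxmem_kb = 4 GiB (4194304 kB), i.e. mem_pages = 1048576 4k pages:

    level 0: DIV_ROUNDUP(1048576, 512) = 2048 page-table pages
    level 1: DIV_ROUNDUP(2048, 512)    =    4
    level 2: DIV_ROUNDUP(4, 512)       =    1
    level 3: DIV_ROUNDUP(1, 512)       =    1

    iommu_pages = 2054, so the function returns 2054 * 4 = 8216 kB (~8 MiB).

Because the loop assumes 4k mappings at every level, this is an upper bound: an IOMMU that uses superpages, or that shares the CPU P2M, needs fewer (or no) extra pages.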
@@ -24,6 +24,7 @@ const
char *libxl_basename(const char *name); /* returns string from strdup */
unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus);
+unsigned long libxl_get_required_iommu_memory(unsigned long maxmem_kb);
int libxl_name_to_domid(libxl_ctx *ctx, const char *name, uint32_t *domid);
int libxl_domain_qualifier_to_domid(libxl_ctx *ctx, const char *name, uint32_t *domid);
char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid);
@@ -1207,6 +1207,7 @@ void parse_config_data(const char *config_source,
int config_len,
libxl_domain_config *d_config)
{
+ libxl_physinfo physinfo;
const char *buf;
long l, vcpus = 0;
XLU_Config *config;
@@ -1221,10 +1222,22 @@ void parse_config_data(const char *config_source,
int pci_seize = 0;
int i, e;
char *kernel_basename;
+ bool iommu_enabled, iommu_hap_pt_share;
libxl_domain_create_info *c_info = &d_config->c_info;
libxl_domain_build_info *b_info = &d_config->b_info;
+ libxl_physinfo_init(&physinfo);
+ if (libxl_get_physinfo(ctx, &physinfo) != 0) {
+ libxl_physinfo_dispose(&physinfo);
+ fprintf(stderr, "libxl_get_physinfo failed\n");
+ exit(EXIT_FAILURE);
+ }
+
+ iommu_enabled = physinfo.cap_hvm_directio;
+ iommu_hap_pt_share = physinfo.cap_iommu_hap_pt_share;
+ libxl_physinfo_dispose(&physinfo);
+
config= xlu_cfg_init(stderr, config_source);
if (!config) {
fprintf(stderr, "Failed to allocate for configuration\n");
@@ -1448,14 +1461,21 @@ void parse_config_data(const char *config_source,
exit(1);
}
- /* libxl_get_required_shadow_memory() must be called after final values
+ /* libxl_get_required_shadow_memory() and
+ * libxl_get_required_iommu_memory() must be called after final values
* (default or specified) for vcpus and memory are set, because the
- * calculation depends on those values. */
+ * calculations depend on those values. */
b_info->shadow_memkb = !xlu_cfg_get_long(config, "shadow_memory", &l, 0)
? l * 1024
: libxl_get_required_shadow_memory(b_info->max_memkb,
b_info->max_vcpus);
+ /* No IOMMU reservation is needed if either the IOMMU is disabled or it
+ * can share the P2M. */
+ b_info->iommu_memkb = (!iommu_enabled || iommu_hap_pt_share)
+ ? 0
+ : libxl_get_required_iommu_memory(b_info->max_memkb);
+
xlu_cfg_get_defbool(config, "nomigrate", &b_info->disable_migrate, 0);
if (!xlu_cfg_get_long(config, "tsc_mode", &l, 1)) {
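The same reservation logic can be reproduced outside xl. Below is a minimal sketch, assuming the new helper is exported via libxl_utils.h (as the declaration hunk above suggests), that a libxl_ctx already exists, and that b_info->max_memkb holds its final value; the helper name iommu_overhead_kb() is hypothetical.

    #include <stdint.h>
    #include <libxl.h>
    #include <libxl_utils.h>

    static uint64_t iommu_overhead_kb(libxl_ctx *ctx,
                                      const libxl_domain_build_info *b_info)
    {
        libxl_physinfo info;
        uint64_t overhead = 0;

        libxl_physinfo_init(&info);
        if (libxl_get_physinfo(ctx, &info) == 0 &&
            info.cap_hvm_directio &&          /* IOMMU present and enabled */
            !info.cap_iommu_hap_pt_share)     /* ... and it cannot share the P2M */
            overhead = libxl_get_required_iommu_memory(b_info->max_memkb);

        libxl_physinfo_dispose(&info);
        return overhead;
    }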
@@ -1056,7 +1056,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m,
!mfn_eq(lpae_get_mfn(*entry), lpae_get_mfn(orig_pte)) )
p2m_free_entry(p2m, orig_pte, level);
- if ( has_iommu_pt(p2m->domain) &&
+ if ( is_iommu_enabled(p2m->domain) &&
(lpae_is_valid(orig_pte) || lpae_is_valid(*entry)) )
{
unsigned int flush_flags = 0;
@@ -365,7 +365,7 @@ unsigned long __init dom0_compute_nr_pages(
}
need_paging = is_hvm_domain(d) &&
- (!iommu_hap_pt_share || !paging_mode_hap(d));
+ (!iommu_use_hap_pt(d) || !paging_mode_hap(d));
for ( ; ; need_paging = false )
{
nr_pages = get_memsize(&dom0_size, avail);
@@ -783,7 +783,8 @@ HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr, 1,
void memory_type_changed(struct domain *d)
{
- if ( (has_iommu_pt(d) || cache_flush_permitted(d)) && d->vcpu && d->vcpu[0] )
+ if ( (is_iommu_enabled(d) || cache_flush_permitted(d)) &&
+ d->vcpu && d->vcpu[0] )
{
p2m_memory_type_changed(d);
flush_all(FLUSH_CACHE);
@@ -831,7 +832,7 @@ int epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn,
return MTRR_TYPE_UNCACHABLE;
}
- if ( !has_iommu_pt(d) && !cache_flush_permitted(d) )
+ if ( !is_iommu_enabled(d) && !cache_flush_permitted(d) )
{
*ipat = 1;
return MTRR_TYPE_WRBACK;
@@ -1664,7 +1664,7 @@ int mem_sharing_domctl(struct domain *d, struct xen_domctl_mem_sharing_op *mec)
case XEN_DOMCTL_MEM_SHARING_CONTROL:
{
rc = 0;
- if ( unlikely(has_iommu_pt(d) && mec->u.enable) )
+ if ( unlikely(is_iommu_enabled(d) && mec->u.enable) )
rc = -EXDEV;
else
d->arch.hvm.mem_sharing_enabled = mec->u.enable;
@@ -1341,7 +1341,7 @@ int set_identity_p2m_entry(struct domain *d, unsigned long gfn_l,
if ( !paging_mode_translate(p2m->domain) )
{
- if ( !has_iommu_pt(d) )
+ if ( !is_iommu_enabled(d) )
return 0;
return iommu_legacy_map(d, _dfn(gfn_l), _mfn(gfn_l), PAGE_ORDER_4K,
IOMMUF_readable | IOMMUF_writable);
@@ -1432,7 +1432,7 @@ int clear_identity_p2m_entry(struct domain *d, unsigned long gfn_l)
if ( !paging_mode_translate(d) )
{
- if ( !has_iommu_pt(d) )
+ if ( !is_iommu_enabled(d) )
return 0;
return iommu_legacy_unmap(d, _dfn(gfn_l), PAGE_ORDER_4K);
}
@@ -213,7 +213,7 @@ int paging_log_dirty_enable(struct domain *d, bool_t log_global)
{
int ret;
- if ( has_iommu_pt(d) && log_global )
+ if ( is_iommu_enabled(d) && log_global )
{
/*
* Refuse to turn on global log-dirty mode
@@ -1434,7 +1434,7 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
* shared or being kept in sync then newly added memory needs to be
* mapped here.
*/
- if ( has_iommu_pt(hardware_domain) &&
+ if ( is_iommu_enabled(hardware_domain) &&
!iommu_use_hap_pt(hardware_domain) &&
!need_iommu_pt_sync(hardware_domain) )
{
@@ -823,7 +823,7 @@ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,
xatp->gpfn += start;
xatp->size -= start;
- if ( has_iommu_pt(d) )
+ if ( is_iommu_enabled(d) )
this_cpu(iommu_dont_flush_iotlb) = 1;
while ( xatp->size > done )
@@ -844,7 +844,7 @@ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,
}
}
- if ( has_iommu_pt(d) )
+ if ( is_iommu_enabled(d) )
{
int ret;
@@ -630,7 +630,7 @@ int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
/* No paging if iommu is used */
rc = -EMLINK;
- if ( unlikely(has_iommu_pt(d)) )
+ if ( unlikely(is_iommu_enabled(d)) )
break;
rc = -EXDEV;
@@ -40,17 +40,6 @@ int iommu_assign_dt_device(struct domain *d, struct dt_device_node *dev)
if ( !list_empty(&dev->domain_list) )
goto fail;
- /*
- * The hwdom is forced to use IOMMU for protecting assigned
- * device. Therefore the IOMMU data is already set up.
- */
- ASSERT(!is_hardware_domain(d) ||
- hd->status == IOMMU_STATUS_initialized);
-
- rc = iommu_construct(d);
- if ( rc )
- goto fail;
-
/* The flag field doesn't matter to DT device. */
rc = hd->platform_ops->assign_device(d, 0, dt_to_dev(dev), 0);
@@ -157,6 +157,17 @@ static int __init parse_dom0_iommu_param(const char *s)
}
custom_param("dom0-iommu", parse_dom0_iommu_param);
+static void __hwdom_init check_hwdom_reqs(struct domain *d)
+{
+ if ( iommu_hwdom_none || !paging_mode_translate(d) )
+ return;
+
+ arch_iommu_check_autotranslated_hwdom(d);
+
+ iommu_hwdom_passthrough = false;
+ iommu_hwdom_strict = true;
+}
+
int iommu_domain_init(struct domain *d)
{
struct domain_iommu *hd = dom_iommu(d);
@@ -174,129 +185,44 @@ int iommu_domain_init(struct domain *d)
return ret;
hd->platform_ops = iommu_get_ops();
- return hd->platform_ops->init(d);
-}
+ ret = hd->platform_ops->init(d);
+ if ( ret )
+ return ret;
-static void __hwdom_init check_hwdom_reqs(struct domain *d)
-{
- if ( iommu_hwdom_none || !paging_mode_translate(d) )
- return;
+ if ( is_hardware_domain(d) )
+ check_hwdom_reqs(d); /* may modify iommu_hwdom_strict */
- arch_iommu_check_autotranslated_hwdom(d);
+ /*
+ * NB: 'relaxed' h/w domains don't need the IOMMU mappings to be kept
+ * in-sync with their assigned pages because all host RAM will be
+ * mapped during hwdom_init().
+ */
+ if ( !is_hardware_domain(d) || iommu_hwdom_strict )
+ hd->need_sync = !iommu_use_hap_pt(d);
- iommu_hwdom_passthrough = false;
- iommu_hwdom_strict = true;
+ return 0;
}
void __hwdom_init iommu_hwdom_init(struct domain *d)
{
struct domain_iommu *hd = dom_iommu(d);
- check_hwdom_reqs(d);
-
if ( !is_iommu_enabled(d) )
return;
register_keyhandler('o', &iommu_dump_p2m_table, "dump iommu p2m table", 0);
- hd->status = IOMMU_STATUS_initializing;
- /*
- * NB: relaxed hw domains don't need sync because all ram is already
- * mapped in the iommu page tables.
- */
- hd->need_sync = iommu_hwdom_strict && !iommu_use_hap_pt(d);
- if ( need_iommu_pt_sync(d) )
- {
- struct page_info *page;
- unsigned int i = 0, flush_flags = 0;
- int rc = 0;
-
- page_list_for_each ( page, &d->page_list )
- {
- unsigned long mfn = mfn_x(page_to_mfn(page));
- unsigned long dfn = mfn_to_gmfn(d, mfn);
- unsigned int mapping = IOMMUF_readable;
- int ret;
-
- if ( ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
- ((page->u.inuse.type_info & PGT_type_mask)
- == PGT_writable_page) )
- mapping |= IOMMUF_writable;
-
- ret = iommu_map(d, _dfn(dfn), _mfn(mfn), 0, mapping,
- &flush_flags);
-
- if ( !rc )
- rc = ret;
-
- if ( !(i++ & 0xfffff) )
- process_pending_softirqs();
- }
-
- /* Use while-break to avoid compiler warning */
- while ( iommu_iotlb_flush_all(d, flush_flags) )
- break;
-
- if ( rc )
- printk(XENLOG_WARNING "d%d: IOMMU mapping failed: %d\n",
- d->domain_id, rc);
- }
-
hd->platform_ops->hwdom_init(d);
-
- hd->status = IOMMU_STATUS_initialized;
}
-void iommu_teardown(struct domain *d)
+static void iommu_teardown(struct domain *d)
{
struct domain_iommu *hd = dom_iommu(d);
- hd->status = IOMMU_STATUS_disabled;
hd->platform_ops->teardown(d);
tasklet_schedule(&iommu_pt_cleanup_tasklet);
}
-int iommu_construct(struct domain *d)
-{
- struct domain_iommu *hd = dom_iommu(d);
-
- if ( hd->status == IOMMU_STATUS_initialized )
- return 0;
-
- hd->status = IOMMU_STATUS_initializing;
-
- if ( !iommu_use_hap_pt(d) )
- {
- int rc;
-
- hd->need_sync = true;
-
- rc = arch_iommu_populate_page_table(d);
- if ( rc )
- {
- if ( rc != -ERESTART )
- {
- hd->need_sync = false;
- hd->status = IOMMU_STATUS_disabled;
- }
-
- return rc;
- }
- }
-
- hd->status = IOMMU_STATUS_initialized;
-
- /*
- * There may be dirty cache lines when a device is assigned
- * and before has_iommu_pt(d) becoming true, this will cause
- * memory_type_changed lose effect if memory type changes.
- * Call memory_type_changed here to amend this.
- */
- memory_type_changed(d);
-
- return 0;
-}
-
void iommu_domain_destroy(struct domain *d)
{
if ( !is_iommu_enabled(d) )
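A stand-alone restatement of the need_sync policy that the reworked iommu_domain_init() above establishes; the boolean parameters are hypothetical stand-ins for the hypervisor's real predicates (is_iommu_enabled(), iommu_use_hap_pt(), is_hardware_domain(), iommu_hwdom_strict), so this models the decision rather than reproducing hypervisor code.

    #include <stdbool.h>

    bool model_need_sync(bool iommu_enabled, bool use_hap_pt,
                         bool is_hwdom, bool hwdom_strict)
    {
        if (!iommu_enabled)
            return false;           /* no IOMMU: nothing to sync */
        if (is_hwdom && !hwdom_strict)
            return false;           /* 'relaxed' hwdom: host RAM mapped up front */
        return !use_hap_pt;         /* sync unless the P2M is shared */
    }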
@@ -594,11 +520,8 @@ int iommu_do_domctl(
void iommu_share_p2m_table(struct domain* d)
{
ASSERT(hap_enabled(d));
- /*
- * iommu_use_hap_pt(d) cannot be used here because during domain
- * construction has_iommu_pt(d) will always return false here.
- */
- if ( is_iommu_enabled(d) && iommu_hap_pt_share )
+
+ if ( iommu_use_hap_pt(d) )
iommu_get_ops()->share_p2m(d);
}
@@ -645,8 +568,7 @@ static void iommu_dump_p2m_table(unsigned char key)
ops = iommu_get_ops();
for_each_domain(d)
{
- if ( is_hardware_domain(d) ||
- dom_iommu(d)->status < IOMMU_STATUS_initialized )
+ if ( is_hardware_domain(d) || !is_iommu_enabled(d) )
continue;
if ( iommu_use_hap_pt(d) )
@@ -933,9 +933,6 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus,
pdev->fault.count = 0;
- if ( !has_arch_pdevs(d) && has_iommu_pt(d) )
- iommu_teardown(d);
-
return ret;
}
@@ -1484,13 +1481,6 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
if ( !pcidevs_trylock() )
return -ERESTART;
- rc = iommu_construct(d);
- if ( rc )
- {
- pcidevs_unlock();
- return rc;
- }
-
pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
if ( !pdev )
{
@@ -1519,8 +1509,6 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
}
done:
- if ( !has_arch_pdevs(d) && has_iommu_pt(d) )
- iommu_teardown(d);
pcidevs_unlock();
return rc;
@@ -1721,15 +1721,7 @@ static void iommu_domain_teardown(struct domain *d)
ASSERT(is_iommu_enabled(d));
- /*
- * We can't use iommu_use_hap_pt here because either IOMMU state
- * is already changed to IOMMU_STATUS_disabled at this point or
- * has always been IOMMU_STATUS_disabled.
- *
- * We also need to test if HAP is enabled because PV guests can
- * enter this path too.
- */
- if ( hap_enabled(d) && iommu_hap_pt_share )
+ if ( iommu_use_hap_pt(d) )
return;
spin_lock(&hd->arch.mapping_lock);
@@ -81,103 +81,6 @@ int __init iommu_setup_hpet_msi(struct msi_desc *msi)
return ops->setup_hpet_msi ? ops->setup_hpet_msi(msi) : -ENODEV;
}
-int arch_iommu_populate_page_table(struct domain *d)
-{
- struct page_info *page;
- int rc = 0, n = 0;
-
- spin_lock(&d->page_alloc_lock);
-
- if ( unlikely(d->is_dying) )
- rc = -ESRCH;
-
- while ( !rc && (page = page_list_remove_head(&d->page_list)) )
- {
- if ( is_hvm_domain(d) ||
- (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
- {
- mfn_t mfn = page_to_mfn(page);
- gfn_t gfn = mfn_to_gfn(d, mfn);
- unsigned int flush_flags = 0;
-
- if ( !gfn_eq(gfn, INVALID_GFN) )
- {
- dfn_t dfn = _dfn(gfn_x(gfn));
-
- ASSERT(!(gfn_x(gfn) >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
- BUG_ON(SHARED_M2P(gfn_x(gfn)));
- rc = iommu_map(d, dfn, mfn, PAGE_ORDER_4K,
- IOMMUF_readable | IOMMUF_writable,
- &flush_flags);
-
- /*
- * We may be working behind the back of a running guest, which
- * may change the type of a page at any time. We can't prevent
- * this (for instance, by bumping the type count while mapping
- * the page) without causing legitimate guest type-change
- * operations to fail. So after adding the page to the IOMMU,
- * check again to make sure this is still valid. NB that the
- * writable entry in the iommu is harmless until later, when
- * the actual device gets assigned.
- */
- if ( !rc && !is_hvm_domain(d) &&
- ((page->u.inuse.type_info & PGT_type_mask) !=
- PGT_writable_page) )
- {
- rc = iommu_unmap(d, dfn, PAGE_ORDER_4K, &flush_flags);
- /* If the type changed yet again, simply force a retry. */
- if ( !rc && ((page->u.inuse.type_info & PGT_type_mask) ==
- PGT_writable_page) )
- rc = -ERESTART;
- }
- }
- if ( rc )
- {
- page_list_add(page, &d->page_list);
- break;
- }
- }
- page_list_add_tail(page, &d->arch.relmem_list);
- if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
- hypercall_preempt_check() )
- rc = -ERESTART;
- }
-
- if ( !rc )
- {
- /*
- * The expectation here is that generally there are many normal pages
- * on relmem_list (the ones we put there) and only few being in an
- * offline/broken state. The latter ones are always at the head of the
- * list. Hence we first move the whole list, and then move back the
- * first few entries.
- */
- page_list_move(&d->page_list, &d->arch.relmem_list);
- while ( !page_list_empty(&d->page_list) &&
- (page = page_list_first(&d->page_list),
- (page->count_info & (PGC_state|PGC_broken))) )
- {
- page_list_del(page, &d->page_list);
- page_list_add_tail(page, &d->arch.relmem_list);
- }
- }
-
- spin_unlock(&d->page_alloc_lock);
-
- if ( !rc )
- /*
- * flush_flags are not tracked across hypercall pre-emption so
- * assume a full flush is necessary.
- */
- rc = iommu_iotlb_flush_all(
- d, IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified);
-
- if ( rc && rc != -ERESTART )
- iommu_teardown(d);
-
- return rc;
-}
-
void __hwdom_init arch_iommu_check_autotranslated_hwdom(struct domain *d)
{
if ( !is_iommu_enabled(d) )
@@ -21,7 +21,7 @@ struct arch_iommu
};
/* Always share P2M Table between the CPU and the IOMMU */
-#define iommu_use_hap_pt(d) (has_iommu_pt(d))
+#define iommu_use_hap_pt(d) is_iommu_enabled(d)
const struct iommu_ops *iommu_get_ops(void);
void iommu_set_ops(const struct iommu_ops *ops);
@@ -88,7 +88,7 @@ extern const struct iommu_init_ops *iommu_init_ops;
/* Are we using the domain P2M table as its IOMMU pagetable? */
#define iommu_use_hap_pt(d) \
- (hap_enabled(d) && has_iommu_pt(d) && iommu_hap_pt_share)
+ (hap_enabled(d) && is_iommu_enabled(d) && iommu_hap_pt_share)
void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value);
unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg);
@@ -79,15 +79,9 @@ void iommu_domain_destroy(struct domain *d);
void arch_iommu_domain_destroy(struct domain *d);
int arch_iommu_domain_init(struct domain *d);
-int arch_iommu_populate_page_table(struct domain *d);
void arch_iommu_check_autotranslated_hwdom(struct domain *d);
void arch_iommu_hwdom_init(struct domain *d);
-int iommu_construct(struct domain *d);
-
-/* Function used internally, use iommu_domain_destroy */
-void iommu_teardown(struct domain *d);
-
/*
* The following flags are passed to map operations and passed by lookup
* operations.
@@ -254,13 +248,6 @@ struct iommu_ops {
# define iommu_vcall iommu_call
#endif
-enum iommu_status
-{
- IOMMU_STATUS_disabled,
- IOMMU_STATUS_initializing,
- IOMMU_STATUS_initialized
-};
-
struct domain_iommu {
struct arch_iommu arch;
@@ -280,9 +267,6 @@ struct domain_iommu {
/* Features supported by the IOMMU */
DECLARE_BITMAP(features, IOMMU_FEAT_count);
- /* Status of guest IOMMU mappings */
- enum iommu_status status;
-
/*
* Does the guest reqire mappings to be synchonized, to maintain
* the default dfn == pfn map. (See comment on dfn at the top of
@@ -966,10 +966,8 @@ static inline bool is_hwdom_pinned_vcpu(const struct vcpu *v)
}
#ifdef CONFIG_HAS_PASSTHROUGH
-#define has_iommu_pt(d) (dom_iommu(d)->status != IOMMU_STATUS_disabled)
#define need_iommu_pt_sync(d) (dom_iommu(d)->need_sync)
#else
-#define has_iommu_pt(d) false
#define need_iommu_pt_sync(d) false
#endif