Message ID | 20240507142600.23844-6-zong.li@sifive.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | RISC-V IOMMU HPM and nested IOMMU support | expand |
Context | Check | Description |
---|---|---|
conchuod/vmtest-fixes-PR | fail | merge-conflict |
On Tue, May 07, 2024 at 10:25:59PM +0800, Zong Li wrote: > This patch implements .domain_alloc_user operation for creating domains > owend by userspace, e.g. through IOMMUFD. Add s2 domain for parent > domain for second stage, s1 domain will be the first stage. > > Don't remove IOMMU private data of dev in blocked domain, because it > holds the user data of device, which is used when attaching device into > s1 domain. > > Signed-off-by: Zong Li <zong.li@sifive.com> > --- > drivers/iommu/riscv/iommu.c | 227 ++++++++++++++++++++++++++++++++++- > include/uapi/linux/iommufd.h | 17 +++ > 2 files changed, 243 insertions(+), 1 deletion(-) > > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c > index 072251f6ad85..7eda850df475 100644 > --- a/drivers/iommu/riscv/iommu.c > +++ b/drivers/iommu/riscv/iommu.c > @@ -827,6 +827,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, > > /* This struct contains protection domain specific IOMMU driver data. */ > struct riscv_iommu_domain { > + struct riscv_iommu_domain *s2; > struct iommu_domain domain; > struct list_head bonds; > spinlock_t lock; /* protect bonds list updates. */ > @@ -844,6 +845,7 @@ struct riscv_iommu_domain { > /* Private IOMMU data for managed devices, dev_iommu_priv_* */ > struct riscv_iommu_info { > struct riscv_iommu_domain *domain; > + struct riscv_iommu_dc dc_user; > }; > > /* Linkage between an iommu_domain and attached devices. */ > @@ -1454,7 +1456,6 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, > > riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0, 0); > riscv_iommu_bond_unlink(info->domain, dev); > - info->domain = NULL; Nope, whatever is going on here, this can't be correct. Once a domain is detached the driver must drop all references to it immediately. 
> return 0; > } > @@ -1486,6 +1487,229 @@ static struct iommu_domain riscv_iommu_identity_domain = { > } > }; > > +/** > + * Nested IOMMU operations > + */ > + > +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev) > +{ > + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain); > + struct riscv_iommu_device *iommu = dev_to_iommu(dev); > + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); > + > + if (riscv_domain->numa_node == NUMA_NO_NODE) > + riscv_domain->numa_node = dev_to_node(iommu->dev); Why? The kernel doesn't do any memory allocation for nested domains, does it? > + riscv_iommu_bond_unlink(info->domain, dev); > + > + if (riscv_iommu_bond_link(riscv_domain, dev)) > + return -ENOMEM; > + > + riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX); Why? The cache tags should be in good shape before they are assigned to the device. Wiping it on every attach is just needless flushing that isn't useful. > + riscv_iommu_iodir_update(iommu, dev, info->dc_user.fsc, info->dc_user.ta, > + info->dc_user.iohgatp); This isn't ideal. Ideally the driver should make changing from S2 to S2+S1 and back to be hitless. Wiping the valid bit as part of the programming is not good. As I said before, this driver probably needs the programming sequencer from ARM. > + info->domain = riscv_domain; > + > + return 0; > +} > + > +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain) > +{ > + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain); > + > + kfree(riscv_domain); > +} Doesn't the driver already have this function? 
> +static struct iommu_domain * > +riscv_iommu_domain_alloc_nested(struct device *dev, > + struct iommu_domain *parent, > + const struct iommu_user_data *user_data) > +{ > + struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent); > + struct riscv_iommu_domain *s1_domain; > + struct riscv_iommu_device *iommu = dev_to_iommu(dev); > + struct iommu_hwpt_riscv_iommu arg; > + int ret, va_bits; > + > + if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU) > + return ERR_PTR(-EOPNOTSUPP); > + > + if (parent->type != IOMMU_DOMAIN_UNMANAGED) > + return ERR_PTR(-EINVAL); > + > + ret = iommu_copy_struct_from_user(&arg, > + user_data, > + IOMMU_HWPT_DATA_RISCV_IOMMU, > + out_event_uptr); > + if (ret) > + return ERR_PTR(ret); > + > + s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL); > + if (!s1_domain) > + return ERR_PTR(-ENOMEM); > + > + spin_lock_init(&s1_domain->lock); > + INIT_LIST_HEAD_RCU(&s1_domain->bonds); > + > + s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, > + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); > + if (s1_domain->pscid < 0) { > + iommu_free_page(s1_domain->pgd_root); > + kfree(s1_domain); > + return ERR_PTR(-ENOMEM); > + } > + > + /* Get device context of stage-1 from user*/ > + ret = riscv_iommu_get_dc_user(dev, &arg); > + if (ret) { > + kfree(s1_domain); > + return ERR_PTR(-EINVAL); > + } > + > + if (!iommu) { > + va_bits = VA_BITS; > + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV57) { > + va_bits = 57; > + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV48) { > + va_bits = 48; > + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV39) { > + va_bits = 39; > + } else { > + dev_err(dev, "cannot find supported page table mode\n"); > + return ERR_PTR(-ENODEV); > + } > + > + /* > + * The ops->domain_alloc_user could be directly called by the iommufd core, > + * instead of iommu core. 
So, this function need to set the default value of > + * following data member: > + * - domain->pgsize_bitmap > + * - domain->geometry > + * - domain->type > + * - domain->ops > + */ > + s1_domain->s2 = s2_domain; > + s1_domain->domain.type = IOMMU_DOMAIN_NESTED; > + s1_domain->domain.ops = &riscv_iommu_nested_domain_ops; > + s1_domain->domain.pgsize_bitmap = SZ_4K; > + s1_domain->domain.geometry.aperture_start = 0; > + s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1); > + s1_domain->domain.geometry.force_aperture = true; nested domains don't support paging/map so they don't use any of this. Don't set it. Will remove the confusing va_bit stuff. > +static struct iommu_domain * > +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags, > + struct iommu_domain *parent, > + const struct iommu_user_data *user_data) > +{ > + struct iommu_domain *domain; > + struct riscv_iommu_domain *riscv_domain; > + > + /* Allocate stage-1 domain if it has stage-2 parent domain */ > + if (parent) > + return riscv_iommu_domain_alloc_nested(dev, parent, user_data); > + > + if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) > + return ERR_PTR(-EOPNOTSUPP); This if should be moved up to the top and this driver does not support diry_tracking, so don't test it. > + if (user_data) > + return ERR_PTR(-EINVAL); Really? Don't you need to ask for the s2? > + /* domain_alloc_user op needs to be fully initialized */ > + domain = iommu_domain_alloc(dev->bus); Please no, structure your driver properly so you can call only internal functions. This was a hack for intel only. > + if (!domain) > + return ERR_PTR(-ENOMEM); > + > + /* > + * We assume that nest-parent or g-stage only will come here No... That isn't right. Normal non-virtualization paging domains come here too. The default of this function, with an empty user_data, should be to create the same domain as alloc_domain_paging. > + * TODO: Shadow page table doesn't be supported now. 
> + * We currently can't distinguish g-stage and shadow > + * page table here. Shadow page table shouldn't be > + * put at stage-2. > + */ > + riscv_domain = iommu_domain_to_riscv(domain); What is a shadow page table? > + /* pgd_root may be allocated in .domain_alloc_paging */ > + if (riscv_domain->pgd_root) > + iommu_free_page(riscv_domain->pgd_root); And this is gross, structure your driver right so you don't have to undo what prior functions did. > + riscv_domain->pgd_root = iommu_alloc_pages_node(riscv_domain->numa_node, > + GFP_KERNEL_ACCOUNT, > + 2); > + if (!riscv_domain->pgd_root) > + return ERR_PTR(-ENOMEM); > + > + riscv_domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1, > + RISCV_IOMMU_MAX_GSCID, GFP_KERNEL); > + if (riscv_domain->gscid < 0) { > + iommu_free_pages(riscv_domain->pgd_root, 2); > + kfree(riscv_domain); > + return ERR_PTR(-ENOMEM); > + } > + > + return domain; > +} > + > static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type) > { > struct riscv_iommu_device *iommu = dev_to_iommu(dev); > @@ -1587,6 +1811,7 @@ static const struct iommu_ops riscv_iommu_ops = { > .blocked_domain = &riscv_iommu_blocking_domain, > .release_domain = &riscv_iommu_blocking_domain, > .domain_alloc_paging = riscv_iommu_alloc_paging_domain, > + .domain_alloc_user = riscv_iommu_domain_alloc_user, > .def_domain_type = riscv_iommu_device_domain_type, > .device_group = riscv_iommu_device_group, > .probe_device = riscv_iommu_probe_device, > diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h > index ec9aafd7d373..e10b6e236647 100644 > --- a/include/uapi/linux/iommufd.h > +++ b/include/uapi/linux/iommufd.h > @@ -390,14 +390,31 @@ struct iommu_hwpt_vtd_s1 { > __u32 __reserved; > }; > > +/** > + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table > + * info (IOMMU_HWPT_TYPE_RISCV_IOMMU) > + * @dc_len: Length of device context > + * @dc_uptr: User pointer to the address of device context > + * @event_len: 
Length of an event record > + * @out_event_uptr: User pointer to the address of event record > + */ > +struct iommu_hwpt_riscv_iommu { > + __aligned_u64 dc_len; > + __aligned_u64 dc_uptr; So far other drivers have been inlining their "device context" in this struct, why do you need a pointer and length? > + __aligned_u64 event_len; > + __aligned_u64 out_event_uptr; Weird? Why? Jason
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 072251f6ad85..7eda850df475 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -827,6 +827,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, /* This struct contains protection domain specific IOMMU driver data. */ struct riscv_iommu_domain { + struct riscv_iommu_domain *s2; struct iommu_domain domain; struct list_head bonds; spinlock_t lock; /* protect bonds list updates. */ @@ -844,6 +845,7 @@ struct riscv_iommu_domain { /* Private IOMMU data for managed devices, dev_iommu_priv_* */ struct riscv_iommu_info { struct riscv_iommu_domain *domain; + struct riscv_iommu_dc dc_user; }; /* Linkage between an iommu_domain and attached devices. */ @@ -1454,7 +1456,6 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0, 0); riscv_iommu_bond_unlink(info->domain, dev); - info->domain = NULL; return 0; } @@ -1486,6 +1487,229 @@ static struct iommu_domain riscv_iommu_identity_domain = { } }; +/** + * Nested IOMMU operations + */ + +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev) +{ + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain); + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + if (riscv_domain->numa_node == NUMA_NO_NODE) + riscv_domain->numa_node = dev_to_node(iommu->dev); + + riscv_iommu_bond_unlink(info->domain, dev); + + if (riscv_iommu_bond_link(riscv_domain, dev)) + return -ENOMEM; + + riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX); + + riscv_iommu_iodir_update(iommu, dev, info->dc_user.fsc, info->dc_user.ta, + info->dc_user.iohgatp); + + info->domain = riscv_domain; + + return 0; +} + +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain) +{ + struct riscv_iommu_domain *riscv_domain = 
iommu_domain_to_riscv(domain); + + kfree(riscv_domain); +} + +static const struct iommu_domain_ops riscv_iommu_nested_domain_ops = { + .attach_dev = riscv_iommu_attach_dev_nested, + .free = riscv_iommu_domain_free_nested, +}; + +static int +riscv_iommu_get_dc_user(struct device *dev, struct iommu_hwpt_riscv_iommu *user_arg) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + struct riscv_iommu_dc dc; + struct riscv_iommu_fq_record event; + u64 dc_len = sizeof(struct riscv_iommu_dc) >> (!(iommu->caps & RISCV_IOMMU_CAP_MSI_FLAT)); + u64 event_len = sizeof(struct riscv_iommu_fq_record); + void __user *event_user = NULL; + + for (int i = 0; i < fwspec->num_ids; i++) { + event.hdr = + FIELD_PREP(RISCV_IOMMU_FQ_HDR_CAUSE, RISCV_IOMMU_FQ_CAUSE_DDT_INVALID) | + FIELD_PREP(RISCV_IOMMU_FQ_HDR_DID, fwspec->ids[i]); + + /* Sanity check DC of stage-1 from user data */ + if (!user_arg->out_event_uptr || user_arg->event_len != event_len) + return -EINVAL; + + event_user = u64_to_user_ptr(user_arg->out_event_uptr); + + if (!user_arg->dc_uptr || user_arg->dc_len != dc_len) + return -EINVAL; + + if (copy_from_user(&dc, u64_to_user_ptr(user_arg->dc_uptr), dc_len)) + return -EFAULT; + + if (!(dc.tc & RISCV_IOMMU_DDTE_VALID)) { + dev_dbg(dev, "Invalid DDT from user data\n"); + if (copy_to_user(event_user, &event, event_len)) + return -EFAULT; + } + + if (!dc.fsc || dc.iohgatp) { + dev_dbg(dev, "Wrong page table from user data\n"); + if (copy_to_user(event_user, &event, event_len)) + return -EFAULT; + } + + /* Save DC of stage-1 from user data */ + memcpy(&info->dc_user, + riscv_iommu_get_dc(iommu, fwspec->ids[i]), + sizeof(struct riscv_iommu_dc)); + info->dc_user.fsc = dc.fsc; + } + + return 0; +} + +static struct iommu_domain * +riscv_iommu_domain_alloc_nested(struct device *dev, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + 
struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent); + struct riscv_iommu_domain *s1_domain; + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct iommu_hwpt_riscv_iommu arg; + int ret, va_bits; + + if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU) + return ERR_PTR(-EOPNOTSUPP); + + if (parent->type != IOMMU_DOMAIN_UNMANAGED) + return ERR_PTR(-EINVAL); + + ret = iommu_copy_struct_from_user(&arg, + user_data, + IOMMU_HWPT_DATA_RISCV_IOMMU, + out_event_uptr); + if (ret) + return ERR_PTR(ret); + + s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL); + if (!s1_domain) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&s1_domain->lock); + INIT_LIST_HEAD_RCU(&s1_domain->bonds); + + s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); + if (s1_domain->pscid < 0) { + iommu_free_page(s1_domain->pgd_root); + kfree(s1_domain); + return ERR_PTR(-ENOMEM); + } + + /* Get device context of stage-1 from user*/ + ret = riscv_iommu_get_dc_user(dev, &arg); + if (ret) { + kfree(s1_domain); + return ERR_PTR(-EINVAL); + } + + if (!iommu) { + va_bits = VA_BITS; + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV57) { + va_bits = 57; + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV48) { + va_bits = 48; + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV39) { + va_bits = 39; + } else { + dev_err(dev, "cannot find supported page table mode\n"); + return ERR_PTR(-ENODEV); + } + + /* + * The ops->domain_alloc_user could be directly called by the iommufd core, + * instead of iommu core. 
So, this function need to set the default value of + * following data member: + * - domain->pgsize_bitmap + * - domain->geometry + * - domain->type + * - domain->ops + */ + s1_domain->s2 = s2_domain; + s1_domain->domain.type = IOMMU_DOMAIN_NESTED; + s1_domain->domain.ops = &riscv_iommu_nested_domain_ops; + s1_domain->domain.pgsize_bitmap = SZ_4K; + s1_domain->domain.geometry.aperture_start = 0; + s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1); + s1_domain->domain.geometry.force_aperture = true; + + return &s1_domain->domain; +} + +static struct iommu_domain * +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct iommu_domain *domain; + struct riscv_iommu_domain *riscv_domain; + + /* Allocate stage-1 domain if it has stage-2 parent domain */ + if (parent) + return riscv_iommu_domain_alloc_nested(dev, parent, user_data); + + if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + + if (user_data) + return ERR_PTR(-EINVAL); + + /* domain_alloc_user op needs to be fully initialized */ + domain = iommu_domain_alloc(dev->bus); + if (!domain) + return ERR_PTR(-ENOMEM); + + /* + * We assume that nest-parent or g-stage only will come here + * TODO: Shadow page table doesn't be supported now. + * We currently can't distinguish g-stage and shadow + * page table here. Shadow page table shouldn't be + * put at stage-2. 
+ */ + riscv_domain = iommu_domain_to_riscv(domain); + + /* pgd_root may be allocated in .domain_alloc_paging */ + if (riscv_domain->pgd_root) + iommu_free_page(riscv_domain->pgd_root); + + riscv_domain->pgd_root = iommu_alloc_pages_node(riscv_domain->numa_node, + GFP_KERNEL_ACCOUNT, + 2); + if (!riscv_domain->pgd_root) + return ERR_PTR(-ENOMEM); + + riscv_domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1, + RISCV_IOMMU_MAX_GSCID, GFP_KERNEL); + if (riscv_domain->gscid < 0) { + iommu_free_pages(riscv_domain->pgd_root, 2); + kfree(riscv_domain); + return ERR_PTR(-ENOMEM); + } + + return domain; +} + static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); @@ -1587,6 +1811,7 @@ static const struct iommu_ops riscv_iommu_ops = { .blocked_domain = &riscv_iommu_blocking_domain, .release_domain = &riscv_iommu_blocking_domain, .domain_alloc_paging = riscv_iommu_alloc_paging_domain, + .domain_alloc_user = riscv_iommu_domain_alloc_user, .def_domain_type = riscv_iommu_device_domain_type, .device_group = riscv_iommu_device_group, .probe_device = riscv_iommu_probe_device, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index ec9aafd7d373..e10b6e236647 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -390,14 +390,31 @@ struct iommu_hwpt_vtd_s1 { __u32 __reserved; }; +/** + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table + * info (IOMMU_HWPT_TYPE_RISCV_IOMMU) + * @dc_len: Length of device context + * @dc_uptr: User pointer to the address of device context + * @event_len: Length of an event record + * @out_event_uptr: User pointer to the address of event record + */ +struct iommu_hwpt_riscv_iommu { + __aligned_u64 dc_len; + __aligned_u64 dc_uptr; + __aligned_u64 event_len; + __aligned_u64 out_event_uptr; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * 
@IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_RISCV_IOMMU: RISC-V IOMMU device context table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE, IOMMU_HWPT_DATA_VTD_S1, + IOMMU_HWPT_DATA_RISCV_IOMMU, }; /**
This patch implements the .domain_alloc_user operation for creating domains owned by userspace, e.g. through IOMMUFD. Add the s2 domain as the parent domain for the second stage; the s1 domain will be the first stage. Don't remove the IOMMU private data of the device in the blocked domain, because it holds the user data of the device, which is used when attaching the device to the s1 domain. Signed-off-by: Zong Li <zong.li@sifive.com> --- drivers/iommu/riscv/iommu.c | 227 ++++++++++++++++++++++++++++++++++- include/uapi/linux/iommufd.h | 17 +++ 2 files changed, 243 insertions(+), 1 deletion(-)