Message ID | 1571919983-3231-4-git-send-email-yi.l.liu@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vfio: support Shared Virtual Addressing | expand |
On Thu, 24 Oct 2019 08:26:23 -0400 Liu Yi L <yi.l.liu@intel.com> wrote: > This patch adds vfio support to bind guest translation structure > to host iommu. VFIO exposes iommu programming capability to user- > space. Guest is a user-space application in host under KVM solution. > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > structure. And this part should be passdown to host to enable nested > translation (or say two stage translation). This patch reuses the > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > bind type for binding guest owned translation structure to host. > > *) Add two new ioctls for VFIO containers. > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > bind a process to a pasid or bind a guest pasid > to a device, this is indicated by type > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > unbind a process to a pasid or unbind a guest pasid > to a device, also indicated by type > - Bind type: > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a process > to a device > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > structure to host iommu. e.g. guest page table > > *) Code logic in vfio_iommu_type1_ioctl() to handle VFIO_IOMMU_BIND/UNBIND > > Cc: Kevin Tian <kevin.tian@intel.com> > Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > --- > drivers/vfio/vfio_iommu_type1.c | 136 ++++++++++++++++++++++++++++++++++++++++ > include/uapi/linux/vfio.h | 44 +++++++++++++ > 2 files changed, 180 insertions(+) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 3d73a7d..1a27e25 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -2325,6 +2325,104 @@ static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu, > return ret; > } > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) > +{ > + struct domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_gpasid_bind_data *ustruct = > + (struct iommu_gpasid_bind_data *) dc->data; > + > + return iommu_sva_bind_gpasid(dc->domain, dev, ustruct); > +} > + > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > +{ > + struct domain_capsule *dc = (struct domain_capsule *)data; > + struct iommu_gpasid_bind_data *ustruct = > + (struct iommu_gpasid_bind_data *) dc->data; > + > + return iommu_sva_unbind_gpasid(dc->domain, dev, > + ustruct->hpasid); > +} > + > +/* > + * unbind specific gpasid, caller of this function requires hold > + * vfio_iommu->lock > + */ > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, > + struct iommu_gpasid_bind_data *gbind_data) > +{ > + return vfio_iommu_lookup_dev(iommu, vfio_unbind_gpasid_fn, gbind_data); > +} > + > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > + void __user *arg, > + struct vfio_iommu_type1_bind *bind) > +{ > + struct iommu_gpasid_bind_data gbind_data; > + unsigned long minsz; > + int ret = 0; > + > + minsz = sizeof(*bind) + sizeof(gbind_data); > + if (bind->argsz < minsz) > + return -EINVAL; > + > + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) > + return -EFAULT; > + > + mutex_lock(&iommu->lock); > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + ret = vfio_iommu_lookup_dev(iommu, vfio_bind_gpasid_fn, &gbind_data); > + /* > + * If bind failed, it may not be a total failure. Some devices within > + * the iommu group may have bind successfully. Although we don't enable > + * pasid capability for non-singletion iommu groups, a unbind operation > + * would be helpful to ensure no partial binding for an iommu group. > + */ > + if (ret) > + /* > + * Undo all binds that already succeeded, no need to check the > + * return value here since some device within the group has no > + * successful bind when coming to this place switch. > + */ > + vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); > + > +out_unlock: > + mutex_unlock(&iommu->lock); > + return ret; > +} > + > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > + void __user *arg, > + struct vfio_iommu_type1_bind *bind) > +{ > + struct iommu_gpasid_bind_data gbind_data; > + unsigned long minsz; > + int ret = 0; > + > + minsz = sizeof(*bind) + sizeof(gbind_data); > + if (bind->argsz < minsz) > + return -EINVAL; But gbind_data can change size if new vendor specific data is added to the union, so kernel updates break existing userspace. Fail. > + > + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) > + return -EFAULT; > + > + mutex_lock(&iommu->lock); > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + ret = vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); > + > +out_unlock: > + mutex_unlock(&iommu->lock); > + return ret; > +} > + > static long vfio_iommu_type1_ioctl(void *iommu_data, > unsigned int cmd, unsigned long arg) > { > @@ -2484,6 +2582,44 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, > default: > return -EINVAL; > } > + > + } else if (cmd == VFIO_IOMMU_BIND) { > + struct vfio_iommu_type1_bind bind; > + > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz) > + return -EINVAL; > + > + switch (bind.bind_type) { > + case VFIO_IOMMU_BIND_GUEST_PASID: > + return vfio_iommu_type1_bind_gpasid(iommu, > + (void __user *)(arg + minsz), &bind); Why are we defining BIND_PROCESS if it's not supported? How does the user learn it's not supported? > + default: > + return -EINVAL; > + } > + > + } else if (cmd == VFIO_IOMMU_UNBIND) { > + struct vfio_iommu_type1_bind bind; > + > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz) > + return -EINVAL; > + > + switch (bind.bind_type) { > + case VFIO_IOMMU_BIND_GUEST_PASID: > + return vfio_iommu_type1_unbind_gpasid(iommu, > + (void __user *)(arg + minsz), &bind); > + default: > + return -EINVAL; > + } > } > > return -ENOTTY; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 04de290..78e8c64 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -832,6 +832,50 @@ struct vfio_iommu_type1_pasid_request { > */ > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 27) > > +enum vfio_iommu_bind_type { > + VFIO_IOMMU_BIND_PROCESS, > + VFIO_IOMMU_BIND_GUEST_PASID, > +}; > + > +/* > + * Supported types: > + * - VFIO_IOMMU_BIND_GUEST_PASID: bind guest pasid, which invoked > + * by guest, it takes iommu_gpasid_bind_data in data. > + */ > +struct vfio_iommu_type1_bind { > + __u32 argsz; > + enum vfio_iommu_bind_type bind_type; > + __u8 data[]; > +}; I don't think enum defines a compiler invariant data size. We can't use it for a kernel/user interface. Also why no flags field as is essentially standard for every vfio ioctl? Couldn't we specify process/guest-pasid with flags? For that matter couldn't we specify bind/unbind using a single ioctl? I think that would be more consistent with the pasid alloc/free ioctl in the previous patch. Why are we appending opaque data to the end of the structure when clearly we expect a struct iommu_gpasid_bind_data? That bind data structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How does a user determine what formats are accepted from within the vfio API (or even outside of the vfio API)? > + > +/* > + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 28, struct vfio_iommu_bind) ^ The semantics appear to just be _IOW, nothing is written back to the userspace buffer on return. > + * > + * Manage address spaces of devices in this container. Initially a TYPE1 > + * container can only have one address space, managed with > + * VFIO_IOMMU_MAP/UNMAP_DMA. > + * > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page > + * tables, and BIND manages the stage-1 (guest) page tables. Other types of > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls > + * non-PASID traffic and BIND controls PASID traffic. But this depends on the > + * underlying IOMMU architecture and isn't guaranteed. > + * > + * Availability of this feature depends on the device, its bus, the underlying > + * IOMMU and the CPU architecture. And the user discovers this is available by...? There's no probe here, are they left only to setup a VM to the point of trying to use this before they fail the ioctl? Could VFIO_IOMMU_GET_INFO fill this gap? Thanks, Alex > + * > + * returns: 0 on success, -errno on failure. > + */ > +#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 28) > + > +/* > + * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 29, struct vfio_iommu_bind) > + * > + * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl. > + */ > +#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 29) > + > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ > > /*
> From: Alex Williamson < alex.williamson@redhat.com > > Sent: Friday, November 8, 2019 7:21 AM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > On Thu, 24 Oct 2019 08:26:23 -0400 > Liu Yi L <yi.l.liu@intel.com> wrote: > > > This patch adds vfio support to bind guest translation structure > > to host iommu. VFIO exposes iommu programming capability to user- > > space. Guest is a user-space application in host under KVM solution. > > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > > structure. And this part should be passdown to host to enable nested > > translation (or say two stage translation). This patch reuses the > > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > > bind type for binding guest owned translation structure to host. > > > > *) Add two new ioctls for VFIO containers. > > > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > > bind a process to a pasid or bind a guest pasid > > to a device, this is indicated by type > > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > > unbind a process to a pasid or unbind a guest pasid > > to a device, also indicated by type > > - Bind type: > > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a process > > to a device > > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > > structure to host iommu. e.g. guest page table > > > > *) Code logic in vfio_iommu_type1_ioctl() to handle VFIO_IOMMU_BIND/UNBIND > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > --- > > drivers/vfio/vfio_iommu_type1.c | 136 > ++++++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/vfio.h | 44 +++++++++++++ > > 2 files changed, 180 insertions(+) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > index 3d73a7d..1a27e25 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -2325,6 +2325,104 @@ static int vfio_iommu_type1_pasid_free(struct > vfio_iommu *iommu, > > return ret; > > } > > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) > > +{ > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_gpasid_bind_data *ustruct = > > + (struct iommu_gpasid_bind_data *) dc->data; > > + > > + return iommu_sva_bind_gpasid(dc->domain, dev, ustruct); > > +} > > + > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > > +{ > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > + struct iommu_gpasid_bind_data *ustruct = > > + (struct iommu_gpasid_bind_data *) dc->data; > > + > > + return iommu_sva_unbind_gpasid(dc->domain, dev, > > + ustruct->hpasid); > > +} > > + > > +/* > > + * unbind specific gpasid, caller of this function requires hold > > + * vfio_iommu->lock > > + */ > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, > > + struct iommu_gpasid_bind_data *gbind_data) > > +{ > > + return vfio_iommu_lookup_dev(iommu, vfio_unbind_gpasid_fn, gbind_data); > > +} > > + > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > > + void __user *arg, > > + struct vfio_iommu_type1_bind *bind) > > +{ > > + struct iommu_gpasid_bind_data gbind_data; > > + unsigned long minsz; > > + int ret = 0; > > + > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > + if (bind->argsz < minsz) > > + return -EINVAL; > > + > > + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) > > + return -EFAULT; > > + > > + mutex_lock(&iommu->lock); > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + ret = vfio_iommu_lookup_dev(iommu, vfio_bind_gpasid_fn, &gbind_data); > > + /* > > + * If bind failed, it may not be a total failure. Some devices within > > + * the iommu group may have bind successfully. Although we don't enable > > + * pasid capability for non-singletion iommu groups, a unbind operation > > + * would be helpful to ensure no partial binding for an iommu group. > > + */ > > + if (ret) > > + /* > > + * Undo all binds that already succeeded, no need to check the > > + * return value here since some device within the group has no > > + * successful bind when coming to this place switch. > > + */ > > + vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); > > + > > +out_unlock: > > + mutex_unlock(&iommu->lock); > > + return ret; > > +} > > + > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > + void __user *arg, > > + struct vfio_iommu_type1_bind *bind) > > +{ > > + struct iommu_gpasid_bind_data gbind_data; > > + unsigned long minsz; > > + int ret = 0; > > + > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > + if (bind->argsz < minsz) > > + return -EINVAL; > > But gbind_data can change size if new vendor specific data is added to > the union, so kernel updates break existing userspace. Fail. yes, we have a version field in struct iommu_gpasid_bind_data. How about doing sanity check per versions? kernel knows the gbind_data size of specific versions. Does it make sense? If yes, I'll also apply it to the other sanity check in this series to avoid userspace fail after kernel update. > > + > > + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) > > + return -EFAULT; > > + > > + mutex_lock(&iommu->lock); > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + ret = vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); > > + > > +out_unlock: > > + mutex_unlock(&iommu->lock); > > + return ret; > > +} > > + > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > unsigned int cmd, unsigned long arg) > > { > > @@ -2484,6 +2582,44 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, > > default: > > return -EINVAL; > > } > > + > > + } else if (cmd == VFIO_IOMMU_BIND) { > > + struct vfio_iommu_type1_bind bind; > > + > > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > > + > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (bind.argsz < minsz) > > + return -EINVAL; > > + > > + switch (bind.bind_type) { > > + case VFIO_IOMMU_BIND_GUEST_PASID: > > + return vfio_iommu_type1_bind_gpasid(iommu, > > + (void __user *)(arg + minsz), &bind); > > Why are we defining BIND_PROCESS if it's not supported? How does the > user learn it's not supported? I think I should drop it so far since I only add BIND_GUEST_PASID. I think Jean Philippe may need it in his native SVA enabling patchset. For the way to let user learn it, may be using VFIO_IOMMU_GET_INFO as you mentioned below? > > > + default: > > + return -EINVAL; > > + } > > + > > + } else if (cmd == VFIO_IOMMU_UNBIND) { > > + struct vfio_iommu_type1_bind bind; > > + > > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > > + > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (bind.argsz < minsz) > > + return -EINVAL; > > + > > + switch (bind.bind_type) { > > + case VFIO_IOMMU_BIND_GUEST_PASID: > > + return vfio_iommu_type1_unbind_gpasid(iommu, > > + (void __user *)(arg + minsz), &bind); > > + default: > > + return -EINVAL; > > + } > > } > > > > return -ENOTTY; > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index 04de290..78e8c64 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -832,6 +832,50 @@ struct vfio_iommu_type1_pasid_request { > > */ > > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 27) > > > > +enum vfio_iommu_bind_type { > > + VFIO_IOMMU_BIND_PROCESS, > > + VFIO_IOMMU_BIND_GUEST_PASID, > > +}; > > + > > +/* > > + * Supported types: > > + * - VFIO_IOMMU_BIND_GUEST_PASID: bind guest pasid, which invoked > > + * by guest, it takes iommu_gpasid_bind_data in data. > > + */ > > +struct vfio_iommu_type1_bind { > > + __u32 argsz; > > + enum vfio_iommu_bind_type bind_type; > > + __u8 data[]; > > +}; > > I don't think enum defines a compiler invariant data size. We can't > use it for a kernel/user interface. Also why no flags field as is > essentially standard for every vfio ioctl? Couldn't we specify > process/guest-pasid with flags? I remember there is an early comment in community which pointed out that using flags potentially allows to config multiples types in one IOCTL. Regards to it, defining explicit emums avoids it. But I agree with you, it makes variant size. I'll fix it if this matter more. > For that matter couldn't we specify > bind/unbind using a single ioctl? I think that would be more > consistent with the pasid alloc/free ioctl in the previous patch. yes, let me make it in next version. > Why are we appending opaque data to the end of the structure when > clearly we expect a struct iommu_gpasid_bind_data? This is due to the intention to support BIND_GUEST_PASID and BIND_PROCESS with a single IOCTL. Maybe we can use a separate IOCTL for BIND_PROCESS. what's your opinion here? > That bind data > structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How does > a user determine what formats are accepted from within the vfio API (or > even outside of the vfio API)? The info is provided by vIOMMU emulator (e.g. virtual VT-d). The vSVA patch from Jacob has a sanity check on it. https://lkml.org/lkml/2019/10/28/873 > > + > > +/* > > + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 28, struct > vfio_iommu_bind) > ^ > The semantics appear to just be _IOW, nothing is written back to the > userspace buffer on return. will fix it. thanks. > > + * > > + * Manage address spaces of devices in this container. Initially a TYPE1 > > + * container can only have one address space, managed with > > + * VFIO_IOMMU_MAP/UNMAP_DMA. > > + * > > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both > MAP/UNMAP > > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page > > + * tables, and BIND manages the stage-1 (guest) page tables. Other types of > > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP > controls > > + * non-PASID traffic and BIND controls PASID traffic. But this depends on the > > + * underlying IOMMU architecture and isn't guaranteed. > > + * > > + * Availability of this feature depends on the device, its bus, the underlying > > + * IOMMU and the CPU architecture. > > And the user discovers this is available by...? There's no probe here, > are they left only to setup a VM to the point of trying to use this > before they fail the ioctl? Could VFIO_IOMMU_GET_INFO fill this gap? > Thanks, I think VFIO_IOMMU_GET_INFO could help. let me extend it to fill this gap if you agree. > Alex Thanks, Yi Liu > > > + * > > + * returns: 0 on success, -errno on failure. > > + */ > > +#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 28) > > + > > +/* > > + * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 29, struct > vfio_iommu_bind) > > + * > > + * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl. > > + */ > > +#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 29) > > + > > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ > > > > /*
On Tue, 12 Nov 2019 11:21:40 +0000 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > From: Alex Williamson < alex.williamson@redhat.com > > > Sent: Friday, November 8, 2019 7:21 AM > > To: Liu, Yi L <yi.l.liu@intel.com> > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > On Thu, 24 Oct 2019 08:26:23 -0400 > > Liu Yi L <yi.l.liu@intel.com> wrote: > > > > > This patch adds vfio support to bind guest translation structure > > > to host iommu. VFIO exposes iommu programming capability to user- > > > space. Guest is a user-space application in host under KVM solution. > > > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > > > structure. And this part should be passdown to host to enable nested > > > translation (or say two stage translation). This patch reuses the > > > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > > > bind type for binding guest owned translation structure to host. > > > > > > *) Add two new ioctls for VFIO containers. > > > > > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > > > bind a process to a pasid or bind a guest pasid > > > to a device, this is indicated by type > > > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > > > unbind a process to a pasid or unbind a guest pasid > > > to a device, also indicated by type > > > - Bind type: > > > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a process > > > to a device > > > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > > > structure to host iommu. e.g. guest page table > > > > > > *) Code logic in vfio_iommu_type1_ioctl() to handle VFIO_IOMMU_BIND/UNBIND > > > > > > Cc: Kevin Tian <kevin.tian@intel.com> > > > Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com> > > > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> > > > --- > > > drivers/vfio/vfio_iommu_type1.c | 136 > > ++++++++++++++++++++++++++++++++++++++++ > > > include/uapi/linux/vfio.h | 44 +++++++++++++ > > > 2 files changed, 180 insertions(+) > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > > > index 3d73a7d..1a27e25 100644 > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > @@ -2325,6 +2325,104 @@ static int vfio_iommu_type1_pasid_free(struct > > vfio_iommu *iommu, > > > return ret; > > > } > > > > > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) > > > +{ > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > + struct iommu_gpasid_bind_data *ustruct = > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > + > > > + return iommu_sva_bind_gpasid(dc->domain, dev, ustruct); > > > +} > > > + > > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) > > > +{ > > > + struct domain_capsule *dc = (struct domain_capsule *)data; > > > + struct iommu_gpasid_bind_data *ustruct = > > > + (struct iommu_gpasid_bind_data *) dc->data; > > > + > > > + return iommu_sva_unbind_gpasid(dc->domain, dev, > > > + ustruct->hpasid); > > > +} > > > + > > > +/* > > > + * unbind specific gpasid, caller of this function requires hold > > > + * vfio_iommu->lock > > > + */ > > > +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, > > > + struct iommu_gpasid_bind_data *gbind_data) > > > +{ > > > + return vfio_iommu_lookup_dev(iommu, vfio_unbind_gpasid_fn, gbind_data); > > > +} > > > + > > > +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, > > > + void __user *arg, > > > + struct vfio_iommu_type1_bind *bind) > > > +{ > > > + struct iommu_gpasid_bind_data gbind_data; > > > + unsigned long minsz; > > > + int ret = 0; > > > + > > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > > + if (bind->argsz < minsz) > > > + return -EINVAL; > > > + > > > + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) > > > + return -EFAULT; > > > + > > > + mutex_lock(&iommu->lock); > > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > > + ret = -EINVAL; > > > + goto out_unlock; > > > + } > > > + > > > + ret = vfio_iommu_lookup_dev(iommu, vfio_bind_gpasid_fn, &gbind_data); > > > + /* > > > + * If bind failed, it may not be a total failure. Some devices within > > > + * the iommu group may have bind successfully. Although we don't enable > > > + * pasid capability for non-singletion iommu groups, a unbind operation > > > + * would be helpful to ensure no partial binding for an iommu group. > > > + */ > > > + if (ret) > > > + /* > > > + * Undo all binds that already succeeded, no need to check the > > > + * return value here since some device within the group has no > > > + * successful bind when coming to this place switch. > > > + */ > > > + vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); > > > + > > > +out_unlock: > > > + mutex_unlock(&iommu->lock); > > > + return ret; > > > +} > > > + > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > + void __user *arg, > > > + struct vfio_iommu_type1_bind *bind) > > > +{ > > > + struct iommu_gpasid_bind_data gbind_data; > > > + unsigned long minsz; > > > + int ret = 0; > > > + > > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > > + if (bind->argsz < minsz) > > > + return -EINVAL; > > > > But gbind_data can change size if new vendor specific data is added to > > the union, so kernel updates break existing userspace. Fail. > > yes, we have a version field in struct iommu_gpasid_bind_data. How > about doing sanity check per versions? kernel knows the gbind_data > size of specific versions. Does it make sense? If yes, I'll also apply it > to the other sanity check in this series to avoid userspace fail after > kernel update. Has it already been decided that the version field will be updated for every addition to the union? It seems there are two options, either the version definition includes the possible contents of the union, which means we need to support multiple versions concurrently in the kernel to maintain compatibility with userspace and follow deprecation protocols for removing that support, or we need to consider version to be the general form of the structure and interpret the format field to determine necessary length to copy from the user. > > > + > > > + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) > > > + return -EFAULT; > > > + > > > + mutex_lock(&iommu->lock); > > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > > + ret = -EINVAL; > > > + goto out_unlock; > > > + } > > > + > > > + ret = vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); > > > + > > > +out_unlock: > > > + mutex_unlock(&iommu->lock); > > > + return ret; > > > +} > > > + > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > unsigned int cmd, unsigned long arg) > > > { > > > @@ -2484,6 +2582,44 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, > > > default: > > > return -EINVAL; > > > } > > > + > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > + struct vfio_iommu_type1_bind bind; > > > + > > > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > > > + > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (bind.argsz < minsz) > > > + return -EINVAL; > > > + > > > + switch (bind.bind_type) { > > > + case VFIO_IOMMU_BIND_GUEST_PASID: > > > + return vfio_iommu_type1_bind_gpasid(iommu, > > > + (void __user *)(arg + minsz), &bind); > > > > Why are we defining BIND_PROCESS if it's not supported? How does the > > user learn it's not supported? > > I think I should drop it so far since I only add BIND_GUEST_PASID. I think > Jean Philippe may need it in his native SVA enabling patchset. For the way > to let user learn it, may be using VFIO_IOMMU_GET_INFO as you mentioned > below? > > > > > > + default: > > > + return -EINVAL; > > > + } > > > + > > > + } else if (cmd == VFIO_IOMMU_UNBIND) { > > > + struct vfio_iommu_type1_bind bind; > > > + > > > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > > > + > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (bind.argsz < minsz) > > > + return -EINVAL; > > > + > > > + switch (bind.bind_type) { > > > + case VFIO_IOMMU_BIND_GUEST_PASID: > > > + return vfio_iommu_type1_unbind_gpasid(iommu, > > > + (void __user *)(arg + minsz), &bind); > > > + default: > > > + return -EINVAL; > > > + } > > > } > > > > > > return -ENOTTY; > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > index 04de290..78e8c64 100644 > > > --- a/include/uapi/linux/vfio.h > > > +++ b/include/uapi/linux/vfio.h > > > @@ -832,6 +832,50 @@ struct vfio_iommu_type1_pasid_request { > > > */ > > > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 27) > > > > > > +enum vfio_iommu_bind_type { > > > + VFIO_IOMMU_BIND_PROCESS, > > > + VFIO_IOMMU_BIND_GUEST_PASID, > > > +}; > > > + > > > +/* > > > + * Supported types: > > > + * - VFIO_IOMMU_BIND_GUEST_PASID: bind guest pasid, which invoked > > > + * by guest, it takes iommu_gpasid_bind_data in data. > > > + */ > > > +struct vfio_iommu_type1_bind { > > > + __u32 argsz; > > > + enum vfio_iommu_bind_type bind_type; > > > + __u8 data[]; > > > +}; > > > > I don't think enum defines a compiler invariant data size. We can't > > use it for a kernel/user interface. Also why no flags field as is > > essentially standard for every vfio ioctl? Couldn't we specify > > process/guest-pasid with flags? > > I remember there is an early comment in community which pointed out > that using flags potentially allows to config multiples types in one IOCTL. > Regards to it, defining explicit emums avoids it. But I agree with you, > it makes variant size. I'll fix it if this matter more. > > > For that matter couldn't we specify > > bind/unbind using a single ioctl? I think that would be more > > consistent with the pasid alloc/free ioctl in the previous patch. > > yes, let me make it in next version. > > > Why are we appending opaque data to the end of the structure when > > clearly we expect a struct iommu_gpasid_bind_data? > > This is due to the intention to support BIND_GUEST_PASID and > BIND_PROCESS with a single IOCTL. Maybe we can use a separate > IOCTL for BIND_PROCESS. what's your opinion here? If the ioctls have similar purpose and form, then re-using a single ioctl might make sense, but BIND_PROCESS is only a place-holder in this series, which is not acceptable. A dual purpose ioctl does not preclude that we could also use a union for the data field to make the structure well specified. > > That bind data > > structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How does > > a user determine what formats are accepted from within the vfio API (or > > even outside of the vfio API)? > > The info is provided by vIOMMU emulator (e.g. virtual VT-d). The vSVA patch > from Jacob has a sanity check on it. > https://lkml.org/lkml/2019/10/28/873 The vIOMMU emulator runs at a layer above vfio. How does the vIOMMU emulator know that the vfio interface supports virtual VT-d? IMO, it's not acceptable that the user simply assume that an Intel host platform supports VT-d. For example, consider what happens when we need to define IOMMU_PASID_FORMAT_INTEL_VTDv2. How would the user learn that VTDv2 is supported and the original VTD format is not supported? > > > + > > > +/* > > > + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 28, struct > > vfio_iommu_bind) > > ^ > > The semantics appear to just be _IOW, nothing is written back to the > > userspace buffer on return. > > will fix it. thanks. > > > > + * > > > + * Manage address spaces of devices in this container. Initially a TYPE1 > > > + * container can only have one address space, managed with > > > + * VFIO_IOMMU_MAP/UNMAP_DMA. > > > + * > > > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both > > MAP/UNMAP > > > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page > > > + * tables, and BIND manages the stage-1 (guest) page tables. Other types of > > > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP > > controls > > > + * non-PASID traffic and BIND controls PASID traffic. But this depends on the > > > + * underlying IOMMU architecture and isn't guaranteed. > > > + * > > > + * Availability of this feature depends on the device, its bus, the underlying > > > + * IOMMU and the CPU architecture. > > > > And the user discovers this is available by...? There's no probe here, > > are they left only to setup a VM to the point of trying to use this > > before they fail the ioctl? Could VFIO_IOMMU_GET_INFO fill this gap? > > Thanks, > > I think VFIO_IOMMU_GET_INFO could help. let me extend it to fill this gap > if you agree. It's a start. Thanks, Alex
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Wednesday, November 13, 2019 1:26 AM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > On Tue, 12 Nov 2019 11:21:40 +0000 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > From: Alex Williamson < alex.williamson@redhat.com > > > > Sent: Friday, November 8, 2019 7:21 AM > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > > > On Thu, 24 Oct 2019 08:26:23 -0400 > > > Liu Yi L <yi.l.liu@intel.com> wrote: > > > > > > > This patch adds vfio support to bind guest translation structure > > > > to host iommu. VFIO exposes iommu programming capability to user- > > > > space. Guest is a user-space application in host under KVM solution. > > > > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > > > > structure. And this part should be passdown to host to enable nested > > > > translation (or say two stage translation). This patch reuses the > > > > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > > > > bind type for binding guest owned translation structure to host. > > > > > > > > *) Add two new ioctls for VFIO containers. > > > > > > > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > > > > bind a process to a pasid or bind a guest pasid > > > > to a device, this is indicated by type > > > > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > > > > unbind a process to a pasid or unbind a guest pasid > > > > to a device, also indicated by type > > > > - Bind type: > > > > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a process > > > > to a device > > > > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > > > > structure to host iommu. e.g. guest page table > > > > > > > > *) Code logic in vfio_iommu_type1_ioctl() to handle > VFIO_IOMMU_BIND/UNBIND > > > > [...] > > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > > + void __user *arg, > > > > + struct vfio_iommu_type1_bind *bind) > > > > +{ > > > > + struct iommu_gpasid_bind_data gbind_data; > > > > + unsigned long minsz; > > > > + int ret = 0; > > > > + > > > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > > > + if (bind->argsz < minsz) > > > > + return -EINVAL; > > > > > > But gbind_data can change size if new vendor specific data is added to > > > the union, so kernel updates break existing userspace. Fail. > > > > yes, we have a version field in struct iommu_gpasid_bind_data. How > > about doing sanity check per versions? kernel knows the gbind_data > > size of specific versions. Does it make sense? If yes, I'll also apply it > > to the other sanity check in this series to avoid userspace fail after > > kernel update. > > Has it already been decided that the version field will be updated for > every addition to the union? No, just my proposal. Jacob may help to explain the purpose of version field. But if we may be too "frequent" for an uapi version number updating if we inc version for each change in the union part. I may vote for the second option from you below. > It seems there are two options, either > the version definition includes the possible contents of the union, > which means we need to support multiple versions concurrently in the > kernel to maintain compatibility with userspace and follow deprecation > protocols for removing that support, or we need to consider version to > be the general form of the structure and interpret the format field to > determine necessary length to copy from the user. As I mentioned above, may be better to let @version field only over the general fields and let format to cover the possible changes in union. e.g. IOMMU_PASID_FORMAT_INTEL_VTD2 may means version 2 of Intel VT-d bind. But either way, I think we need to let kernel maintain multiple versions to support compatible userspace. e.g. may have multiple versions iommu_gpasid_bind_data_vtd struct in the union part. > > > > > + > > > > + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) > > > > + return -EFAULT; > > > > + > > > > + mutex_lock(&iommu->lock); > > > > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > > > > + ret = -EINVAL; > > > > + goto out_unlock; > > > > + } > > > > + > > > > + ret = vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); > > > > + > > > > +out_unlock: > > > > + mutex_unlock(&iommu->lock); > > > > + return ret; > > > > +} > > > > + > > > > static long vfio_iommu_type1_ioctl(void *iommu_data, > > > > unsigned int cmd, unsigned long arg) > > > > { > > > > @@ -2484,6 +2582,44 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > > > > default: > > > > return -EINVAL; > > > > } > > > > + > > > > + } else if (cmd == VFIO_IOMMU_BIND) { > > > > + struct vfio_iommu_type1_bind bind; > > > > + > > > > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > > > > + > > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > > + return -EFAULT; > > > > + > > > > + if (bind.argsz < minsz) > > > > + return -EINVAL; > > > > + > > > > + switch (bind.bind_type) { > > > > + case VFIO_IOMMU_BIND_GUEST_PASID: > > > > + return vfio_iommu_type1_bind_gpasid(iommu, > > > > + (void __user *)(arg + minsz), &bind); > > > > > > Why are we defining BIND_PROCESS if it's not supported? How does the > > > user learn it's not supported? > > > > I think I should drop it so far since I only add BIND_GUEST_PASID. I think > > Jean Philippe may need it in his native SVA enabling patchset. For the way > > to let user learn it, may be using VFIO_IOMMU_GET_INFO as you mentioned > > below? > > > > > > > > > + default: > > > > + return -EINVAL; > > > > + } > > > > + > > > > + } else if (cmd == VFIO_IOMMU_UNBIND) { > > > > + struct vfio_iommu_type1_bind bind; > > > > + > > > > + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); > > > > + > > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > > + return -EFAULT; > > > > + > > > > + if (bind.argsz < minsz) > > > > + return -EINVAL; > > > > + > > > > + switch (bind.bind_type) { > > > > + case VFIO_IOMMU_BIND_GUEST_PASID: > > > > + return vfio_iommu_type1_unbind_gpasid(iommu, > > > > + (void __user *)(arg + minsz), &bind); > > > > + default: > > > > + return -EINVAL; > > > > + } > > > > } > > > > > > > > return -ENOTTY; > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > > > index 04de290..78e8c64 100644 > > > > --- a/include/uapi/linux/vfio.h > > > > +++ b/include/uapi/linux/vfio.h > > > > @@ -832,6 +832,50 @@ struct vfio_iommu_type1_pasid_request { > > > > */ > > > > #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 27) > > > > > > > > +enum vfio_iommu_bind_type { > > > > + VFIO_IOMMU_BIND_PROCESS, > > > > + VFIO_IOMMU_BIND_GUEST_PASID, > > > > +}; > > > > + > > > > +/* > > > > + * Supported types: > > > > + * - VFIO_IOMMU_BIND_GUEST_PASID: bind guest pasid, which > invoked > > > > + * by guest, it takes iommu_gpasid_bind_data in data. > > > > + */ > > > > +struct vfio_iommu_type1_bind { > > > > + __u32 argsz; > > > > + enum vfio_iommu_bind_type bind_type; > > > > + __u8 data[]; > > > > +}; > > > > > > I don't think enum defines a compiler invariant data size. We can't > > > use it for a kernel/user interface. Also why no flags field as is > > > essentially standard for every vfio ioctl? Couldn't we specify > > > process/guest-pasid with flags? > > > > I remember there is an early comment in community which pointed out > > that using flags potentially allows to config multiples types in one IOCTL. > > Regards to it, defining explicit emums avoids it. But I agree with you, > > it makes variant size. I'll fix it if this matter more. > > > > > For that matter couldn't we specify > > > bind/unbind using a single ioctl? I think that would be more > > > consistent with the pasid alloc/free ioctl in the previous patch. > > > > yes, let me make it in next version. > > > > > Why are we appending opaque data to the end of the structure when > > > clearly we expect a struct iommu_gpasid_bind_data? > > > > This is due to the intention to support BIND_GUEST_PASID and > > BIND_PROCESS with a single IOCTL. Maybe we can use a separate > > IOCTL for BIND_PROCESS. what's your opinion here? > > If the ioctls have similar purpose and form, then re-using a single > ioctl might make sense, but BIND_PROCESS is only a place-holder in this > series, which is not acceptable. A dual purpose ioctl does not > preclude that we could also use a union for the data field to make the > structure well specified. yes, BIND_PROCESS is only a place-holder here. From kernel p.o.v., both BIND_GUEST_PASID and BIND_PROCESS are bind requests from userspace. So the purposes are aligned. Below is the content the @data[] field supposed to convey for BIND_PROCESS. If we use union, it would leave space for extending it to support BIND_PROCESS. If only data[], it is a little bit confusing why we define it in such manner if BIND_PROCESS is included in this series. Please feel free let me know which one suits better. +struct vfio_iommu_type1_bind_process { + __u32 flags; +#define VFIO_IOMMU_BIND_PID (1 << 0) + __u32 pasid; + __s32 pid; +}; https://patchwork.kernel.org/patch/10394927/ > > > That bind data > > > structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How > does > > > a user determine what formats are accepted from within the vfio API (or > > > even outside of the vfio API)? > > > > The info is provided by vIOMMU emulator (e.g. virtual VT-d). The vSVA patch > > from Jacob has a sanity check on it. > > https://lkml.org/lkml/2019/10/28/873 > > The vIOMMU emulator runs at a layer above vfio. How does the vIOMMU > emulator know that the vfio interface supports virtual VT-d? IMO, it's > not acceptable that the user simply assume that an Intel host platform > supports VT-d. For example, consider what happens when we need to > define IOMMU_PASID_FORMAT_INTEL_VTDv2. How would the user learn that > VTDv2 is supported and the original VTD format is not supported? I guess this may be another info VFIO_IOMMU_GET_INFO should provide. It makes sense that vfio be aware of what platform it is running on. right? After vfio gets the info, may let vfio fill in the format info. Is it the correct direction? > > > > > + > > > > +/* > > > > + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 28, struct > > > vfio_iommu_bind) > > > ^ > > > The semantics appear to just be _IOW, nothing is written back to the > > > userspace buffer on return. > > > > will fix it. thanks. > > > > > > + * > > > > + * Manage address spaces of devices in this container. Initially a TYPE1 > > > > + * container can only have one address space, managed with > > > > + * VFIO_IOMMU_MAP/UNMAP_DMA. > > > > + * > > > > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by > both > > > MAP/UNMAP > > > > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) > page > > > > + * tables, and BIND manages the stage-1 (guest) page tables. Other types of > > > > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP > > > controls > > > > + * non-PASID traffic and BIND controls PASID traffic. But this depends on the > > > > + * underlying IOMMU architecture and isn't guaranteed. > > > > + * > > > > + * Availability of this feature depends on the device, its bus, the underlying > > > > + * IOMMU and the CPU architecture. > > > > > > And the user discovers this is available by...? There's no probe here, > > > are they left only to setup a VM to the point of trying to use this > > > before they fail the ioctl? Could VFIO_IOMMU_GET_INFO fill this gap? > > > Thanks, > > > > I think VFIO_IOMMU_GET_INFO could help. let me extend it to fill this gap > > if you agree. > > It's a start. Thanks, Got it. will show the code in next version. Thanks for your patient review. > Alex Regards, Yi Liu
On Wed, Nov 13, 2019 at 07:43:43AM +0000, Liu, Yi L wrote: > > From: Alex Williamson <alex.williamson@redhat.com> > > Sent: Wednesday, November 13, 2019 1:26 AM > > To: Liu, Yi L <yi.l.liu@intel.com> > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > On Tue, 12 Nov 2019 11:21:40 +0000 > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > > From: Alex Williamson < alex.williamson@redhat.com > > > > > Sent: Friday, November 8, 2019 7:21 AM > > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > > > > > On Thu, 24 Oct 2019 08:26:23 -0400 > > > > Liu Yi L <yi.l.liu@intel.com> wrote: > > > > > > > > > This patch adds vfio support to bind guest translation structure > > > > > to host iommu. VFIO exposes iommu programming capability to user- > > > > > space. Guest is a user-space application in host under KVM solution. > > > > > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > > > > > structure. And this part should be passdown to host to enable nested > > > > > translation (or say two stage translation). This patch reuses the > > > > > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > > > > > bind type for binding guest owned translation structure to host. > > > > > > > > > > *) Add two new ioctls for VFIO containers. > > > > > > > > > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > > > > > bind a process to a pasid or bind a guest pasid > > > > > to a device, this is indicated by type > > > > > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > > > > > unbind a process to a pasid or unbind a guest pasid > > > > > to a device, also indicated by type > > > > > - Bind type: > > > > > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a process > > > > > to a device > > > > > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > > > > > structure to host iommu. e.g. guest page table > > > > > > > > > > *) Code logic in vfio_iommu_type1_ioctl() to handle > > VFIO_IOMMU_BIND/UNBIND > > > > > > [...] > > > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > > > + void __user *arg, > > > > > + struct vfio_iommu_type1_bind *bind) > > > > > +{ > > > > > + struct iommu_gpasid_bind_data gbind_data; > > > > > + unsigned long minsz; > > > > > + int ret = 0; > > > > > + > > > > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > > > > + if (bind->argsz < minsz) > > > > > + return -EINVAL; > > > > > > > > But gbind_data can change size if new vendor specific data is added to > > > > the union, so kernel updates break existing userspace. Fail. I guess we could take minsz up to the vendor-specific data, copy @format, and then check the size of vendor-specific data? > > > > > > yes, we have a version field in struct iommu_gpasid_bind_data. How > > > about doing sanity check per versions? kernel knows the gbind_data > > > size of specific versions. Does it make sense? If yes, I'll also apply it > > > to the other sanity check in this series to avoid userspace fail after > > > kernel update. > > > > Has it already been decided that the version field will be updated for > > every addition to the union? > > No, just my proposal. Jacob may help to explain the purpose of version > field. But if we may be too "frequent" for an uapi version number updating > if we inc version for each change in the union part. I may vote for the > second option from you below. > > > It seems there are two options, either > > the version definition includes the possible contents of the union, > > which means we need to support multiple versions concurrently in the > > kernel to maintain compatibility with userspace and follow deprecation > > protocols for removing that support, or we need to consider version to > > be the general form of the structure and interpret the format field to > > determine necessary length to copy from the user. > > As I mentioned above, may be better to let @version field only over the > general fields and let format to cover the possible changes in union. e.g. > IOMMU_PASID_FORMAT_INTEL_VTD2 may means version 2 of Intel > VT-d bind. But either way, I think we need to let kernel maintain multiple > versions to support compatible userspace. e.g. may have multiple versions > iommu_gpasid_bind_data_vtd struct in the union part. I couldn't find where the @version field originated in our old discussions, but I believe our plan for allowing future extensions was: * Add new vendor-specific data by introducing a new format (IOMMU_PASID_FORMAT_INTEL_VTD2, IOMMU_PASID_FORMAT_ARM_SMMUV2...), and extend the union. * Add a new common field, if it fits in the existing padding bytes, by adding a flag (IOMMU_SVA_GPASID_*). * Add a new common field, if it doesn't fit in the current padding bytes, or completely change the structure layout, by introducing a new version (IOMMU_GPASID_BIND_VERSION_2). In that case the kernel has to handle both new and old structure versions. It would have both iommu_gpasid_bind_data and iommu_gpasid_bind_data_v2 structs. I think iommu_cache_invalidate_info and iommu_page_response use the same scheme. iommu_fault is a bit more complicated because it's kernel->userspace and requires some negotiation: https://lore.kernel.org/linux-iommu/77405d39-81a4-d9a8-5d35-27602199867a@arm.com/ [...] > > If the ioctls have similar purpose and form, then re-using a single > > ioctl might make sense, but BIND_PROCESS is only a place-holder in this > > series, which is not acceptable. A dual purpose ioctl does not > > preclude that we could also use a union for the data field to make the > > structure well specified. > > yes, BIND_PROCESS is only a place-holder here. From kernel p.o.v., both > BIND_GUEST_PASID and BIND_PROCESS are bind requests from userspace. > So the purposes are aligned. Below is the content the @data[] field > supposed to convey for BIND_PROCESS. If we use union, it would leave > space for extending it to support BIND_PROCESS. If only data[], it is a little > bit confusing why we define it in such manner if BIND_PROCESS is included > in this series. Please feel free let me know which one suits better. > > +struct vfio_iommu_type1_bind_process { > + __u32 flags; > +#define VFIO_IOMMU_BIND_PID (1 << 0) > + __u32 pasid; > + __s32 pid; > +}; > https://patchwork.kernel.org/patch/10394927/ Note that I don't plan to upstream BIND_PROCESS at the moment. It was useful for testing but I don't know of anyone actually needing it. > > > > That bind data > > > > structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How > > does > > > > a user determine what formats are accepted from within the vfio API (or > > > > even outside of the vfio API)? > > > > > > The info is provided by vIOMMU emulator (e.g. virtual VT-d). The vSVA patch > > > from Jacob has a sanity check on it. > > > https://lkml.org/lkml/2019/10/28/873 > > > > The vIOMMU emulator runs at a layer above vfio. How does the vIOMMU > > emulator know that the vfio interface supports virtual VT-d? IMO, it's > > not acceptable that the user simply assume that an Intel host platform > > supports VT-d. For example, consider what happens when we need to > > define IOMMU_PASID_FORMAT_INTEL_VTDv2. How would the user learn that > > VTDv2 is supported and the original VTD format is not supported? > > I guess this may be another info VFIO_IOMMU_GET_INFO should provide. > It makes sense that vfio be aware of what platform it is running on. right? > After vfio gets the info, may let vfio fill in the format info. Is it the correct > direction? I thought you were planning to put that information in sysfs? We last discussed this over a year ago so I don't remember where we left it. I know Alex isn't keen on putting in sysfs what can be communicated through VFIO, but it is a convenient way to describe IOMMU features: http://www.linux-arm.org/git?p=linux-jpb.git;a=commitdiff;h=665370d5b5e0022c24b2d2b57975ef6fe7b40870;hp=7ce780d838889b53f5e04ba5d444520621261eda My problem with GET_INFO was that it could be difficult to extend, and to describe things like variable-size list of supported page table formats, but I guess the new info capabilities make this easier. Thanks, Jean
> From: Jean-Philippe Brucker [mailto:jean-philippe@linaro.org] > Sent: Wednesday, November 13, 2019 6:29 PM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > On Wed, Nov 13, 2019 at 07:43:43AM +0000, Liu, Yi L wrote: > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Wednesday, November 13, 2019 1:26 AM > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > > > On Tue, 12 Nov 2019 11:21:40 +0000 > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > > > > From: Alex Williamson < alex.williamson@redhat.com > > > > > > Sent: Friday, November 8, 2019 7:21 AM > > > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to > host > > > > > > > > > > On Thu, 24 Oct 2019 08:26:23 -0400 > > > > > Liu Yi L <yi.l.liu@intel.com> wrote: > > > > > > > > > > > This patch adds vfio support to bind guest translation structure > > > > > > to host iommu. VFIO exposes iommu programming capability to user- > > > > > > space. Guest is a user-space application in host under KVM solution. > > > > > > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > > > > > > structure. And this part should be passdown to host to enable nested > > > > > > translation (or say two stage translation). This patch reuses the > > > > > > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > > > > > > bind type for binding guest owned translation structure to host. > > > > > > > > > > > > *) Add two new ioctls for VFIO containers. > > > > > > > > > > > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > > > > > > bind a process to a pasid or bind a guest pasid > > > > > > to a device, this is indicated by type > > > > > > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > > > > > > unbind a process to a pasid or unbind a guest pasid > > > > > > to a device, also indicated by type > > > > > > - Bind type: > > > > > > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a > process > > > > > > to a device > > > > > > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > > > > > > structure to host iommu. e.g. guest page table > > > > > > > > > > > > *) Code logic in vfio_iommu_type1_ioctl() to handle > > > VFIO_IOMMU_BIND/UNBIND > > > > > > > > [...] > > > > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > > > > + void __user *arg, > > > > > > + struct vfio_iommu_type1_bind > *bind) > > > > > > +{ > > > > > > + struct iommu_gpasid_bind_data gbind_data; > > > > > > + unsigned long minsz; > > > > > > + int ret = 0; > > > > > > + > > > > > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > > > > > + if (bind->argsz < minsz) > > > > > > + return -EINVAL; > > > > > > > > > > But gbind_data can change size if new vendor specific data is added to > > > > > the union, so kernel updates break existing userspace. Fail. > > I guess we could take minsz up to the vendor-specific data, copy @format, > and then check the size of vendor-specific data? Agreed. > > > > > > > > > yes, we have a version field in struct iommu_gpasid_bind_data. How > > > > about doing sanity check per versions? kernel knows the gbind_data > > > > size of specific versions. Does it make sense? If yes, I'll also apply it > > > > to the other sanity check in this series to avoid userspace fail after > > > > kernel update. > > > > > > Has it already been decided that the version field will be updated for > > > every addition to the union? > > > > No, just my proposal. Jacob may help to explain the purpose of version > > field. But if we may be too "frequent" for an uapi version number updating > > if we inc version for each change in the union part. I may vote for the > > second option from you below. > > > > > It seems there are two options, either > > > the version definition includes the possible contents of the union, > > > which means we need to support multiple versions concurrently in the > > > kernel to maintain compatibility with userspace and follow deprecation > > > protocols for removing that support, or we need to consider version to > > > be the general form of the structure and interpret the format field to > > > determine necessary length to copy from the user. > > > > As I mentioned above, may be better to let @version field only over the > > general fields and let format to cover the possible changes in union. e.g. > > IOMMU_PASID_FORMAT_INTEL_VTD2 may means version 2 of Intel > > VT-d bind. But either way, I think we need to let kernel maintain multiple > > versions to support compatible userspace. e.g. may have multiple versions > > iommu_gpasid_bind_data_vtd struct in the union part. > > I couldn't find where the @version field originated in our old > discussions, but I believe our plan for allowing future extensions was: > > * Add new vendor-specific data by introducing a new format > (IOMMU_PASID_FORMAT_INTEL_VTD2, > IOMMU_PASID_FORMAT_ARM_SMMUV2...), and > extend the union. > > * Add a new common field, if it fits in the existing padding bytes, by > adding a flag (IOMMU_SVA_GPASID_*). > > * Add a new common field, if it doesn't fit in the current padding bytes, > or completely change the structure layout, by introducing a new version > (IOMMU_GPASID_BIND_VERSION_2). In that case the kernel has to handle > both new and old structure versions. It would have both > iommu_gpasid_bind_data and iommu_gpasid_bind_data_v2 structs. > > I think iommu_cache_invalidate_info and iommu_page_response use the same > scheme. iommu_fault is a bit more complicated because it's > kernel->userspace and requires some negotiation: > https://lore.kernel.org/linux-iommu/77405d39-81a4-d9a8-5d35- > 27602199867a@arm.com/ Thanks for the excellent recap. > [...] > > > If the ioctls have similar purpose and form, then re-using a single > > > ioctl might make sense, but BIND_PROCESS is only a place-holder in this > > > series, which is not acceptable. A dual purpose ioctl does not > > > preclude that we could also use a union for the data field to make the > > > structure well specified. > > > > yes, BIND_PROCESS is only a place-holder here. From kernel p.o.v., both > > BIND_GUEST_PASID and BIND_PROCESS are bind requests from userspace. > > So the purposes are aligned. Below is the content the @data[] field > > supposed to convey for BIND_PROCESS. If we use union, it would leave > > space for extending it to support BIND_PROCESS. If only data[], it is a little > > bit confusing why we define it in such manner if BIND_PROCESS is included > > in this series. Please feel free let me know which one suits better. > > > > +struct vfio_iommu_type1_bind_process { > > + __u32 flags; > > +#define VFIO_IOMMU_BIND_PID (1 << 0) > > + __u32 pasid; > > + __s32 pid; > > +}; > > https://patchwork.kernel.org/patch/10394927/ > > Note that I don't plan to upstream BIND_PROCESS at the moment. It was > useful for testing but I don't know of anyone actually needing it. yes, you told me during KVM forum. But if we want to share IOCTL, may need to leave a place for you to extend. If @data[] is not good, then may use union. > > > > > That bind data > > > > > structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How > > > does > > > > > a user determine what formats are accepted from within the vfio API (or > > > > > even outside of the vfio API)? > > > > > > > > The info is provided by vIOMMU emulator (e.g. virtual VT-d). The vSVA patch > > > > from Jacob has a sanity check on it. > > > > https://lkml.org/lkml/2019/10/28/873 > > > > > > The vIOMMU emulator runs at a layer above vfio. How does the vIOMMU > > > emulator know that the vfio interface supports virtual VT-d? IMO, it's > > > not acceptable that the user simply assume that an Intel host platform > > > supports VT-d. For example, consider what happens when we need to > > > define IOMMU_PASID_FORMAT_INTEL_VTDv2. How would the user learn that > > > VTDv2 is supported and the original VTD format is not supported? > > > > I guess this may be another info VFIO_IOMMU_GET_INFO should provide. > > It makes sense that vfio be aware of what platform it is running on. right? > > After vfio gets the info, may let vfio fill in the format info. Is it the correct > > direction? > > I thought you were planning to put that information in sysfs? We last > discussed this over a year ago so I don't remember where we left it. I yes, we did have such discussion to do hardware iommu capability query via sysfs. If only want to let vIOMMU learn what format it should use, then GET_INFO may be enough. e.g. vfio just asks its backed iommu driver. hey, do you support nested translation? what format do you prefer? But I'm open on it. > know Alex isn't keen on putting in sysfs what can be communicated through > VFIO, but it is a convenient way to describe IOMMU features: > http://www.linux-arm.org/git?p=linux- > jpb.git;a=commitdiff;h=665370d5b5e0022c24b2d2b57975ef6fe7b40870;hp=7ce780 > d838889b53f5e04ba5d444520621261eda > > My problem with GET_INFO was that it could be difficult to extend, and > to describe things like variable-size list of supported page table > formats, but I guess the new info capabilities make this easier. yeah, you also need to make the info generic if want to extend something. As I said, I'm open with it. Please feel free let me know if you've got other ideas. Regards, Yi Liu > Thanks, > Jean
Hi Alex, Thanks for the review. Here I'd like to conclude the major opens in this thread and see if we can get some agreements to prepare a new version. a) IOCTLs for BIND_GPASID and BIND_PROCESS, share a single IOCTL or two separate IOCTLs? Yi: It may be helpful to have separate IOCTLs. The bind data conveyed for BIND_GPASID and BIND_PROCESS are totally different, and the struct iommu_gpasid_bind_data has vendor specific data and may even have more versions in future. To better maintain it, I guess separate IOCTLs for the two bind types would be better. The structure for BIND_GPASID is as below: struct vfio_iommu_type1_bind { __u32 argsz; struct iommu_gpasid_bind_data bind_data; }; b) how kernel-space learns the number of bytes to be copied (a.k.a. the usage of @version field and @format field of struct iommu_gpasid_bind_data) Yi: Jean has an excellent recap in prior reply on the plan of future extensions regards to @version field and @format field. Based on the plan, kernel space needs to parse the @version field and @format field to get the length of the current BIND_GPASID request. Also kernel needs to maintain the new and old structure versions. Follow specific deprecation policy in future. c) how can vIOMMU emulator know that the vfio interface supports to config dual stage translation for vIOMMU? Yi: may do it via VFIO_IOMMU_GET_INFO. d) how can vIOMMU emulator know what @version and @format should be set in struct iommu_gpasid_bind_data? Yi: currently, we have two ways. First one, may do it via VFIO_IOMMU_GET_INFO. This is a natural idea as here @version and @format are used in vfio apis. It makes sense to let vfio to provide related info to vIOMMU emulator after checking with vendor specific iommu driver. Also, there is idea to do it via sysfs (/sys/class/iommu/dmar#) as we have plan to do IOMMU capability sync between vIOMMU and pIOMMU via sysfs. I have two concern on this option. Current iommu sysfs only provides vendor specific hardware infos. I'm not sure if it is good to expose infos defined in IOMMU generic layer via iommu sysfs. If this concern is not a big thing, I'm fine with both options. Thoughts? Would also be happy to know more from you guys. Regards, Yi Liu > From: Jean-Philippe Brucker [mailto:jean-philippe@linaro.org] > Sent: Wednesday, November 13, 2019 6:29 PM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > On Wed, Nov 13, 2019 at 07:43:43AM +0000, Liu, Yi L wrote: > > > From: Alex Williamson <alex.williamson@redhat.com> > > > Sent: Wednesday, November 13, 2019 1:26 AM > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > > > On Tue, 12 Nov 2019 11:21:40 +0000 > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > > > > From: Alex Williamson < alex.williamson@redhat.com > > > > > > Sent: Friday, November 8, 2019 7:21 AM > > > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to > host > > > > > > > > > > On Thu, 24 Oct 2019 08:26:23 -0400 > > > > > Liu Yi L <yi.l.liu@intel.com> wrote: > > > > > > > > > > > This patch adds vfio support to bind guest translation structure > > > > > > to host iommu. VFIO exposes iommu programming capability to user- > > > > > > space. Guest is a user-space application in host under KVM solution. > > > > > > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > > > > > > structure. And this part should be passdown to host to enable nested > > > > > > translation (or say two stage translation). This patch reuses the > > > > > > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > > > > > > bind type for binding guest owned translation structure to host. > > > > > > > > > > > > *) Add two new ioctls for VFIO containers. > > > > > > > > > > > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > > > > > > bind a process to a pasid or bind a guest pasid > > > > > > to a device, this is indicated by type > > > > > > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > > > > > > unbind a process to a pasid or unbind a guest pasid > > > > > > to a device, also indicated by type > > > > > > - Bind type: > > > > > > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a > process > > > > > > to a device > > > > > > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > > > > > > structure to host iommu. e.g. guest page table > > > > > > > > > > > > *) Code logic in vfio_iommu_type1_ioctl() to handle > > > VFIO_IOMMU_BIND/UNBIND > > > > > > > > [...] > > > > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > > > > + void __user *arg, > > > > > > + struct vfio_iommu_type1_bind > *bind) > > > > > > +{ > > > > > > + struct iommu_gpasid_bind_data gbind_data; > > > > > > + unsigned long minsz; > > > > > > + int ret = 0; > > > > > > + > > > > > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > > > > > + if (bind->argsz < minsz) > > > > > > + return -EINVAL; > > > > > > > > > > But gbind_data can change size if new vendor specific data is added to > > > > > the union, so kernel updates break existing userspace. Fail. > > I guess we could take minsz up to the vendor-specific data, copy @format, > and then check the size of vendor-specific data? > > > > > > > > > yes, we have a version field in struct iommu_gpasid_bind_data. How > > > > about doing sanity check per versions? kernel knows the gbind_data > > > > size of specific versions. Does it make sense? If yes, I'll also apply it > > > > to the other sanity check in this series to avoid userspace fail after > > > > kernel update. > > > > > > Has it already been decided that the version field will be updated for > > > every addition to the union? > > > > No, just my proposal. Jacob may help to explain the purpose of version > > field. But if we may be too "frequent" for an uapi version number updating > > if we inc version for each change in the union part. I may vote for the > > second option from you below. > > > > > It seems there are two options, either > > > the version definition includes the possible contents of the union, > > > which means we need to support multiple versions concurrently in the > > > kernel to maintain compatibility with userspace and follow deprecation > > > protocols for removing that support, or we need to consider version to > > > be the general form of the structure and interpret the format field to > > > determine necessary length to copy from the user. > > > > As I mentioned above, may be better to let @version field only over the > > general fields and let format to cover the possible changes in union. e.g. > > IOMMU_PASID_FORMAT_INTEL_VTD2 may means version 2 of Intel > > VT-d bind. But either way, I think we need to let kernel maintain multiple > > versions to support compatible userspace. e.g. may have multiple versions > > iommu_gpasid_bind_data_vtd struct in the union part. > > I couldn't find where the @version field originated in our old > discussions, but I believe our plan for allowing future extensions was: > > * Add new vendor-specific data by introducing a new format > (IOMMU_PASID_FORMAT_INTEL_VTD2, > IOMMU_PASID_FORMAT_ARM_SMMUV2...), and > extend the union. > > * Add a new common field, if it fits in the existing padding bytes, by > adding a flag (IOMMU_SVA_GPASID_*). > > * Add a new common field, if it doesn't fit in the current padding bytes, > or completely change the structure layout, by introducing a new version > (IOMMU_GPASID_BIND_VERSION_2). In that case the kernel has to handle > both new and old structure versions. It would have both > iommu_gpasid_bind_data and iommu_gpasid_bind_data_v2 structs. > > I think iommu_cache_invalidate_info and iommu_page_response use the same > scheme. iommu_fault is a bit more complicated because it's > kernel->userspace and requires some negotiation: > https://lore.kernel.org/linux-iommu/77405d39-81a4-d9a8-5d35- > 27602199867a@arm.com/ > > [...] > > > If the ioctls have similar purpose and form, then re-using a single > > > ioctl might make sense, but BIND_PROCESS is only a place-holder in this > > > series, which is not acceptable. A dual purpose ioctl does not > > > preclude that we could also use a union for the data field to make the > > > structure well specified. > > > > yes, BIND_PROCESS is only a place-holder here. From kernel p.o.v., both > > BIND_GUEST_PASID and BIND_PROCESS are bind requests from userspace. > > So the purposes are aligned. Below is the content the @data[] field > > supposed to convey for BIND_PROCESS. If we use union, it would leave > > space for extending it to support BIND_PROCESS. If only data[], it is a little > > bit confusing why we define it in such manner if BIND_PROCESS is included > > in this series. Please feel free let me know which one suits better. > > > > +struct vfio_iommu_type1_bind_process { > > + __u32 flags; > > +#define VFIO_IOMMU_BIND_PID (1 << 0) > > + __u32 pasid; > > + __s32 pid; > > +}; > > https://patchwork.kernel.org/patch/10394927/ > > Note that I don't plan to upstream BIND_PROCESS at the moment. It was > useful for testing but I don't know of anyone actually needing it. > > > > > > That bind data > > > > > structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How > > > does > > > > > a user determine what formats are accepted from within the vfio API (or > > > > > even outside of the vfio API)? > > > > > > > > The info is provided by vIOMMU emulator (e.g. virtual VT-d). The vSVA patch > > > > from Jacob has a sanity check on it. > > > > https://lkml.org/lkml/2019/10/28/873 > > > > > > The vIOMMU emulator runs at a layer above vfio. How does the vIOMMU > > > emulator know that the vfio interface supports virtual VT-d? IMO, it's > > > not acceptable that the user simply assume that an Intel host platform > > > supports VT-d. For example, consider what happens when we need to > > > define IOMMU_PASID_FORMAT_INTEL_VTDv2. How would the user learn that > > > VTDv2 is supported and the original VTD format is not supported? > > > > I guess this may be another info VFIO_IOMMU_GET_INFO should provide. > > It makes sense that vfio be aware of what platform it is running on. right? > > After vfio gets the info, may let vfio fill in the format info. Is it the correct > > direction? > > I thought you were planning to put that information in sysfs? We last > discussed this over a year ago so I don't remember where we left it. I > know Alex isn't keen on putting in sysfs what can be communicated through > VFIO, but it is a convenient way to describe IOMMU features: > http://www.linux-arm.org/git?p=linux- > jpb.git;a=commitdiff;h=665370d5b5e0022c24b2d2b57975ef6fe7b40870;hp=7ce780 > d838889b53f5e04ba5d444520621261eda > > My problem with GET_INFO was that it could be difficult to extend, and > to describe things like variable-size list of supported page table > formats, but I guess the new info capabilities make this easier. > > Thanks, > Jean
On Mon, 25 Nov 2019 07:45:18 +0000 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > Hi Alex, > > Thanks for the review. Here I'd like to conclude the major opens in this > thread and see if we can get some agreements to prepare a new version. > > a) IOCTLs for BIND_GPASID and BIND_PROCESS, share a single IOCTL or two > separate IOCTLs? > Yi: It may be helpful to have separate IOCTLs. The bind data conveyed > for BIND_GPASID and BIND_PROCESS are totally different, and the struct > iommu_gpasid_bind_data has vendor specific data and may even have more > versions in future. To better maintain it, I guess separate IOCTLs for > the two bind types would be better. The structure for BIND_GPASID is > as below: > > struct vfio_iommu_type1_bind { > __u32 argsz; > struct iommu_gpasid_bind_data bind_data; > }; We've been rather successful at extending ioctls in vfio and I'm generally opposed to rampant ioctl proliferation. If we added @flags to the above struct (as pretty much the standard for vfio ioctls), then we could use it to describe the type of binding to perform and therefore the type of data provided. I think my major complaint here was that we were defining PROCESS but not implementing it. We can design the ioctl to enable it, but not define it until it's implemented. > b) how kernel-space learns the number of bytes to be copied (a.k.a. the > usage of @version field and @format field of struct > iommu_gpasid_bind_data) > Yi: Jean has an excellent recap in prior reply on the plan of future > extensions regards to @version field and @format field. Based on the > plan, kernel space needs to parse the @version field and @format field > to get the length of the current BIND_GPASID request. Also kernel needs > to maintain the new and old structure versions. Follow specific > deprecation policy in future. Yes, it seems reasonable, so from the struct above (plus @flags) we could determine we have struct iommu_gpasid_bind_data as the payload and read that using @version and @format as outlined. > c) how can vIOMMU emulator know that the vfio interface supports to config > dual stage translation for vIOMMU? > Yi: may do it via VFIO_IOMMU_GET_INFO. Yes please. > d) how can vIOMMU emulator know what @version and @format should be set > in struct iommu_gpasid_bind_data? > Yi: currently, we have two ways. First one, may do it via > VFIO_IOMMU_GET_INFO. This is a natural idea as here @version and @format > are used in vfio apis. It makes sense to let vfio to provide related info > to vIOMMU emulator after checking with vendor specific iommu driver. Also, > there is idea to do it via sysfs (/sys/class/iommu/dmar#) as we have plan > to do IOMMU capability sync between vIOMMU and pIOMMU via sysfs. I have > two concern on this option. Current iommu sysfs only provides vendor > specific hardware infos. I'm not sure if it is good to expose infos > defined in IOMMU generic layer via iommu sysfs. If this concern is not > a big thing, I'm fine with both options. This seems like the same issue we had with IOMMU reserved regions, I'd prefer that a user can figure out how to interact with the vfio interface through the vfio interface. Forcing the user to poke around in sysfs requires the user to have read permissions to sysfs in places they otherwise wouldn't need. Thanks, Alex > > From: Jean-Philippe Brucker [mailto:jean-philippe@linaro.org] > > Sent: Wednesday, November 13, 2019 6:29 PM > > To: Liu, Yi L <yi.l.liu@intel.com> > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > On Wed, Nov 13, 2019 at 07:43:43AM +0000, Liu, Yi L wrote: > > > > From: Alex Williamson <alex.williamson@redhat.com> > > > > Sent: Wednesday, November 13, 2019 1:26 AM > > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > > > > > > > On Tue, 12 Nov 2019 11:21:40 +0000 > > > > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > > > > > > > From: Alex Williamson < alex.williamson@redhat.com > > > > > > > Sent: Friday, November 8, 2019 7:21 AM > > > > > > To: Liu, Yi L <yi.l.liu@intel.com> > > > > > > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to > > host > > > > > > > > > > > > On Thu, 24 Oct 2019 08:26:23 -0400 > > > > > > Liu Yi L <yi.l.liu@intel.com> wrote: > > > > > > > > > > > > > This patch adds vfio support to bind guest translation structure > > > > > > > to host iommu. VFIO exposes iommu programming capability to user- > > > > > > > space. Guest is a user-space application in host under KVM solution. > > > > > > > For SVA usage in Virtual Machine, guest owns GVA->GPA translation > > > > > > > structure. And this part should be passdown to host to enable nested > > > > > > > translation (or say two stage translation). This patch reuses the > > > > > > > VFIO_IOMMU_BIND proposal from Jean-Philippe Brucker, and adds new > > > > > > > bind type for binding guest owned translation structure to host. > > > > > > > > > > > > > > *) Add two new ioctls for VFIO containers. > > > > > > > > > > > > > > - VFIO_IOMMU_BIND: for bind request from userspace, it could be > > > > > > > bind a process to a pasid or bind a guest pasid > > > > > > > to a device, this is indicated by type > > > > > > > - VFIO_IOMMU_UNBIND: for unbind request from userspace, it could be > > > > > > > unbind a process to a pasid or unbind a guest pasid > > > > > > > to a device, also indicated by type > > > > > > > - Bind type: > > > > > > > VFIO_IOMMU_BIND_PROCESS: user-space request to bind a > > process > > > > > > > to a device > > > > > > > VFIO_IOMMU_BIND_GUEST_PASID: bind guest owned translation > > > > > > > structure to host iommu. e.g. guest page table > > > > > > > > > > > > > > *) Code logic in vfio_iommu_type1_ioctl() to handle > > > > VFIO_IOMMU_BIND/UNBIND > > > > > > > > > > [...] > > > > > > > +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, > > > > > > > + void __user *arg, > > > > > > > + struct vfio_iommu_type1_bind > > *bind) > > > > > > > +{ > > > > > > > + struct iommu_gpasid_bind_data gbind_data; > > > > > > > + unsigned long minsz; > > > > > > > + int ret = 0; > > > > > > > + > > > > > > > + minsz = sizeof(*bind) + sizeof(gbind_data); > > > > > > > + if (bind->argsz < minsz) > > > > > > > + return -EINVAL; > > > > > > > > > > > > But gbind_data can change size if new vendor specific data is added to > > > > > > the union, so kernel updates break existing userspace. Fail. > > > > I guess we could take minsz up to the vendor-specific data, copy @format, > > and then check the size of vendor-specific data? > > > > > > > > > > > > yes, we have a version field in struct iommu_gpasid_bind_data. How > > > > > about doing sanity check per versions? kernel knows the gbind_data > > > > > size of specific versions. Does it make sense? If yes, I'll also apply it > > > > > to the other sanity check in this series to avoid userspace fail after > > > > > kernel update. > > > > > > > > Has it already been decided that the version field will be updated for > > > > every addition to the union? > > > > > > No, just my proposal. Jacob may help to explain the purpose of version > > > field. But if we may be too "frequent" for an uapi version number updating > > > if we inc version for each change in the union part. I may vote for the > > > second option from you below. > > > > > > > It seems there are two options, either > > > > the version definition includes the possible contents of the union, > > > > which means we need to support multiple versions concurrently in the > > > > kernel to maintain compatibility with userspace and follow deprecation > > > > protocols for removing that support, or we need to consider version to > > > > be the general form of the structure and interpret the format field to > > > > determine necessary length to copy from the user. > > > > > > As I mentioned above, may be better to let @version field only over the > > > general fields and let format to cover the possible changes in union. e.g. > > > IOMMU_PASID_FORMAT_INTEL_VTD2 may means version 2 of Intel > > > VT-d bind. But either way, I think we need to let kernel maintain multiple > > > versions to support compatible userspace. e.g. may have multiple versions > > > iommu_gpasid_bind_data_vtd struct in the union part. > > > > I couldn't find where the @version field originated in our old > > discussions, but I believe our plan for allowing future extensions was: > > > > * Add new vendor-specific data by introducing a new format > > (IOMMU_PASID_FORMAT_INTEL_VTD2, > > IOMMU_PASID_FORMAT_ARM_SMMUV2...), and > > extend the union. > > > > * Add a new common field, if it fits in the existing padding bytes, by > > adding a flag (IOMMU_SVA_GPASID_*). > > > > * Add a new common field, if it doesn't fit in the current padding bytes, > > or completely change the structure layout, by introducing a new version > > (IOMMU_GPASID_BIND_VERSION_2). In that case the kernel has to handle > > both new and old structure versions. It would have both > > iommu_gpasid_bind_data and iommu_gpasid_bind_data_v2 structs. > > > > I think iommu_cache_invalidate_info and iommu_page_response use the same > > scheme. iommu_fault is a bit more complicated because it's > > kernel->userspace and requires some negotiation: > > https://lore.kernel.org/linux-iommu/77405d39-81a4-d9a8-5d35- > > 27602199867a@arm.com/ > > > > [...] > > > > If the ioctls have similar purpose and form, then re-using a single > > > > ioctl might make sense, but BIND_PROCESS is only a place-holder in this > > > > series, which is not acceptable. A dual purpose ioctl does not > > > > preclude that we could also use a union for the data field to make the > > > > structure well specified. > > > > > > yes, BIND_PROCESS is only a place-holder here. From kernel p.o.v., both > > > BIND_GUEST_PASID and BIND_PROCESS are bind requests from userspace. > > > So the purposes are aligned. Below is the content the @data[] field > > > supposed to convey for BIND_PROCESS. If we use union, it would leave > > > space for extending it to support BIND_PROCESS. If only data[], it is a little > > > bit confusing why we define it in such manner if BIND_PROCESS is included > > > in this series. Please feel free let me know which one suits better. > > > > > > +struct vfio_iommu_type1_bind_process { > > > + __u32 flags; > > > +#define VFIO_IOMMU_BIND_PID (1 << 0) > > > + __u32 pasid; > > > + __s32 pid; > > > +}; > > > https://patchwork.kernel.org/patch/10394927/ > > > > Note that I don't plan to upstream BIND_PROCESS at the moment. It was > > useful for testing but I don't know of anyone actually needing it. > > > > > > > > That bind data > > > > > > structure expects a format (ex. IOMMU_PASID_FORMAT_INTEL_VTD). How > > > > does > > > > > > a user determine what formats are accepted from within the vfio API (or > > > > > > even outside of the vfio API)? > > > > > > > > > > The info is provided by vIOMMU emulator (e.g. virtual VT-d). The vSVA patch > > > > > from Jacob has a sanity check on it. > > > > > https://lkml.org/lkml/2019/10/28/873 > > > > > > > > The vIOMMU emulator runs at a layer above vfio. How does the vIOMMU > > > > emulator know that the vfio interface supports virtual VT-d? IMO, it's > > > > not acceptable that the user simply assume that an Intel host platform > > > > supports VT-d. For example, consider what happens when we need to > > > > define IOMMU_PASID_FORMAT_INTEL_VTDv2. How would the user learn that > > > > VTDv2 is supported and the original VTD format is not supported? > > > > > > I guess this may be another info VFIO_IOMMU_GET_INFO should provide. > > > It makes sense that vfio be aware of what platform it is running on. right? > > > After vfio gets the info, may let vfio fill in the format info. Is it the correct > > > direction? > > > > I thought you were planning to put that information in sysfs? We last > > discussed this over a year ago so I don't remember where we left it. I > > know Alex isn't keen on putting in sysfs what can be communicated through > > VFIO, but it is a convenient way to describe IOMMU features: > > http://www.linux-arm.org/git?p=linux- > > jpb.git;a=commitdiff;h=665370d5b5e0022c24b2d2b57975ef6fe7b40870;hp=7ce780 > > d838889b53f5e04ba5d444520621261eda > > > > My problem with GET_INFO was that it could be difficult to extend, and > > to describe things like variable-size list of supported page table > > formats, but I guess the new info capabilities make this easier. > > > > Thanks, > > Jean >
> From: Alex Williamson [mailto:alex.williamson@redhat.com] > Sent: Tuesday, December 3, 2019 8:12 AM > To: Liu, Yi L <yi.l.liu@intel.com> > Subject: Re: [RFC v2 3/3] vfio/type1: bind guest pasid (guest page tables) to host > > On Mon, 25 Nov 2019 07:45:18 +0000 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > Hi Alex, > > > > Thanks for the review. Here I'd like to conclude the major opens in this > > thread and see if we can get some agreements to prepare a new version. > > > > a) IOCTLs for BIND_GPASID and BIND_PROCESS, share a single IOCTL or two > > separate IOCTLs? > > Yi: It may be helpful to have separate IOCTLs. The bind data conveyed > > for BIND_GPASID and BIND_PROCESS are totally different, and the struct > > iommu_gpasid_bind_data has vendor specific data and may even have more > > versions in future. To better maintain it, I guess separate IOCTLs for > > the two bind types would be better. The structure for BIND_GPASID is > > as below: > > > > struct vfio_iommu_type1_bind { > > __u32 argsz; > > struct iommu_gpasid_bind_data bind_data; > > }; > > > We've been rather successful at extending ioctls in vfio and I'm > generally opposed to rampant ioctl proliferation. If we added @flags > to the above struct (as pretty much the standard for vfio ioctls), then > we could use it to describe the type of binding to perform and > therefore the type of data provided. I think my major complaint here > was that we were defining PROCESS but not implementing it. We can > design the ioctl to enable it, but not define it until it's implemented. sure, so I'll pull back the @flags field. BTW. Regards to the payloads, what would be preferred? @data[] or a wrapper structure like below? union { struct iommu_gpasid_bind_data bind_gpasid; }bind_data; > > b) how kernel-space learns the number of bytes to be copied (a.k.a. the > > usage of @version field and @format field of struct > > iommu_gpasid_bind_data) > > Yi: Jean has an excellent recap in prior reply on the plan of future > > extensions regards to @version field and @format field. Based on the > > plan, kernel space needs to parse the @version field and @format field > > to get the length of the current BIND_GPASID request. Also kernel needs > > to maintain the new and old structure versions. Follow specific > > deprecation policy in future. > > Yes, it seems reasonable, so from the struct above (plus @flags) we > could determine we have struct iommu_gpasid_bind_data as the payload > and read that using @version and @format as outlined. sure, thanks. > > c) how can vIOMMU emulator know that the vfio interface supports to config > > dual stage translation for vIOMMU? > > Yi: may do it via VFIO_IOMMU_GET_INFO. > > Yes please. got it. > > d) how can vIOMMU emulator know what @version and @format should be set > > in struct iommu_gpasid_bind_data? > > Yi: currently, we have two ways. First one, may do it via > > VFIO_IOMMU_GET_INFO. This is a natural idea as here @version and @format > > are used in vfio apis. It makes sense to let vfio to provide related info > > to vIOMMU emulator after checking with vendor specific iommu driver. Also, > > there is idea to do it via sysfs (/sys/class/iommu/dmar#) as we have plan > > to do IOMMU capability sync between vIOMMU and pIOMMU via sysfs. I have > > two concern on this option. Current iommu sysfs only provides vendor > > specific hardware infos. I'm not sure if it is good to expose infos > > defined in IOMMU generic layer via iommu sysfs. If this concern is not > > a big thing, I'm fine with both options. > > This seems like the same issue we had with IOMMU reserved regions, I'd > prefer that a user can figure out how to interact with the vfio > interface through the vfio interface. Forcing the user to poke around > in sysfs requires the user to have read permissions to sysfs in places > they otherwise wouldn't need. Thanks, thanks, let me prepare a new version. Regards, Yi Liu > Alex
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3d73a7d..1a27e25 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2325,6 +2325,104 @@ static int vfio_iommu_type1_pasid_free(struct vfio_iommu *iommu, return ret; } +static int vfio_bind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + struct iommu_gpasid_bind_data *ustruct = + (struct iommu_gpasid_bind_data *) dc->data; + + return iommu_sva_bind_gpasid(dc->domain, dev, ustruct); +} + +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + struct iommu_gpasid_bind_data *ustruct = + (struct iommu_gpasid_bind_data *) dc->data; + + return iommu_sva_unbind_gpasid(dc->domain, dev, + ustruct->hpasid); +} + +/* + * unbind specific gpasid, caller of this function requires hold + * vfio_iommu->lock + */ +static long vfio_iommu_type1_do_guest_unbind(struct vfio_iommu *iommu, + struct iommu_gpasid_bind_data *gbind_data) +{ + return vfio_iommu_lookup_dev(iommu, vfio_unbind_gpasid_fn, gbind_data); +} + +static long vfio_iommu_type1_bind_gpasid(struct vfio_iommu *iommu, + void __user *arg, + struct vfio_iommu_type1_bind *bind) +{ + struct iommu_gpasid_bind_data gbind_data; + unsigned long minsz; + int ret = 0; + + minsz = sizeof(*bind) + sizeof(gbind_data); + if (bind->argsz < minsz) + return -EINVAL; + + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) + return -EFAULT; + + mutex_lock(&iommu->lock); + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = vfio_iommu_lookup_dev(iommu, vfio_bind_gpasid_fn, &gbind_data); + /* + * If bind failed, it may not be a total failure. Some devices within + * the iommu group may have bind successfully. Although we don't enable + * pasid capability for non-singletion iommu groups, a unbind operation + * would be helpful to ensure no partial binding for an iommu group. + */ + if (ret) + /* + * Undo all binds that already succeeded, no need to check the + * return value here since some device within the group has no + * successful bind when coming to this place switch. + */ + vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu, + void __user *arg, + struct vfio_iommu_type1_bind *bind) +{ + struct iommu_gpasid_bind_data gbind_data; + unsigned long minsz; + int ret = 0; + + minsz = sizeof(*bind) + sizeof(gbind_data); + if (bind->argsz < minsz) + return -EINVAL; + + if (copy_from_user(&gbind_data, arg, sizeof(gbind_data))) + return -EFAULT; + + mutex_lock(&iommu->lock); + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = vfio_iommu_type1_do_guest_unbind(iommu, &gbind_data); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -2484,6 +2582,44 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, default: return -EINVAL; } + + } else if (cmd == VFIO_IOMMU_BIND) { + struct vfio_iommu_type1_bind bind; + + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); + + if (copy_from_user(&bind, (void __user *)arg, minsz)) + return -EFAULT; + + if (bind.argsz < minsz) + return -EINVAL; + + switch (bind.bind_type) { + case VFIO_IOMMU_BIND_GUEST_PASID: + return vfio_iommu_type1_bind_gpasid(iommu, + (void __user *)(arg + minsz), &bind); + default: + return -EINVAL; + } + + } else if (cmd == VFIO_IOMMU_UNBIND) { + struct vfio_iommu_type1_bind bind; + + minsz = offsetofend(struct vfio_iommu_type1_bind, bind_type); + + if (copy_from_user(&bind, (void __user *)arg, minsz)) + return -EFAULT; + + if (bind.argsz < minsz) + return -EINVAL; + + switch (bind.bind_type) { + case VFIO_IOMMU_BIND_GUEST_PASID: + return vfio_iommu_type1_unbind_gpasid(iommu, + (void __user *)(arg + minsz), &bind); + default: + return -EINVAL; + } } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 04de290..78e8c64 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -832,6 +832,50 @@ struct vfio_iommu_type1_pasid_request { */ #define VFIO_IOMMU_PASID_REQUEST _IO(VFIO_TYPE, VFIO_BASE + 27) +enum vfio_iommu_bind_type { + VFIO_IOMMU_BIND_PROCESS, + VFIO_IOMMU_BIND_GUEST_PASID, +}; + +/* + * Supported types: + * - VFIO_IOMMU_BIND_GUEST_PASID: bind guest pasid, which invoked + * by guest, it takes iommu_gpasid_bind_data in data. + */ +struct vfio_iommu_type1_bind { + __u32 argsz; + enum vfio_iommu_bind_type bind_type; + __u8 data[]; +}; + +/* + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 28, struct vfio_iommu_bind) + * + * Manage address spaces of devices in this container. Initially a TYPE1 + * container can only have one address space, managed with + * VFIO_IOMMU_MAP/UNMAP_DMA. + * + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page + * tables, and BIND manages the stage-1 (guest) page tables. Other types of + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls + * non-PASID traffic and BIND controls PASID traffic. But this depends on the + * underlying IOMMU architecture and isn't guaranteed. + * + * Availability of this feature depends on the device, its bus, the underlying + * IOMMU and the CPU architecture. + * + * returns: 0 on success, -errno on failure. + */ +#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 28) + +/* + * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 29, struct vfio_iommu_bind) + * + * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl. + */ +#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 29) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /*