
[v5,01/12] iommu: Introduce a replace API for device pasid

Message ID 20241104132513.15890-2-yi.l.liu@intel.com (mailing list archive)
State New
Series iommufd support pasid attach/replace

Commit Message

Yi Liu Nov. 4, 2024, 1:25 p.m. UTC
Provide a high-level API to allow replacement of one domain with
another for a specific pasid of a device. This is similar to
iommu_group_replace_domain() and is expected to be used only by
IOMMUFD.
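
For illustration, a minimal caller-side sketch of the intended flow
(domain_a, domain_b, handle_a, handle_b and the surrounding code are
placeholders, not part of this patch):

	/*
	 * Illustrative only: how a caller like iommufd is expected to use
	 * the new API. Everything except the two iommu_* calls is a
	 * placeholder.
	 */
	ret = iommu_attach_device_pasid(domain_a, dev, pasid, handle_a);
	if (ret)
		return ret;

	/*
	 * Later: switch the pasid to domain_b without an intermediate
	 * detach. On failure the pasid stays attached to domain_a.
	 */
	ret = iommu_replace_device_pasid(domain_b, dev, pasid, handle_b);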

Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/iommu/iommu-priv.h |  4 ++
 drivers/iommu/iommu.c      | 90 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)

Comments

Baolu Lu Nov. 5, 2024, 3:58 a.m. UTC | #1
On 11/4/24 21:25, Yi Liu wrote:
> +/**
> + * iommu_replace_device_pasid - Replace the domain that a pasid is attached to
> + * @domain: the new iommu domain
> + * @dev: the attached device.
> + * @pasid: the pasid of the device.
> + * @handle: the attach handle.
> + *
> + * This API allows the pasid to switch domains. Return 0 on success, or an
> + * error. The pasid will keep the old configuration if replacement failed.
> + * This is supposed to be used by iommufd, and iommufd can guarantee that
> + * both iommu_attach_device_pasid() and iommu_replace_device_pasid() would
> + * pass in a valid @handle.
> + */
> +int iommu_replace_device_pasid(struct iommu_domain *domain,
> +			       struct device *dev, ioasid_t pasid,
> +			       struct iommu_attach_handle *handle)
> +{
> +	/* Caller must be a probed driver on dev */
> +	struct iommu_group *group = dev->iommu_group;
> +	struct iommu_attach_handle *curr;
> +	int ret;
> +
> +	if (!domain->ops->set_dev_pasid)
> +		return -EOPNOTSUPP;
> +
> +	if (!group)
> +		return -ENODEV;
> +
> +	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
> +	    pasid == IOMMU_NO_PASID || !handle)
> +		return -EINVAL;
> +
> +	handle->domain = domain;
> +
> +	mutex_lock(&group->mutex);
> +	/*
> +	 * The iommu_attach_handle of the pasid becomes inconsistent with the
> +	 * actual handle per the below operation. The concurrent PRI path will
> +	 * deliver the PRQs per the new handle, this does not have a functional
> +	 * impact. The PRI path would eventually become consistent when the
> +	 * replacement is done.
> +	 */
> +	curr = (struct iommu_attach_handle *)xa_store(&group->pasid_array,
> +						      pasid, handle,
> +						      GFP_KERNEL);

The iommu drivers can only flush pending PRs in the hardware queue when
__iommu_set_group_pasid() is called. So, it appears more reasonable to
reorder things like this:

	__iommu_set_group_pasid();
	switch_attach_handle();

Or anything I overlooked?

> +	if (!curr) {
> +		xa_erase(&group->pasid_array, pasid);
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = xa_err(curr);
> +	if (ret)
> +		goto out_unlock;
> +
> +	if (curr->domain == domain)
> +		goto out_unlock;
> +
> +	ret = __iommu_set_group_pasid(domain, group, pasid, curr->domain);
> +	if (ret)
> +		WARN_ON(handle != xa_store(&group->pasid_array, pasid,
> +					   curr, GFP_KERNEL));
> +out_unlock:
> +	mutex_unlock(&group->mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, IOMMUFD_INTERNAL);
> +
>   /*
>    * iommu_detach_device_pasid() - Detach the domain from pasid of device
>    * @domain: the iommu domain.

--
baolu
Yi Liu Nov. 5, 2024, 7:49 a.m. UTC | #2
On 2024/11/5 11:58, Baolu Lu wrote:
> On 11/4/24 21:25, Yi Liu wrote:
>> +/**
>> + * iommu_replace_device_pasid - Replace the domain that a pasid is 
>> attached to
>> + * @domain: the new iommu domain
>> + * @dev: the attached device.
>> + * @pasid: the pasid of the device.
>> + * @handle: the attach handle.
>> + *
>> + * This API allows the pasid to switch domains. Return 0 on success, or an
>> + * error. The pasid will keep the old configuration if replacement failed.
>> + * This is supposed to be used by iommufd, and iommufd can guarantee that
>> + * both iommu_attach_device_pasid() and iommu_replace_device_pasid() would
>> + * pass in a valid @handle.
>> + */
>> +int iommu_replace_device_pasid(struct iommu_domain *domain,
>> +                   struct device *dev, ioasid_t pasid,
>> +                   struct iommu_attach_handle *handle)
>> +{
>> +    /* Caller must be a probed driver on dev */
>> +    struct iommu_group *group = dev->iommu_group;
>> +    struct iommu_attach_handle *curr;
>> +    int ret;
>> +
>> +    if (!domain->ops->set_dev_pasid)
>> +        return -EOPNOTSUPP;
>> +
>> +    if (!group)
>> +        return -ENODEV;
>> +
>> +    if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
>> +        pasid == IOMMU_NO_PASID || !handle)
>> +        return -EINVAL;
>> +
>> +    handle->domain = domain;
>> +
>> +    mutex_lock(&group->mutex);
>> +    /*
>> +     * The iommu_attach_handle of the pasid becomes inconsistent with the
>> +     * actual handle per the below operation. The concurrent PRI path will
>> +     * deliver the PRQs per the new handle, this does not have a functional
>> +     * impact. The PRI path would eventually become consistent when the
>> +     * replacement is done.
>> +     */
>> +    curr = (struct iommu_attach_handle *)xa_store(&group->pasid_array,
>> +                              pasid, handle,
>> +                              GFP_KERNEL);
> 
> The iommu drivers can only flush pending PRs in the hardware queue when
> __iommu_set_group_pasid() is called. So, it appears more reasonable to
> reorder things like this:
> 
>      __iommu_set_group_pasid();
>      switch_attach_handle();
> 
> Or anything I overlooked?

I don't quite get why this handle is related to the iommu driver flushing PRs.
Before __iommu_set_group_pasid(), the pasid is still attached to the
old domain, and so is the hw configuration.

>> +    if (!curr) {
>> +        xa_erase(&group->pasid_array, pasid);
>> +        ret = -EINVAL;
>> +        goto out_unlock;
>> +    }
>> +
>> +    ret = xa_err(curr);
>> +    if (ret)
>> +        goto out_unlock;
>> +
>> +    if (curr->domain == domain)
>> +        goto out_unlock;
>> +
>> +    ret = __iommu_set_group_pasid(domain, group, pasid, curr->domain);
>> +    if (ret)
>> +        WARN_ON(handle != xa_store(&group->pasid_array, pasid,
>> +                       curr, GFP_KERNEL));
>> +out_unlock:
>> +    mutex_unlock(&group->mutex);
>> +    return ret;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, IOMMUFD_INTERNAL);
>> +
>>   /*
>>    * iommu_detach_device_pasid() - Detach the domain from pasid of device
>>    * @domain: the iommu domain.
> 
> -- 
> baolu
Baolu Lu Nov. 5, 2024, 7:57 a.m. UTC | #3
On 2024/11/5 15:49, Yi Liu wrote:
> On 2024/11/5 11:58, Baolu Lu wrote:
>> On 11/4/24 21:25, Yi Liu wrote:
>>> +/**
>>> + * iommu_replace_device_pasid - Replace the domain that a pasid is 
>>> attached to
>>> + * @domain: the new iommu domain
>>> + * @dev: the attached device.
>>> + * @pasid: the pasid of the device.
>>> + * @handle: the attach handle.
>>> + *
>>> + * This API allows the pasid to switch domains. Return 0 on success, 
>>> or an
>>> + * error. The pasid will keep the old configuration if replacement 
>>> failed.
>>> + * This is supposed to be used by iommufd, and iommufd can guarantee 
>>> that
>>> + * both iommu_attach_device_pasid() and iommu_replace_device_pasid() 
>>> would
>>> + * pass in a valid @handle.
>>> + */
>>> +int iommu_replace_device_pasid(struct iommu_domain *domain,
>>> +                   struct device *dev, ioasid_t pasid,
>>> +                   struct iommu_attach_handle *handle)
>>> +{
>>> +    /* Caller must be a probed driver on dev */
>>> +    struct iommu_group *group = dev->iommu_group;
>>> +    struct iommu_attach_handle *curr;
>>> +    int ret;
>>> +
>>> +    if (!domain->ops->set_dev_pasid)
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    if (!group)
>>> +        return -ENODEV;
>>> +
>>> +    if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
>>> +        pasid == IOMMU_NO_PASID || !handle)
>>> +        return -EINVAL;
>>> +
>>> +    handle->domain = domain;
>>> +
>>> +    mutex_lock(&group->mutex);
>>> +    /*
>>> +     * The iommu_attach_handle of the pasid becomes inconsistent 
>>> with the
>>> +     * actual handle per the below operation. The concurrent PRI 
>>> path will
>>> +     * deliver the PRQs per the new handle, this does not have a 
>>> functional
>>> +     * impact. The PRI path would eventually become consistent when the
>>> +     * replacement is done.
>>> +     */
>>> +    curr = (struct iommu_attach_handle *)xa_store(&group->pasid_array,
>>> +                              pasid, handle,
>>> +                              GFP_KERNEL);
>>
>> The iommu drivers can only flush pending PRs in the hardware queue when
>> __iommu_set_group_pasid() is called. So, it appears more reasonable to
>> reorder things like this:
>>
>>      __iommu_set_group_pasid();
>>      switch_attach_handle();
>>
>> Or anything I overlooked?
> 
> not quite get why this handle is related to iommu driver flushing PRs.
> Before __iommu_set_group_pasid(), the pasid is still attached with the
> old domain, so is the hw configuration.

I meant that in the path of __iommu_set_group_pasid(), the iommu drivers
have the opportunity to flush the PRs pending in the hardware queue. If
the attach_handle is switched (by calling xa_store()) before
__iommu_set_group_pasid(), the pending PRs will be routed to iopf
handler of the new domain, which is not desirable.
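
In other words, the routing decision comes down to this lookup on the fault
path (a simplified sketch; locking and error handling are omitted and
report_fault_to_handler() is just an illustrative name):

	/* Whichever handle xa_store() has published picks the iopf handler. */
	handle = xa_load(&group->pasid_array, pasid);
	if (handle)
		report_fault_to_handler(handle->domain, fault);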

--
baolu
Yi Liu Nov. 5, 2024, 8:10 a.m. UTC | #4
On 2024/11/5 15:57, Baolu Lu wrote:
> On 2024/11/5 15:49, Yi Liu wrote:
>> On 2024/11/5 11:58, Baolu Lu wrote:
>>> On 11/4/24 21:25, Yi Liu wrote:
>>>> +/**
>>>> + * iommu_replace_device_pasid - Replace the domain that a pasid is 
>>>> attached to
>>>> + * @domain: the new iommu domain
>>>> + * @dev: the attached device.
>>>> + * @pasid: the pasid of the device.
>>>> + * @handle: the attach handle.
>>>> + *
>>>> + * This API allows the pasid to switch domains. Return 0 on success, 
>>>> or an
>>>> + * error. The pasid will keep the old configuration if replacement 
>>>> failed.
>>>> + * This is supposed to be used by iommufd, and iommufd can guarantee that
>>>> + * both iommu_attach_device_pasid() and iommu_replace_device_pasid() 
>>>> would
>>>> + * pass in a valid @handle.
>>>> + */
>>>> +int iommu_replace_device_pasid(struct iommu_domain *domain,
>>>> +                   struct device *dev, ioasid_t pasid,
>>>> +                   struct iommu_attach_handle *handle)
>>>> +{
>>>> +    /* Caller must be a probed driver on dev */
>>>> +    struct iommu_group *group = dev->iommu_group;
>>>> +    struct iommu_attach_handle *curr;
>>>> +    int ret;
>>>> +
>>>> +    if (!domain->ops->set_dev_pasid)
>>>> +        return -EOPNOTSUPP;
>>>> +
>>>> +    if (!group)
>>>> +        return -ENODEV;
>>>> +
>>>> +    if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
>>>> +        pasid == IOMMU_NO_PASID || !handle)
>>>> +        return -EINVAL;
>>>> +
>>>> +    handle->domain = domain;
>>>> +
>>>> +    mutex_lock(&group->mutex);
>>>> +    /*
>>>> +     * The iommu_attach_handle of the pasid becomes inconsistent with the
>>>> +     * actual handle per the below operation. The concurrent PRI path 
>>>> will
>>>> +     * deliver the PRQs per the new handle, this does not have a 
>>>> functional
>>>> +     * impact. The PRI path would eventually become consistent when the
>>>> +     * replacement is done.
>>>> +     */
>>>> +    curr = (struct iommu_attach_handle *)xa_store(&group->pasid_array,
>>>> +                              pasid, handle,
>>>> +                              GFP_KERNEL);
>>>
>>> The iommu drivers can only flush pending PRs in the hardware queue when
>>> __iommu_set_group_pasid() is called. So, it appears more reasonable to
>>> reorder things like this:
>>>
>>>      __iommu_set_group_pasid();
>>>      switch_attach_handle();
>>>
>>> Or anything I overlooked?
>>
>> not quite get why this handle is related to iommu driver flushing PRs.
>> Before __iommu_set_group_pasid(), the pasid is still attached with the
>> old domain, so is the hw configuration.
> 
> I meant that in the path of __iommu_set_group_pasid(), the iommu drivers
> have the opportunity to flush the PRs pending in the hardware queue. If
> the attach_handle is switched (by calling xa_store()) before
> __iommu_set_group_pasid(), the pending PRs will be routed to iopf
> handler of the new domain, which is not desirable.

I see. You mean the handling of PRQs. I thought you were talking
about PRQ draining.

Yet, what you described was discussed before [1]. Forwarding PRQs to the
new domain looks to be ok.

But you reminded me of one thing. What I care about more is the case of
replacing an iopf-capable domain with a non-capable one. This means the
newly arriving PRQs would be responded to by iopf_error_response(). Do you
see an issue here?

[1] https://lore.kernel.org/linux-iommu/20240429135512.GC941030@nvidia.com/
Baolu Lu Nov. 5, 2024, 8:14 a.m. UTC | #5
On 2024/11/5 16:10, Yi Liu wrote:
> On 2024/11/5 15:57, Baolu Lu wrote:
>> On 2024/11/5 15:49, Yi Liu wrote:
>>> On 2024/11/5 11:58, Baolu Lu wrote:
>>>> On 11/4/24 21:25, Yi Liu wrote:
>>>>> +/**
>>>>> + * iommu_replace_device_pasid - Replace the domain that a pasid is 
>>>>> attached to
>>>>> + * @domain: the new iommu domain
>>>>> + * @dev: the attached device.
>>>>> + * @pasid: the pasid of the device.
>>>>> + * @handle: the attach handle.
>>>>> + *
>>>>> + * This API allows the pasid to switch domains. Return 0 on 
>>>>> success, or an
>>>>> + * error. The pasid will keep the old configuration if replacement 
>>>>> failed.
>>>>> + * This is supposed to be used by iommufd, and iommufd can 
>>>>> guarantee that
>>>>> + * both iommu_attach_device_pasid() and 
>>>>> iommu_replace_device_pasid() would
>>>>> + * pass in a valid @handle.
>>>>> + */
>>>>> +int iommu_replace_device_pasid(struct iommu_domain *domain,
>>>>> +                   struct device *dev, ioasid_t pasid,
>>>>> +                   struct iommu_attach_handle *handle)
>>>>> +{
>>>>> +    /* Caller must be a probed driver on dev */
>>>>> +    struct iommu_group *group = dev->iommu_group;
>>>>> +    struct iommu_attach_handle *curr;
>>>>> +    int ret;
>>>>> +
>>>>> +    if (!domain->ops->set_dev_pasid)
>>>>> +        return -EOPNOTSUPP;
>>>>> +
>>>>> +    if (!group)
>>>>> +        return -ENODEV;
>>>>> +
>>>>> +    if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
>>>>> +        pasid == IOMMU_NO_PASID || !handle)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    handle->domain = domain;
>>>>> +
>>>>> +    mutex_lock(&group->mutex);
>>>>> +    /*
>>>>> +     * The iommu_attach_handle of the pasid becomes inconsistent 
>>>>> with the
>>>>> +     * actual handle per the below operation. The concurrent PRI 
>>>>> path will
>>>>> +     * deliver the PRQs per the new handle, this does not have a 
>>>>> functional
>>>>> +     * impact. The PRI path would eventually become consistent 
>>>>> when the
>>>>> +     * replacement is done.
>>>>> +     */
>>>>> +    curr = (struct iommu_attach_handle *)xa_store(&group->pasid_array,
>>>>> +                              pasid, handle,
>>>>> +                              GFP_KERNEL);
>>>>
>>>> The iommu drivers can only flush pending PRs in the hardware queue when
>>>> __iommu_set_group_pasid() is called. So, it appears more reasonable to
>>>> reorder things like this:
>>>>
>>>>      __iommu_set_group_pasid();
>>>>      switch_attach_handle();
>>>>
>>>> Or anything I overlooked?
>>>
>>> not quite get why this handle is related to iommu driver flushing PRs.
>>> Before __iommu_set_group_pasid(), the pasid is still attached with the
>>> old domain, so is the hw configuration.
>>
>> I meant that in the path of __iommu_set_group_pasid(), the iommu drivers
>> have the opportunity to flush the PRs pending in the hardware queue. If
>> the attach_handle is switched (by calling xa_store()) before
>> __iommu_set_group_pasid(), the pending PRs will be routed to iopf
>> handler of the new domain, which is not desirable.
> 
> I see. You mean the handling of PRQs. I was interpreting you are talking
> about PRQ draining.
> 
> yet, what you described was discussed before [1]. Forwarding PRQs to the
> new domain looks to be ok.
> 
> But you reminded me one thing. What I cared about more is the case
> replacing an iopf-capable domain to non-capable domain. This means the new
> coming PRQs would be responded by iopf_error_response(). Do you see an
> issue here?

I am not sure, but it would be more reasonable if you could do it in the
right order. If that's impossible, then add some comments to explain it.

--
baolu
Jason Gunthorpe Nov. 5, 2024, 3:10 p.m. UTC | #6
On Tue, Nov 05, 2024 at 04:10:59PM +0800, Yi Liu wrote:

> > > not quite get why this handle is related to iommu driver flushing PRs.
> > > Before __iommu_set_group_pasid(), the pasid is still attached with the
> > > old domain, so is the hw configuration.
> > 
> > I meant that in the path of __iommu_set_group_pasid(), the iommu drivers
> > have the opportunity to flush the PRs pending in the hardware queue. If
> > the attach_handle is switched (by calling xa_store()) before
> > __iommu_set_group_pasid(), the pending PRs will be routed to iopf
> > handler of the new domain, which is not desirable.
> 
> I see. You mean the handling of PRQs. I was interpreting you are talking
> about PRQ draining.

I don't think we need to worry about this race, and certainly you
shouldn't be making the domain replacement path non-hitless just to
fence the page requests.

If a page request comes in during the race window of domain change
there are only three outcomes:

  1) The old domain handles it and it translates on the old domain
  2) The new domain handles it and it translates on the new domain
  3) The old domain handles it and it translates on the new domain.
     a) The page request is ack'd and the device retries and loads the
       new domain - OK - at best it will use the new translation, at
       worst it will retry.
     b) the page request fails and the device sees the failure. This
        is the same as #1 - OK

All are correct. We don't need to do more here than just let the race
resolve itself.

Once the domains are switched in HW we do have to flush everything
queued due to the fault path locking scheme on the domain.

Jason
Baolu Lu Nov. 6, 2024, 8:52 a.m. UTC | #7
On 2024/11/5 23:10, Jason Gunthorpe wrote:
> On Tue, Nov 05, 2024 at 04:10:59PM +0800, Yi Liu wrote:
> 
>>>> not quite get why this handle is related to iommu driver flushing PRs.
>>>> Before __iommu_set_group_pasid(), the pasid is still attached with the
>>>> old domain, so is the hw configuration.
>>> I meant that in the path of __iommu_set_group_pasid(), the iommu drivers
>>> have the opportunity to flush the PRs pending in the hardware queue. If
>>> the attach_handle is switched (by calling xa_store()) before
>>> __iommu_set_group_pasid(), the pending PRs will be routed to iopf
>>> handler of the new domain, which is not desirable.
>> I see. You mean the handling of PRQs. I was interpreting you are talking
>> about PRQ draining.
> I don't think we need to worry about this race, and certainly you
> shouldn't be making the domain replacement path non-hitless just to
> fence the page requests.
> 
> If a page request comes in during the race window of domain change
> there are only three outcomes:
> 
>    1) The old domain handles it and it translates on the old domain
>    2) The new domain handles it and it translates on the new domain
>    3) The old domain handles it and it translates on the new domain.
>       a) The page request is ack'd and the device retries and loads the
>         new domain - OK - at best it will use the new translation, at
>         worst it will retry.
>       b) the page request fails and the device sees the failure. This
>          is the same as #1 - OK
> 
> All are correct. We don't need to do more here than just let the race
> resolve itself.
> 
> Once the domains are switched in HW we do have to flush everything
> queued due to the fault path locking scheme on the domain.

Agreed. To my understanding, the worst case is that the device retries
the transaction, which might result in another page fault that then sets
up the translation in the new domain.

--
baolu

Patch

diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
index de5b54eaa8bf..90b367de267e 100644
--- a/drivers/iommu/iommu-priv.h
+++ b/drivers/iommu/iommu-priv.h
@@ -27,6 +27,10 @@  static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp
 int iommu_group_replace_domain(struct iommu_group *group,
 			       struct iommu_domain *new_domain);
 
+int iommu_replace_device_pasid(struct iommu_domain *domain,
+			       struct device *dev, ioasid_t pasid,
+			       struct iommu_attach_handle *handle);
+
 int iommu_device_register_bus(struct iommu_device *iommu,
 			      const struct iommu_ops *ops,
 			      const struct bus_type *bus,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 53f8e60acf30..a441ba0a6733 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3411,14 +3411,15 @@  static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
 }
 
 static int __iommu_set_group_pasid(struct iommu_domain *domain,
-				   struct iommu_group *group, ioasid_t pasid)
+				   struct iommu_group *group, ioasid_t pasid,
+				   struct iommu_domain *old)
 {
 	struct group_device *device, *last_gdev;
 	int ret;
 
 	for_each_group_device(group, device) {
 		ret = domain->ops->set_dev_pasid(domain, device->dev,
-						 pasid, NULL);
+						 pasid, old);
 		if (ret)
 			goto err_revert;
 	}
@@ -3430,7 +3431,20 @@  static int __iommu_set_group_pasid(struct iommu_domain *domain,
 	for_each_group_device(group, device) {
 		if (device == last_gdev)
 			break;
-		iommu_remove_dev_pasid(device->dev, pasid, domain);
+		/* If no old domain, undo the succeeded devices/pasid */
+		if (!old) {
+			iommu_remove_dev_pasid(device->dev, pasid, domain);
+			continue;
+		}
+
+		/*
+		 * Rollback the succeeded devices/pasid to the old domain.
+		 * And it is a driver bug to fail attaching with a previously
+		 * good domain.
+		 */
+		if (WARN_ON(old->ops->set_dev_pasid(old, device->dev,
+						    pasid, domain)))
+			iommu_remove_dev_pasid(device->dev, pasid, domain);
 	}
 	return ret;
 }
@@ -3492,7 +3506,7 @@  int iommu_attach_device_pasid(struct iommu_domain *domain,
 	if (ret)
 		goto out_unlock;
 
-	ret = __iommu_set_group_pasid(domain, group, pasid);
+	ret = __iommu_set_group_pasid(domain, group, pasid, NULL);
 	if (ret)
 		xa_erase(&group->pasid_array, pasid);
 out_unlock:
@@ -3501,6 +3515,74 @@  int iommu_attach_device_pasid(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device_pasid);
 
+/**
+ * iommu_replace_device_pasid - Replace the domain that a pasid is attached to
+ * @domain: the new iommu domain
+ * @dev: the attached device.
+ * @pasid: the pasid of the device.
+ * @handle: the attach handle.
+ *
+ * This API allows the pasid to switch domains. Return 0 on success, or an
+ * error. The pasid will keep the old configuration if replacement failed.
+ * This is supposed to be used by iommufd, and iommufd can guarantee that
+ * both iommu_attach_device_pasid() and iommu_replace_device_pasid() would
+ * pass in a valid @handle.
+ */
+int iommu_replace_device_pasid(struct iommu_domain *domain,
+			       struct device *dev, ioasid_t pasid,
+			       struct iommu_attach_handle *handle)
+{
+	/* Caller must be a probed driver on dev */
+	struct iommu_group *group = dev->iommu_group;
+	struct iommu_attach_handle *curr;
+	int ret;
+
+	if (!domain->ops->set_dev_pasid)
+		return -EOPNOTSUPP;
+
+	if (!group)
+		return -ENODEV;
+
+	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
+	    pasid == IOMMU_NO_PASID || !handle)
+		return -EINVAL;
+
+	handle->domain = domain;
+
+	mutex_lock(&group->mutex);
+	/*
+	 * The iommu_attach_handle of the pasid becomes inconsistent with the
+	 * actual handle per the below operation. The concurrent PRI path will
+	 * deliver the PRQs per the new handle, this does not have a functional
+	 * impact. The PRI path would eventually become consistent when the
+	 * replacement is done.
+	 */
+	curr = (struct iommu_attach_handle *)xa_store(&group->pasid_array,
+						      pasid, handle,
+						      GFP_KERNEL);
+	if (!curr) {
+		xa_erase(&group->pasid_array, pasid);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = xa_err(curr);
+	if (ret)
+		goto out_unlock;
+
+	if (curr->domain == domain)
+		goto out_unlock;
+
+	ret = __iommu_set_group_pasid(domain, group, pasid, curr->domain);
+	if (ret)
+		WARN_ON(handle != xa_store(&group->pasid_array, pasid,
+					   curr, GFP_KERNEL));
+out_unlock:
+	mutex_unlock(&group->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, IOMMUFD_INTERNAL);
+
 /*
  * iommu_detach_device_pasid() - Detach the domain from pasid of device
  * @domain: the iommu domain.