diff mbox series

[rc] iommu/arm-smmu-v3: Do not use GFP_KERNEL under as spinlock

Message ID 0-v1-210234560de7+69a-smmu_cd_atomic_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series [rc] iommu/arm-smmu-v3: Do not use GFP_KERNEL under as spinlock | expand

Commit Message

Jason Gunthorpe Feb. 2, 2024, 2:28 p.m. UTC
If the SMMU is configured to use a two level CD table then
arm_smmu_write_ctx_desc() allocates a CD table leaf internally using
GFP_KERNEL. Due to recent changes this is being done under a spinlock to
iterate over the device list - thus it will trigger a sleeping while
atomic warning:

  arm_smmu_sva_set_dev_pasid()
    mutex_lock(&sva_lock);
    __arm_smmu_sva_bind()
     arm_smmu_mmu_notifier_get()
      spin_lock_irqsave()
      arm_smmu_write_ctx_desc()
	arm_smmu_get_cd_ptr()
         arm_smmu_alloc_cd_leaf_table()
	  dmam_alloc_coherent(GFP_KERNEL)

This is a 64K high order allocation and really should not be done
atomically.

The proper answer is to sequence CD programming in a way that does not
require the device list or a spinlock. Part two of my lager series does
this and already solves this bug.

In the mean time we can solve the functional issue by preallocating the CD
entry outside any locking. CD leafs are effectively never freed so this
guarantees the arm_smmu_write_ctx_desc() inside the lock will not take the
allocation path.

Fixes: 24503148c545 ("iommu/arm-smmu-v3: Refactor write_ctx_desc")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/all/4e25d161-0cf8-4050-9aa3-dfa21cd63e56@moroto.mountain/
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c     | 2 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     | 2 ++
 3 files changed, 8 insertions(+), 1 deletion(-)


base-commit: c76c50cd425e574df5a071cd7e1805d0764aabff

Comments

Michael Shavit Feb. 3, 2024, 8:02 a.m. UTC | #1
On Fri, Feb 2, 2024 at 11:28 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> If the SMMU is configured to use a two level CD table then
> arm_smmu_write_ctx_desc() allocates a CD table leaf internally using
> GFP_KERNEL. Due to recent changes this is being done under a spinlock to
> iterate over the device list - thus it will trigger a sleeping while
> atomic warning:
>
>   arm_smmu_sva_set_dev_pasid()
>     mutex_lock(&sva_lock);
>     __arm_smmu_sva_bind()
>      arm_smmu_mmu_notifier_get()
>       spin_lock_irqsave()
>       arm_smmu_write_ctx_desc()
>         arm_smmu_get_cd_ptr()
>          arm_smmu_alloc_cd_leaf_table()
>           dmam_alloc_coherent(GFP_KERNEL)
>
> This is a 64K high order allocation and really should not be done
> atomically.
>
> The proper answer is to sequence CD programming in a way that does not
> require the device list or a spinlock. Part two of my lager series does
> this and already solves this bug.
>
> In the mean time we can solve the functional issue by preallocating the CD
> entry outside any locking. CD leafs are effectively never freed so this
> guarantees the arm_smmu_write_ctx_desc() inside the lock will not take the
> allocation path.
>
> Fixes: 24503148c545 ("iommu/arm-smmu-v3: Refactor write_ctx_desc")
> Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> Closes: https://lore.kernel.org/all/4e25d161-0cf8-4050-9aa3-dfa21cd63e56@moroto.mountain/
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +++++
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c     | 2 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     | 2 ++
>  3 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 05722121f00e70..068d60732e6481 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -588,6 +588,11 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
>  {
>         int ret = 0;
>         struct mm_struct *mm = domain->mm;
> +       struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +
> +       /* Preallocate the required leafs outside locks */
> +       if (!arm_smmu_get_cd_ptr(master, id))
> +               return -ENOMEM;
>
>         mutex_lock(&sva_lock);
>         ret = __arm_smmu_sva_bind(dev, mm);
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 0ffb1cf17e0b2e..ec2bb8685bb50e 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1019,7 +1019,7 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
>         WRITE_ONCE(*dst, cpu_to_le64(val));
>  }
>
> -static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
> +__le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
>  {
>         __le64 *l1ptr;
>         unsigned int idx;
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 65fb388d51734d..fa0eeb3519ef28 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -748,6 +748,8 @@ extern struct xarray arm_smmu_asid_xa;
>  extern struct mutex arm_smmu_asid_lock;
>  extern struct arm_smmu_ctx_desc quiet_cd;
>
> +__le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid);
> +
>  int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid,
>                             struct arm_smmu_ctx_desc *cd);
>  void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
>
> base-commit: c76c50cd425e574df5a071cd7e1805d0764aabff
> --
> 2.43.0
>

Reviewed-by: Michael Shavit <mshavit@google.com>
Will Deacon Feb. 8, 2024, 12:58 p.m. UTC | #2
On Fri, Feb 02, 2024 at 10:28:05AM -0400, Jason Gunthorpe wrote:
> If the SMMU is configured to use a two level CD table then
> arm_smmu_write_ctx_desc() allocates a CD table leaf internally using
> GFP_KERNEL. Due to recent changes this is being done under a spinlock to
> iterate over the device list - thus it will trigger a sleeping while
> atomic warning:
> 
>   arm_smmu_sva_set_dev_pasid()
>     mutex_lock(&sva_lock);
>     __arm_smmu_sva_bind()
>      arm_smmu_mmu_notifier_get()
>       spin_lock_irqsave()
>       arm_smmu_write_ctx_desc()
> 	arm_smmu_get_cd_ptr()
>          arm_smmu_alloc_cd_leaf_table()
> 	  dmam_alloc_coherent(GFP_KERNEL)
> 
> This is a 64K high order allocation and really should not be done
> atomically.
> 
> The proper answer is to sequence CD programming in a way that does not
> require the device list or a spinlock. Part two of my lager series does
> this and already solves this bug.
> 
> In the mean time we can solve the functional issue by preallocating the CD
> entry outside any locking. CD leafs are effectively never freed so this
> guarantees the arm_smmu_write_ctx_desc() inside the lock will not take the
> allocation path.
> 
> Fixes: 24503148c545 ("iommu/arm-smmu-v3: Refactor write_ctx_desc")
> Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> Closes: https://lore.kernel.org/all/4e25d161-0cf8-4050-9aa3-dfa21cd63e56@moroto.mountain/
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +++++
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c     | 2 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     | 2 ++
>  3 files changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 05722121f00e70..068d60732e6481 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -588,6 +588,11 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
>  {
>  	int ret = 0;
>  	struct mm_struct *mm = domain->mm;
> +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +
> +	/* Preallocate the required leafs outside locks */
> +	if (!arm_smmu_get_cd_ptr(master, id))
> +		return -ENOMEM;

What guarantees that all the masters in the domain share the same l2 CD
table? The only reason we need the spinlock is to walk over the device
list, so if that's not necessary then we could push this down into
arm_smmu_mmu_notifier_get().

Will
Jason Gunthorpe Feb. 8, 2024, 2:27 p.m. UTC | #3
On Thu, Feb 08, 2024 at 12:58:47PM +0000, Will Deacon wrote:
> On Fri, Feb 02, 2024 at 10:28:05AM -0400, Jason Gunthorpe wrote:
> > If the SMMU is configured to use a two level CD table then
> > arm_smmu_write_ctx_desc() allocates a CD table leaf internally using
> > GFP_KERNEL. Due to recent changes this is being done under a spinlock to
> > iterate over the device list - thus it will trigger a sleeping while
> > atomic warning:
> > 
> >   arm_smmu_sva_set_dev_pasid()
> >     mutex_lock(&sva_lock);
> >     __arm_smmu_sva_bind()
> >      arm_smmu_mmu_notifier_get()
> >       spin_lock_irqsave()
> >       arm_smmu_write_ctx_desc()
> > 	arm_smmu_get_cd_ptr()
> >          arm_smmu_alloc_cd_leaf_table()
> > 	  dmam_alloc_coherent(GFP_KERNEL)
> > 
> > This is a 64K high order allocation and really should not be done
> > atomically.
> > 
> > The proper answer is to sequence CD programming in a way that does not
> > require the device list or a spinlock. Part two of my lager series does
> > this and already solves this bug.
> > 
> > In the mean time we can solve the functional issue by preallocating the CD
> > entry outside any locking. CD leafs are effectively never freed so this
> > guarantees the arm_smmu_write_ctx_desc() inside the lock will not take the
> > allocation path.
> > 
> > Fixes: 24503148c545 ("iommu/arm-smmu-v3: Refactor write_ctx_desc")
> > Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> > Closes: https://lore.kernel.org/all/4e25d161-0cf8-4050-9aa3-dfa21cd63e56@moroto.mountain/
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > ---
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +++++
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c     | 2 +-
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     | 2 ++
> >  3 files changed, 8 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > index 05722121f00e70..068d60732e6481 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > @@ -588,6 +588,11 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
> >  {
> >  	int ret = 0;
> >  	struct mm_struct *mm = domain->mm;
> > +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> > +
> > +	/* Preallocate the required leafs outside locks */
> > +	if (!arm_smmu_get_cd_ptr(master, id))
> > +		return -ENOMEM;
> 
> What guarantees that all the masters in the domain share the same l2 CD
> table?

Nothing, in fact they don't.

However, a multi-master RID domain is something that doesn't fully
work with SVA today. This code is functionally/security wrong to
install the PASID into all the masters that share the RID domain.

In practice that doesn't happen because the iommu group size when
PASID is enabled is restricted to 1 and the DMA API is the thing that
has created a unique per-group domain to be able to turn SVA on at
all.

So, the RID domain's list has one master entry that is the same as the
set_dev_pasid's master.

Which is why this patch works, it prealloctes the only master that
matters for the only configuration where SVA works and is used today.

iommufd could violate these assumptions which is part of why I'm
fixing all of this in preparation to enable PASID support for all
drivers in iommufd.

> The only reason we need the spinlock is to walk over the device
> list, so if that's not necessary then we could push this down into
> arm_smmu_mmu_notifier_get().

Pushing stuff down into arm_smmu_mmu_notifier_get() is the wrong
direction. CD manipulation needs to be lifted up and out into the
generic PASID level, not pushed down toward more SVA specific code.

This patch is where I got rid of the device list iteration:

https://lore.kernel.org/linux-iommu/11-v4-e7091cdd9e8d+43b1-smmuv3_newapi_p2_jgg@nvidia.com/

By lifting the CD programming up and then into the developing generic
PASID layer.

It is worth to jump ahead to the end, once part 2 is done the
set_dev_pasid SVA op looks like this:

static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
				      struct device *dev, ioasid_t id)
{
	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
	struct arm_smmu_cd target;
	int ret;

	/* Prevent arm_smmu_mm_release from being called while we are attaching */
	if (!mmget_not_zero(domain->mm))
		return -EINVAL;

	/*
	 * This does not need the arm_smmu_asid_lock because SVA domains never
	 * get reassigned
	 */
	arm_smmu_make_sva_cd(&target, master, smmu_domain->domain.mm,
			     smmu_domain->asid,
			     smmu_domain->btm_invalidation);

	ret = arm_smmu_set_pasid(master, smmu_domain, id, &target);

	mmput(domain->mm);
	return ret;
}

Where the cd programming is in arm_smmu_set_pasid() and that code is
fully shared between attaching a normal S1 paging domain and a SVA to
a PASID.

Jason
Will Deacon Feb. 13, 2024, 11:20 a.m. UTC | #4
Hi Jason,

On Thu, Feb 08, 2024 at 10:27:29AM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 08, 2024 at 12:58:47PM +0000, Will Deacon wrote:
> > On Fri, Feb 02, 2024 at 10:28:05AM -0400, Jason Gunthorpe wrote:
> > > If the SMMU is configured to use a two level CD table then
> > > arm_smmu_write_ctx_desc() allocates a CD table leaf internally using
> > > GFP_KERNEL. Due to recent changes this is being done under a spinlock to
> > > iterate over the device list - thus it will trigger a sleeping while
> > > atomic warning:
> > > 
> > >   arm_smmu_sva_set_dev_pasid()
> > >     mutex_lock(&sva_lock);
> > >     __arm_smmu_sva_bind()
> > >      arm_smmu_mmu_notifier_get()
> > >       spin_lock_irqsave()
> > >       arm_smmu_write_ctx_desc()
> > > 	arm_smmu_get_cd_ptr()
> > >          arm_smmu_alloc_cd_leaf_table()
> > > 	  dmam_alloc_coherent(GFP_KERNEL)
> > > 
> > > This is a 64K high order allocation and really should not be done
> > > atomically.
> > > 
> > > The proper answer is to sequence CD programming in a way that does not
> > > require the device list or a spinlock. Part two of my lager series does
> > > this and already solves this bug.
> > > 
> > > In the mean time we can solve the functional issue by preallocating the CD
> > > entry outside any locking. CD leafs are effectively never freed so this
> > > guarantees the arm_smmu_write_ctx_desc() inside the lock will not take the
> > > allocation path.
> > > 
> > > Fixes: 24503148c545 ("iommu/arm-smmu-v3: Refactor write_ctx_desc")
> > > Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> > > Closes: https://lore.kernel.org/all/4e25d161-0cf8-4050-9aa3-dfa21cd63e56@moroto.mountain/
> > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > > ---
> > >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +++++
> > >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c     | 2 +-
> > >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     | 2 ++
> > >  3 files changed, 8 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > > index 05722121f00e70..068d60732e6481 100644
> > > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > > @@ -588,6 +588,11 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
> > >  {
> > >  	int ret = 0;
> > >  	struct mm_struct *mm = domain->mm;
> > > +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> > > +
> > > +	/* Preallocate the required leafs outside locks */
> > > +	if (!arm_smmu_get_cd_ptr(master, id))
> > > +		return -ENOMEM;
> > 
> > What guarantees that all the masters in the domain share the same l2 CD
> > table?
> 
> Nothing, in fact they don't.
> 
> However, a multi-master RID domain is something that doesn't fully
> work with SVA today. This code is functionally/security wrong to
> install the PASID into all the masters that share the RID domain.
> 
> In practice that doesn't happen because the iommu group size when
> PASID is enabled is restricted to 1 and the DMA API is the thing that
> has created a unique per-group domain to be able to turn SVA on at
> all.
> 
> So, the RID domain's list has one master entry that is the same as the
> set_dev_pasid's master.
> 
> Which is why this patch works, it prealloctes the only master that
> matters for the only configuration where SVA works and is used today.
> 
> iommufd could violate these assumptions which is part of why I'm
> fixing all of this in preparation to enable PASID support for all
> drivers in iommufd.

Damn, all the surrounding rework has really left this poor code in a bit
of a state. Could you please respin your fix, but with an additional change
to arm_smmu_mmu_notifier_get() so that it returns an error if there is
more than one device in the domain, with some of the above in a comment?

Cheers,

Will
Jason Gunthorpe Feb. 13, 2024, 12:18 p.m. UTC | #5
On Tue, Feb 13, 2024 at 11:20:15AM +0000, Will Deacon wrote:

> Damn, all the surrounding rework has really left this poor code in a bit
> of a state. 

That's one way to describe it. There is a reason it took me so many
patches to align this to the new core APIs :(

> Could you please respin your fix, but with an additional change
> to arm_smmu_mmu_notifier_get() so that it returns an error if there is
> more than one device in the domain, with some of the above in a comment?

Adding a check is not a comprehensive solution, there are still ways
userspace can attack this code with iommufd's coming PASID support. It
certainly doesn't belong in this patch which should be backported.

I can summarize some of these details in a comment for this patch.

Jason
Will Deacon Feb. 13, 2024, 12:54 p.m. UTC | #6
Hi Jason,

On Tue, Feb 13, 2024 at 08:18:50AM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 13, 2024 at 11:20:15AM +0000, Will Deacon wrote:
> 
> > Damn, all the surrounding rework has really left this poor code in a bit
> > of a state. 
> 
> That's one way to describe it. There is a reason it took me so many
> patches to align this to the new core APIs :(
> 
> > Could you please respin your fix, but with an additional change
> > to arm_smmu_mmu_notifier_get() so that it returns an error if there is
> > more than one device in the domain, with some of the above in a comment?
> 
> Adding a check is not a comprehensive solution, there are still ways
> userspace can attack this code with iommufd's coming PASID support. It
> certainly doesn't belong in this patch which should be backported.

Ok, then how about avoiding the allocation entirely once the lock is held?
Untested diff below, but you'd call arm_smmu_init_cd_table() from
arm_smmu_sva_set_dev_pasid(). I'd just like to make sure we really do
avoid allocating from atomic context if we end up with a multi-master
domain.

> I can summarize some of these details in a comment for this patch.

Alternatively, we can just revert the offending commits if we're not able
to agree on a simple fix for now. I'd prefer to avoid that though.

Will

--->8

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 0ffb1cf17e0b..8ca92639b13a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1019,7 +1019,10 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
        WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
-static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
+static __le64 *
+__arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid,
+                     int (*cd_alloc)(struct arm_smmu_device *,
+                                     struct arm_smmu_l1_ctx_desc *))
 {
        __le64 *l1ptr;
        unsigned int idx;
@@ -1033,7 +1036,7 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
        idx = ssid >> CTXDESC_SPLIT;
        l1_desc = &cd_table->l1_desc[idx];
        if (!l1_desc->l2ptr) {
-               if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
+               if (!cd_alloc || cd_alloc(smmu, l1_desc))
                        return NULL;
 
                l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
@@ -1045,6 +1048,19 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
        return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS;
 }
 
+static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
+{
+       return __arm_smmu_get_cd_ptr(master, ssid, NULL);
+}
+
+int arm_smmu_init_cd_table(struct arm_smmu_master *master, u32 ssid)
+{
+       if (!__arm_smmu_get_cd_ptr(master, ssid, arm_smmu_alloc_cd_leaf_table))
+               return -ENOMEM;
+
+       return 0;
+}
+
 int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid,
                            struct arm_smmu_ctx_desc *cd)
 {
Jason Gunthorpe Feb. 13, 2024, 1:30 p.m. UTC | #7
On Tue, Feb 13, 2024 at 12:54:56PM +0000, Will Deacon wrote:
> > Adding a check is not a comprehensive solution, there are still ways
> > userspace can attack this code with iommufd's coming PASID support. It
> > certainly doesn't belong in this patch which should be backported.
> 
> Ok, then how about avoiding the allocation entirely once the lock is
> held?

We need to allocate because we can't assume anything already made the
CD leaf present. Do you mean to do this additionally to this patch?

There are several places where a noalloc CD behavior would be useful,
I'd be fine to add that as part of this patch to protect the loop.

> > I can summarize some of these details in a comment for this patch.
>
> Alternatively, we can just revert the offending commits if we're not able
> to agree on a simple fix for now. I'd prefer to avoid that though.

I feel we are not aligned here. We are trying to get the driver to
work properly with iommufd and expose all the HW features. This has
been fixed comprehensively and well-reviewed patches have been on the
list for 7 months now. We want to move forward, not revert things.

Jason
Will Deacon Feb. 13, 2024, 1:56 p.m. UTC | #8
On Tue, Feb 13, 2024 at 09:30:45AM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 13, 2024 at 12:54:56PM +0000, Will Deacon wrote:
> > > Adding a check is not a comprehensive solution, there are still ways
> > > userspace can attack this code with iommufd's coming PASID support. It
> > > certainly doesn't belong in this patch which should be backported.
> > 
> > Ok, then how about avoiding the allocation entirely once the lock is
> > held?
> 
> We need to allocate because we can't assume anything already made the
> CD leaf present. Do you mean to do this additionally to this patch?

Yes, that's why I said "you'd call arm_smmu_init_cd_table() from
arm_smmu_sva_set_dev_pasid()" in the part of my reply that you trimmed.
The diff is just trying to explain an alternative approach, rather than
meant as a wholesale replacment to your patch. Crucially, it does _not_
allocate with the spinlock held.

I can write it as a full patch if you prefer (and I'll need your help to
test it), but I was really hoping to continue the discussion so you could
spin a v2.

> > > I can summarize some of these details in a comment for this patch.
> >
> > Alternatively, we can just revert the offending commits if we're not able
> > to agree on a simple fix for now. I'd prefer to avoid that though.
> 
> I feel we are not aligned here. We are trying to get the driver to
> work properly with iommufd and expose all the HW features. This has
> been fixed comprehensively and well-reviewed patches have been on the
> list for 7 months now. We want to move forward, not revert things.

I appreciate the effort you're putting in, but this specific patch is
fixing a regression so let's stabilise the base before we continue to
build on top of it.

Will
Jason Gunthorpe Feb. 13, 2024, 2 p.m. UTC | #9
On Tue, Feb 13, 2024 at 01:56:28PM +0000, Will Deacon wrote:

> I can write it as a full patch if you prefer (and I'll need your help to
> test it), but I was really hoping to continue the discussion so you could
> spin a v2.

No worries, I'll take care of it, just need to understand what we have
agreed.

Jason
Will Deacon Feb. 14, 2024, 5 p.m. UTC | #10
On Tue, Feb 13, 2024 at 10:00:33AM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 13, 2024 at 01:56:28PM +0000, Will Deacon wrote:
> 
> > I can write it as a full patch if you prefer (and I'll need your help to
> > test it), but I was really hoping to continue the discussion so you could
> > spin a v2.
> 
> No worries, I'll take care of it, just need to understand what we have
> agreed.

Cheers. I finished the diff off anyway, as it's basically just merging
the thing I sent with the rest of your patch. What do you reckon?

Will

--->8

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 05722121f00e..128f61a705c7 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -588,6 +588,12 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
 {
 	int ret = 0;
 	struct mm_struct *mm = domain->mm;
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+
+	/* Pre-allocate the CD leaf table without any locks held */
+	ret = arm_smmu_init_cd_table(master, id);
+	if (ret)
+		return ret;
 
 	mutex_lock(&sva_lock);
 	ret = __arm_smmu_sva_bind(dev, mm);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 0ffb1cf17e0b..e48f2b46f25e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1019,7 +1019,10 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
 	WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
-static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
+static __le64 *
+__arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid,
+		      int (*cd_alloc)(struct arm_smmu_device *,
+				      struct arm_smmu_l1_ctx_desc *))
 {
 	__le64 *l1ptr;
 	unsigned int idx;
@@ -1033,7 +1036,7 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
 	idx = ssid >> CTXDESC_SPLIT;
 	l1_desc = &cd_table->l1_desc[idx];
 	if (!l1_desc->l2ptr) {
-		if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
+		if (!cd_alloc || cd_alloc(smmu, l1_desc))
 			return NULL;
 
 		l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
@@ -1045,6 +1048,19 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
 	return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS;
 }
 
+static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
+{
+	return __arm_smmu_get_cd_ptr(master, ssid, NULL);
+}
+
+int arm_smmu_init_cd_table(struct arm_smmu_master *master, int ssid)
+{
+	if (!__arm_smmu_get_cd_ptr(master, ssid, arm_smmu_alloc_cd_leaf_table))
+		return -ENOMEM;
+
+	return 0;
+}
+
 int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid,
 			    struct arm_smmu_ctx_desc *cd)
 {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 65fb388d5173..cc42cede77bc 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -748,6 +748,7 @@ extern struct xarray arm_smmu_asid_xa;
 extern struct mutex arm_smmu_asid_lock;
 extern struct arm_smmu_ctx_desc quiet_cd;
 
+int arm_smmu_init_cd_table(struct arm_smmu_master *smmu_master, int ssid);
 int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid,
 			    struct arm_smmu_ctx_desc *cd);
 void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
Jason Gunthorpe Feb. 14, 2024, 7:09 p.m. UTC | #11
On Wed, Feb 14, 2024 at 05:00:30PM +0000, Will Deacon wrote:

> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 0ffb1cf17e0b..e48f2b46f25e 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1019,7 +1019,10 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
>  	WRITE_ONCE(*dst, cpu_to_le64(val));
>  }
>  
> -static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
> +static __le64 *
> +__arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid,
> +		      int (*cd_alloc)(struct arm_smmu_device *,
> +				      struct arm_smmu_l1_ctx_desc *))
>  {
>  	__le64 *l1ptr;
>  	unsigned int idx;
> @@ -1033,7 +1036,7 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
>  	idx = ssid >> CTXDESC_SPLIT;
>  	l1_desc = &cd_table->l1_desc[idx];
>  	if (!l1_desc->l2ptr) {
> -		if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
> +		if (!cd_alloc || cd_alloc(smmu, l1_desc))
>  			return NULL;
>  
>  		l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
> @@ -1045,6 +1048,19 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
>  	return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS;
>  }
>  
> +static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
> +{
> +	return __arm_smmu_get_cd_ptr(master, ssid, NULL);
> +}

I think this will break arm_smmu_write_ctx_desc() which requires
allocation behavior to install the SSID0 in a two level table?

> +int arm_smmu_init_cd_table(struct arm_smmu_master *master, int ssid)
> +{
> +	if (!__arm_smmu_get_cd_ptr(master, ssid, arm_smmu_alloc_cd_leaf_table))
> +		return -ENOMEM;
> +
> +	return 0;
> +}

And this does not compose well with the rest of where we are going. At
the end there are 7 calls to arm_smmu_get_cd_ptr(), all need
allocation except for two (clear and mmu notifier release)

Better to add a noalloc version. Also 'bool noalloc' would be clearer
than a function pointer without obfuscating the control flow.

Also I don't think this should go to -rc, it is not fixing any
bug. The preallocation is enough. Making more clarity for this
confused code is a nice to have at best.

How about this instead as a -next patch:

--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -288,10 +288,11 @@ static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
 
 /* Allocate or get existing MMU notifier for this {domain, mm} pair */
 static struct arm_smmu_mmu_notifier *
-arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain,
+arm_smmu_mmu_notifier_get(struct arm_smmu_master *true_master,
+                         struct arm_smmu_domain *smmu_domain,
                          struct mm_struct *mm)
 {
-       int ret;
+       int ret = 0;
        unsigned long flags;
        struct arm_smmu_ctx_desc *cd;
        struct arm_smmu_mmu_notifier *smmu_mn;
@@ -327,8 +328,15 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain,
 
        spin_lock_irqsave(&smmu_domain->devices_lock, flags);
        list_for_each_entry(master, &smmu_domain->devices, domain_head) {
-               ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm),
-                                             cd);
+               /*
+                * A limitation of the current SVA code requires the RID
+                * smmu_domain to be unique to the actual master.
+                */
+               if (WARN_ON(master != true_master))
+                       ret = -EINVAL;
+               if (!ret)
+                       ret = arm_smmu_write_ctx_desc(
+                               master, mm_get_enqcmd_pasid(mm), cd);
                if (ret) {
                        list_for_each_entry_from_reverse(
                                master, &smmu_domain->devices, domain_head)
@@ -398,7 +406,7 @@ static int __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm)
 
        bond->mm = mm;
 
-       bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm);
+       bond->smmu_mn = arm_smmu_mmu_notifier_get(master, smmu_domain, mm);
        if (IS_ERR(bond->smmu_mn)) {
                ret = PTR_ERR(bond->smmu_mn);
                goto err_free_bond;

Thanks,
Jason
Will Deacon Feb. 15, 2024, 11:38 a.m. UTC | #12
On Wed, Feb 14, 2024 at 03:09:54PM -0400, Jason Gunthorpe wrote:
> On Wed, Feb 14, 2024 at 05:00:30PM +0000, Will Deacon wrote:
> 
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > index 0ffb1cf17e0b..e48f2b46f25e 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > @@ -1019,7 +1019,10 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
> >  	WRITE_ONCE(*dst, cpu_to_le64(val));
> >  }
> >  
> > -static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
> > +static __le64 *
> > +__arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid,
> > +		      int (*cd_alloc)(struct arm_smmu_device *,
> > +				      struct arm_smmu_l1_ctx_desc *))
> >  {
> >  	__le64 *l1ptr;
> >  	unsigned int idx;
> > @@ -1033,7 +1036,7 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
> >  	idx = ssid >> CTXDESC_SPLIT;
> >  	l1_desc = &cd_table->l1_desc[idx];
> >  	if (!l1_desc->l2ptr) {
> > -		if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
> > +		if (!cd_alloc || cd_alloc(smmu, l1_desc))
> >  			return NULL;
> >  
> >  		l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
> > @@ -1045,6 +1048,19 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
> >  	return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS;
> >  }
> >  
> > +static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
> > +{
> > +	return __arm_smmu_get_cd_ptr(master, ssid, NULL);
> > +}
> 
> I think this will break arm_smmu_write_ctx_desc() which requires
> allocation behavior to install the SSID0 in a two level table?

Ah yes, you're quite right. We'd need to add the allocation to
arm_smmu_attach_dev() as well.

> > +int arm_smmu_init_cd_table(struct arm_smmu_master *master, int ssid)
> > +{
> > +	if (!__arm_smmu_get_cd_ptr(master, ssid, arm_smmu_alloc_cd_leaf_table))
> > +		return -ENOMEM;
> > +
> > +	return 0;
> > +}
> 
> And this does not compose well with the rest of where we are going. At
> the end there are 7 calls to arm_smmu_get_cd_ptr(), all need
> allocation except for two (clear and mmu notifier release)
> 
> Better to add a noalloc version. Also 'bool noalloc' would be clearer
> than a function pointer without obfuscating the control flow.
> 
> Also I don't think this should go to -rc, it is not fixing any
> bug. The preallocation is enough. Making more clarity for this
> confused code is a nice to have at best.
> 
> How about this instead as a -next patch:
> 
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -288,10 +288,11 @@ static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
>  
>  /* Allocate or get existing MMU notifier for this {domain, mm} pair */
>  static struct arm_smmu_mmu_notifier *
> -arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain,
> +arm_smmu_mmu_notifier_get(struct arm_smmu_master *true_master,
> +                         struct arm_smmu_domain *smmu_domain,
>                           struct mm_struct *mm)
>  {
> -       int ret;
> +       int ret = 0;
>         unsigned long flags;
>         struct arm_smmu_ctx_desc *cd;
>         struct arm_smmu_mmu_notifier *smmu_mn;
> @@ -327,8 +328,15 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain,
>  
>         spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>         list_for_each_entry(master, &smmu_domain->devices, domain_head) {
> -               ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm),
> -                                             cd);
> +               /*
> +                * A limitation of the current SVA code requires the RID
> +                * smmu_domain to be unique to the actual master.
> +                */
> +               if (WARN_ON(master != true_master))
> +                       ret = -EINVAL;
> +               if (!ret)
> +                       ret = arm_smmu_write_ctx_desc(
> +                               master, mm_get_enqcmd_pasid(mm), cd);
>                 if (ret) {
>                         list_for_each_entry_from_reverse(
>                                 master, &smmu_domain->devices, domain_head)
> @@ -398,7 +406,7 @@ static int __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm)
>  
>         bond->mm = mm;
>  
> -       bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm);
> +       bond->smmu_mn = arm_smmu_mmu_notifier_get(master, smmu_domain, mm);
>         if (IS_ERR(bond->smmu_mn)) {
>                 ret = PTR_ERR(bond->smmu_mn);
>                 goto err_free_bond;

This is what I originally had in mind, but for some reason I thought you
weren't happy with it. Mind if I fold it into your -rc patch? I'd really
like them to go in together.

Will
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 05722121f00e70..068d60732e6481 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -588,6 +588,11 @@  static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
 {
 	int ret = 0;
 	struct mm_struct *mm = domain->mm;
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+
+	/* Preallocate the required leafs outside locks */
+	if (!arm_smmu_get_cd_ptr(master, id))
+		return -ENOMEM;
 
 	mutex_lock(&sva_lock);
 	ret = __arm_smmu_sva_bind(dev, mm);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 0ffb1cf17e0b2e..ec2bb8685bb50e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1019,7 +1019,7 @@  static void arm_smmu_write_cd_l1_desc(__le64 *dst,
 	WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
-static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
+__le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
 {
 	__le64 *l1ptr;
 	unsigned int idx;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 65fb388d51734d..fa0eeb3519ef28 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -748,6 +748,8 @@  extern struct xarray arm_smmu_asid_xa;
 extern struct mutex arm_smmu_asid_lock;
 extern struct arm_smmu_ctx_desc quiet_cd;
 
+__le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid);
+
 int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid,
 			    struct arm_smmu_ctx_desc *cd);
 void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);