diff mbox series

[RFC,v2,4/7] iommufd: Associate kvm pointer to iommufd ctx

Message ID 20240208151837.35068-5-shameerali.kolothum.thodi@huawei.com (mailing list archive)
State New, archived
Headers show
Series iommu/arm-smmu-v3: Use pinned KVM VMID for stage 2 | expand

Commit Message

Shameerali Kolothum Thodi Feb. 8, 2024, 3:18 p.m. UTC
Introduce an API to set the KVM pointer to the iommufd ctx
and set the same when a vfio dev is bind to iommufd.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  3 +++
 drivers/iommu/iommufd/main.c            | 14 ++++++++++++++
 drivers/vfio/device_cdev.c              |  3 +++
 include/linux/iommufd.h                 |  7 +++++++
 4 files changed, 27 insertions(+)

Comments

Jason Gunthorpe Feb. 8, 2024, 3:42 p.m. UTC | #1
On Thu, Feb 08, 2024 at 03:18:34PM +0000, Shameer Kolothum wrote:

> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 991f864d1f9b..28ede82bb1a6 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -16,6 +16,7 @@ struct iommu_domain;
>  struct iommu_group;
>  struct iommu_option;
>  struct iommufd_device;
> +struct kvm;
>  
>  struct iommufd_ctx {
>  	struct file *file;
> @@ -27,6 +28,8 @@ struct iommufd_ctx {
>  	/* Compatibility with VFIO no iommu */
>  	u8 no_iommu_mode;
>  	struct iommufd_ioas *vfio_ioas;
> +	/* Associated KVM pointer */
> +	struct kvm *kvm;
>  };

Associating the KVM with the entire iommufd is a big hammer, is this
what we want to do?

I know it has to be linked to domain allocation and the coming
"viommu" object, and it is already linked to VFIO.

It means we support one KVM per iommufd (which doesn't seem
unreasonable, but also the first time we've had such a limitation)

The other option would be to pass in the kvm to the individual sub
objects.

Kevin?

Sean would you be OK with this approach considering your other series
to try to make more of this private?

Jason
Sean Christopherson June 24, 2024, 4:53 p.m. UTC | #2
On Thu, Feb 08, 2024, Jason Gunthorpe wrote:
> On Thu, Feb 08, 2024 at 03:18:34PM +0000, Shameer Kolothum wrote:
> 
> > diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> > index 991f864d1f9b..28ede82bb1a6 100644
> > --- a/drivers/iommu/iommufd/iommufd_private.h
> > +++ b/drivers/iommu/iommufd/iommufd_private.h
> > @@ -16,6 +16,7 @@ struct iommu_domain;
> >  struct iommu_group;
> >  struct iommu_option;
> >  struct iommufd_device;
> > +struct kvm;
> >  
> >  struct iommufd_ctx {
> >  	struct file *file;
> > @@ -27,6 +28,8 @@ struct iommufd_ctx {
> >  	/* Compatibility with VFIO no iommu */
> >  	u8 no_iommu_mode;
> >  	struct iommufd_ioas *vfio_ioas;
> > +	/* Associated KVM pointer */
> > +	struct kvm *kvm;
> >  };
> 
> Associating the KVM with the entire iommufd is a big hammer, is this
> what we want to do?
> 
> I know it has to be linked to domain allocation and the coming
> "viommu" object, and it is already linked to VFIO.
> 
> It means we support one KVM per iommufd (which doesn't seem
> unreasonable, but also the first time we've had such a limitation)

And if KVM+iommufd come as pairs, wouldn't iommufd_ctx_set_kvm() need to reject
attempts to bind devices associated with different KVMs, as opposed to silently
ignoring that case?  E.g. something like the below?  Or is there magic elsewhere
in the stack that prevents such a scenario?

void iommufd_ctx_set_kvm(struct iommufd_ctx *ictx, struct kvm *kvm)
{
	int r = 0;

	xa_lock(&ictx->objects);
	if (!ictx->kvm)
		ictx->kvm = kvm;
	else if (icxt->kvm != kvm)
		r = -EINVAL;
	xa_unlock(&ictx->objects);
}

> The other option would be to pass in the kvm to the individual sub
> objects.
> 
> Kevin?
> 
> Sean would you be OK with this approach considering your other series
> to try to make more of this private?

Sorry, I completely missed this.

If kvm_pinned_vmid_{get,put}() are implemented directly by KVM ARM, then I don't
have any immediate concerns, as KVM ARM is a long, long way from being able to
isolate KVM from the core kernel.  And if we ever want/need to provide generic
APIs for propagating state from KVM into iommufd, bundling the KVM file pointer[*]
with a set of function pointers wouldn't be terribly difficult.

That said, I find the on-demand pinning to be very odd.  IIUC, if KVM runs out
of pinnable VMIDs, attaching a device to the KVM+iommu will fail.  Failing an
iommufd operation because of a (potentially transient) KVM resource issue is
rather unpleasant.

And assuming that pinnable VMIDs are a somewhat scarce resource, it wouldn't
suprise me if someone wanted to add cgroup integration, e.g. similar to the
misc cgroup that's used to manage SEV(-ES) ASIDs on KVM AMD (IIUC, an SEV ASID
is analagous to an ARM VMID).

Rather than on-demand pinning, would it make sense to have KVM provide an ioctl()
(or capability, or VM type) to let userspace pin a VM's VMID?  That would allow
for a much saner failure mode, and I suspect would be cleaner in general for iommufd.

[*] https://lore.kernel.org/all/ZXkVSKULLivrMkBl@google.com
Jason Gunthorpe June 24, 2024, 5:07 p.m. UTC | #3
On Mon, Jun 24, 2024 at 09:53:00AM -0700, Sean Christopherson wrote:
> > Associating the KVM with the entire iommufd is a big hammer, is this
> > what we want to do?
> > 
> > I know it has to be linked to domain allocation and the coming
> > "viommu" object, and it is already linked to VFIO.
> > 
> > It means we support one KVM per iommufd (which doesn't seem
> > unreasonable, but also the first time we've had such a limitation)
> 
> And if KVM+iommufd come as pairs, wouldn't iommufd_ctx_set_kvm() need to reject
> attempts to bind devices associated with different KVMs, as opposed to silently
> ignoring that case?  E.g. something like the below?  Or is there magic elsewhere
> in the stack that prevents such a scenario?

Yes, it would need things like that

But I think based on other discussions we are likely to tie to the KVM
to the coming IOMMUFD VIOMMU object, and the KVM will probably be
provided at object creation time to avoid this issue.

> > Sean would you be OK with this approach considering your other series
> > to try to make more of this private?
> 
> Sorry, I completely missed this.
> 
> If kvm_pinned_vmid_{get,put}() are implemented directly by KVM ARM, then I don't
> have any immediate concerns, as KVM ARM is a long, long way from being able to
> isolate KVM from the core kernel.  

I think that is a reasonable thing, I also don't really see VMID as
being general. We will have to figure out how to ensure that the KVM
FD we got is an ARM KVM FD..

> That said, I find the on-demand pinning to be very odd.  IIUC, if KVM runs out
> of pinnable VMIDs, attaching a device to the KVM+iommu will fail.  Failing an
> iommufd operation because of a (potentially transient) KVM resource issue is
> rather unpleasant.

It is kind of subtle, but the only thing that will consume VMIDs is
IOMMUFD operations that are working with nested translation but not
providing KVMs. This is a pretty small blast radius - ie a specific
qemu will fail to start - that I think we can tolerate it.

More normal iommu operation will not require VMIDs so things like
driver attaching/etc is fine.

> And assuming that pinnable VMIDs are a somewhat scarce resource, it wouldn't
> suprise me if someone wanted to add cgroup integration, e.g. similar to the
> misc cgroup that's used to manage SEV(-ES) ASIDs on KVM AMD (IIUC, an SEV ASID
> is analagous to an ARM VMID).

Yeah, but if someone is using such a cgroup then I expect they will
also have an up to date VMM that doesn't trigger this VMID allocation
in the first place...
 
> Rather than on-demand pinning, would it make sense to have KVM provide an ioctl()
> (or capability, or VM type) to let userspace pin a VM's VMID?  That would allow
> for a much saner failure mode, and I suspect would be cleaner in general for iommufd.

The point of this mechanism is to support using this iommufd feature
without a KVM at all. We could instead prevent this directly 100% of
the time, but it means that HW with this BTM capability would not run
the legacy VMMs at all, so I'm not that keen on it..

When a KVM is present then the iommu needs to adopt the VMID of KVM,
and that should have a mechanism to ensure the VMID is valid so long
as the IOMMU is using it (eg because the KVM FD is open)

Jason
Sean Christopherson June 24, 2024, 5:54 p.m. UTC | #4
On Mon, Jun 24, 2024, Jason Gunthorpe wrote:
> On Mon, Jun 24, 2024 at 09:53:00AM -0700, Sean Christopherson wrote:
> > If kvm_pinned_vmid_{get,put}() are implemented directly by KVM ARM, then I don't
> > have any immediate concerns, as KVM ARM is a long, long way from being able to
> > isolate KVM from the core kernel.  
> 
> I think that is a reasonable thing, I also don't really see VMID as
> being general. We will have to figure out how to ensure that the KVM
> FD we got is an ARM KVM FD..

Isn't the caller in ARM specific code?  I was assuming kvm_pinned_vmid_{get,put}()
would simply not exist for non-ARM builds.

> > That said, I find the on-demand pinning to be very odd.  IIUC, if KVM runs out
> > of pinnable VMIDs, attaching a device to the KVM+iommu will fail.  Failing an
> > iommufd operation because of a (potentially transient) KVM resource issue is
> > rather unpleasant.
> 
> It is kind of subtle, but the only thing that will consume VMIDs is
> IOMMUFD operations that are working with nested translation but not
> providing KVMs. This is a pretty small blast radius - ie a specific
> qemu will fail to start - that I think we can tolerate it.
> 
> More normal iommu operation will not require VMIDs so things like
> driver attaching/etc is fine.
> 
> > And assuming that pinnable VMIDs are a somewhat scarce resource, it wouldn't
> > suprise me if someone wanted to add cgroup integration, e.g. similar to the
> > misc cgroup that's used to manage SEV(-ES) ASIDs on KVM AMD (IIUC, an SEV ASID
> > is analagous to an ARM VMID).
> 
> Yeah, but if someone is using such a cgroup then I expect they will
> also have an up to date VMM that doesn't trigger this VMID allocation
> in the first place...

I suspect we're talking about two different things.  Either that, or I am really
lost.

> > Rather than on-demand pinning, would it make sense to have KVM provide an ioctl()
> > (or capability, or VM type) to let userspace pin a VM's VMID?  That would allow
> > for a much saner failure mode, and I suspect would be cleaner in general for iommufd.
> 
> The point of this mechanism is to support using this iommufd feature
> without a KVM at all. We could instead prevent this directly 100% of
> the time, but it means that HW with this BTM capability would not run
> the legacy VMMs at all, so I'm not that keen on it..
> 
> When a KVM is present then the iommu needs to adopt the VMID of KVM,
> and that should have a mechanism to ensure the VMID is valid so long
> as the IOMMU is using it (eg because the KVM FD is open)

Right, and that's what I'm referring to as "on-demand pinning".  For the IOMMU
to adopt a KVM VMID, the VMID needs to be pinned (or KVM would need to notify
the IOMMU every time the VMID changed), i.e. every KVM+IOMMU pair pins a VMID
that is managed by KVM.

Hmm, kvm_arm_pinned_vmid_get() doesn't fail, it just falls back to VMID=0.  Which
seems odd.
Jason Gunthorpe June 24, 2024, 6:01 p.m. UTC | #5
On Mon, Jun 24, 2024 at 10:54:37AM -0700, Sean Christopherson wrote:
> > > And assuming that pinnable VMIDs are a somewhat scarce resource, it wouldn't
> > > suprise me if someone wanted to add cgroup integration, e.g. similar to the
> > > misc cgroup that's used to manage SEV(-ES) ASIDs on KVM AMD (IIUC, an SEV ASID
> > > is analagous to an ARM VMID).
> > 
> > Yeah, but if someone is using such a cgroup then I expect they will
> > also have an up to date VMM that doesn't trigger this VMID allocation
> > in the first place...
> 
> I suspect we're talking about two different things.  Either that, or I am really
> lost.

I mean KVM will have already allocated and charged the cgroup for it's
use of the VMID. The IOMMU side just has to match it, no second
allocation of a VMID.

We wouldn't charge a cgroup for iommu and kvm sharing the same vmid.

> > When a KVM is present then the iommu needs to adopt the VMID of KVM,
> > and that should have a mechanism to ensure the VMID is valid so long
> > as the IOMMU is using it (eg because the KVM FD is open)
> 
> Right, and that's what I'm referring to as "on-demand pinning".  For the IOMMU
> to adopt a KVM VMID, the VMID needs to be pinned (or KVM would need to notify
> the IOMMU every time the VMID changed), i.e. every KVM+IOMMU pair pins a VMID
> that is managed by KVM.

Ok, right, yes, the expectation is that KVM allocates a VMID at some
point and it stays fixed for the life of that kvm.

If KVM can change VMID on the fly then that is a further complication
:\

> Hmm, kvm_arm_pinned_vmid_get() doesn't fail, it just falls back to VMID=0.  Which
> seems odd.

I had the impression the KVM always had a fixed VMID, but I don't
really know.

Jason
Oliver Upton June 24, 2024, 7:12 p.m. UTC | #6
On Mon, Jun 24, 2024 at 03:01:48PM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 24, 2024 at 10:54:37AM -0700, Sean Christopherson wrote:
> > > > And assuming that pinnable VMIDs are a somewhat scarce resource, it wouldn't
> > > > suprise me if someone wanted to add cgroup integration, e.g. similar to the
> > > > misc cgroup that's used to manage SEV(-ES) ASIDs on KVM AMD (IIUC, an SEV ASID
> > > > is analagous to an ARM VMID).
> > > 
> > > Yeah, but if someone is using such a cgroup then I expect they will
> > > also have an up to date VMM that doesn't trigger this VMID allocation
> > > in the first place...
> > 
> > I suspect we're talking about two different things.  Either that, or I am really
> > lost.
> 
> I mean KVM will have already allocated and charged the cgroup for it's
> use of the VMID. The IOMMU side just has to match it, no second
> allocation of a VMID.
> 
> We wouldn't charge a cgroup for iommu and kvm sharing the same vmid.

I think the concern remains that an operator may want to limit the blast
radius of some runaway VMID allocation in a system. But you're right, a
well-intentioned VMM should wind up with a single charge for all the
stage-2's that used the VMID allocation.

> > > When a KVM is present then the iommu needs to adopt the VMID of KVM,
> > > and that should have a mechanism to ensure the VMID is valid so long
> > > as the IOMMU is using it (eg because the KVM FD is open)
> > 
> > Right, and that's what I'm referring to as "on-demand pinning".  For the IOMMU
> > to adopt a KVM VMID, the VMID needs to be pinned (or KVM would need to notify
> > the IOMMU every time the VMID changed), i.e. every KVM+IOMMU pair pins a VMID
> > that is managed by KVM.
> 
> Ok, right, yes, the expectation is that KVM allocates a VMID at some
> point and it stays fixed for the life of that kvm.
> 
> If KVM can change VMID on the fly then that is a further complication
> :\
> 
> > Hmm, kvm_arm_pinned_vmid_get() doesn't fail, it just falls back to VMID=0.  Which
> > seems odd.

This is bleeding a bit of implementation detail where VMID=0 is known to
be reserved (thus invalid), it'd probably be better if the
implementation actually just returned an error.

VMID=0 is associated with the host's MMU context, which is relevant when
running {n,h}VHE mode, as the VMID tags TLB entries even if stage-2
translation is disabled (HCR_EL2.VM = 0).
Shameerali Kolothum Thodi June 24, 2024, 7:13 p.m. UTC | #7
> -----Original Message-----
> From: Sean Christopherson <seanjc@google.com>
> Sent: Monday, June 24, 2024 6:55 PM
> To: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> kvmarm@lists.linux.dev; iommu@lists.linux.dev; linux-arm-
> kernel@lists.infradead.org; Linuxarm <linuxarm@huawei.com>;
> kevin.tian@intel.com; alex.williamson@redhat.com; maz@kernel.org;
> oliver.upton@linux.dev; will@kernel.org; robin.murphy@arm.com; jean-
> philippe@linaro.org; Jonathan Cameron <jonathan.cameron@huawei.com>
> Subject: Re: [RFC PATCH v2 4/7] iommufd: Associate kvm pointer to iommufd
> ctx
> 
> On Mon, Jun 24, 2024, Jason Gunthorpe wrote:
> > On Mon, Jun 24, 2024 at 09:53:00AM -0700, Sean Christopherson wrote:
> > > If kvm_pinned_vmid_{get,put}() are implemented directly by KVM ARM,
> then I don't
> > > have any immediate concerns, as KVM ARM is a long, long way from
> being able to
> > > isolate KVM from the core kernel.
> >
> > I think that is a reasonable thing, I also don't really see VMID as
> > being general. We will have to figure out how to ensure that the KVM
> > FD we got is an ARM KVM FD..
> 
> Isn't the caller in ARM specific code?  I was assuming
> kvm_pinned_vmid_{get,put}()
> would simply not exist for non-ARM builds.

The caller is in ARM specific code(SMMUv3 driver). But at present, the kvm pointer
is passed to it by IOMMUFD during nested domain allocation time. And IOMMUFD
gets the kvm pointer from VFIO during device bind operation.

Not sure,, how this flow is going to be in  the new IOMMUFD VIOMMU object model
Jason referred. If not through IOMMUFD, SMMUv3 has to have a way to figure out
the KVM associated with the device.  

> 
> > > That said, I find the on-demand pinning to be very odd.  IIUC, if KVM runs
> out
> > > of pinnable VMIDs, attaching a device to the KVM+iommu will fail.  Failing
> an
> > > iommufd operation because of a (potentially transient) KVM resource
> issue is
> > > rather unpleasant.
> >
> > It is kind of subtle, but the only thing that will consume VMIDs is
> > IOMMUFD operations that are working with nested translation but not
> > providing KVMs. This is a pretty small blast radius - ie a specific
> > qemu will fail to start - that I think we can tolerate it.
> >
> > More normal iommu operation will not require VMIDs so things like
> > driver attaching/etc is fine.
> >
> > > And assuming that pinnable VMIDs are a somewhat scarce resource, it
> wouldn't
> > > suprise me if someone wanted to add cgroup integration, e.g. similar to
> the
> > > misc cgroup that's used to manage SEV(-ES) ASIDs on KVM AMD (IIUC, an
> SEV ASID
> > > is analagous to an ARM VMID).
> >
> > Yeah, but if someone is using such a cgroup then I expect they will
> > also have an up to date VMM that doesn't trigger this VMID allocation
> > in the first place...
> 
> I suspect we're talking about two different things.  Either that, or I am really
> lost.
> 
> > > Rather than on-demand pinning, would it make sense to have KVM
> provide an ioctl()
> > > (or capability, or VM type) to let userspace pin a VM's VMID?  That would
> allow
> > > for a much saner failure mode, and I suspect would be cleaner in general
> for iommufd.
> >
> > The point of this mechanism is to support using this iommufd feature
> > without a KVM at all. We could instead prevent this directly 100% of
> > the time, but it means that HW with this BTM capability would not run
> > the legacy VMMs at all, so I'm not that keen on it..
> >
> > When a KVM is present then the iommu needs to adopt the VMID of KVM,
> > and that should have a mechanism to ensure the VMID is valid so long
> > as the IOMMU is using it (eg because the KVM FD is open)
> 
> Right, and that's what I'm referring to as "on-demand pinning".  For the
> IOMMU
> to adopt a KVM VMID, the VMID needs to be pinned (or KVM would need to
> notify
> the IOMMU every time the VMID changed), i.e. every KVM+IOMMU pair pins
> a VMID
> that is managed by KVM.
> 
> Hmm, kvm_arm_pinned_vmid_get() doesn't fail, it just falls back to VMID=0.
> Which
> seems odd.

See patch 3, kvm_arch_pinned_vmid_get() return -EINVAL if VMID == 0. For ARM64/KVM
VMID 0 is always reserved and never allocated for a Guest.

Thanks,
Shameer
Sean Christopherson June 24, 2024, 7:29 p.m. UTC | #8
On Mon, Jun 24, 2024, Oliver Upton wrote:
> On Mon, Jun 24, 2024 at 03:01:48PM -0300, Jason Gunthorpe wrote:
> > On Mon, Jun 24, 2024 at 10:54:37AM -0700, Sean Christopherson wrote:
> > > > > And assuming that pinnable VMIDs are a somewhat scarce resource, it wouldn't
> > > > > suprise me if someone wanted to add cgroup integration, e.g. similar to the
> > > > > misc cgroup that's used to manage SEV(-ES) ASIDs on KVM AMD (IIUC, an SEV ASID
> > > > > is analagous to an ARM VMID).
> > > > 
> > > > Yeah, but if someone is using such a cgroup then I expect they will
> > > > also have an up to date VMM that doesn't trigger this VMID allocation
> > > > in the first place...
> > > 
> > > I suspect we're talking about two different things.  Either that, or I am really
> > > lost.
> > 
> > I mean KVM will have already allocated and charged the cgroup for it's
> > use of the VMID. The IOMMU side just has to match it, no second
> > allocation of a VMID.
> > 
> > We wouldn't charge a cgroup for iommu and kvm sharing the same vmid.
> 
> I think the concern remains that an operator may want to limit the blast
> radius of some runaway VMID allocation in a system. But you're right, a
> well-intentioned VMM should wind up with a single charge for all the
> stage-2's that used the VMID allocation.
> 
> > > > When a KVM is present then the iommu needs to adopt the VMID of KVM,
> > > > and that should have a mechanism to ensure the VMID is valid so long
> > > > as the IOMMU is using it (eg because the KVM FD is open)
> > > 
> > > Right, and that's what I'm referring to as "on-demand pinning".  For the IOMMU
> > > to adopt a KVM VMID, the VMID needs to be pinned (or KVM would need to notify
> > > the IOMMU every time the VMID changed), i.e. every KVM+IOMMU pair pins a VMID
> > > that is managed by KVM.
> > 
> > Ok, right, yes, the expectation is that KVM allocates a VMID at some
> > point and it stays fixed for the life of that kvm.
> > 
> > If KVM can change VMID on the fly then that is a further complication
> > :\

Ya, as written today, KVM doesn't assign a VMID when the VM is created, and instead
allocates VMIDs on-demand when a vCPU is run.

The KVM changes in this series allow "pinning" the currently assigned VMID, i.e.
tries to address that further complication.  But because of the on-demand
allocation, there might not be a currently assigned VMID for VM, or the VMID might
be stale, i.e. re-assigned to a different VM.

Thus, kvm_arm_pinned_vmid_get() can effectively trigger VMID allocations, and
thus cgroup charging and failure.

If I'm reading the ARM code correctly, the intent is to cycle through VMIDs as  
necessary so that it's possible for every actively running VM to have a VMID.
And maybe also to also minimize the number of TLB + I$ invalidations?

> > 
> > > Hmm, kvm_arm_pinned_vmid_get() doesn't fail, it just falls back to VMID=0.  Which
> > > seems odd.
> 
> This is bleeding a bit of implementation detail where VMID=0 is known to
> be reserved (thus invalid), it'd probably be better if the
> implementation actually just returned an error.

Oof, I assumed using VMID=0 just caused a loss of performance, but this makes
it sound like the IOMMU mappings will fault?

> VMID=0 is associated with the host's MMU context, which is relevant when
> running {n,h}VHE mode, as the VMID tags TLB entries even if stage-2
> translation is disabled (HCR_EL2.VM = 0).

Heh, I assumed VMID=0 is the host MMU, Intel and AMD have the same effective
behavior :-)
Oliver Upton June 24, 2024, 7:51 p.m. UTC | #9
On Mon, Jun 24, 2024 at 12:29:07PM -0700, Sean Christopherson wrote:
> On Mon, Jun 24, 2024, Oliver Upton wrote:
> > On Mon, Jun 24, 2024 at 03:01:48PM -0300, Jason Gunthorpe wrote:
> > > If KVM can change VMID on the fly then that is a further complication
> > > :\
> 
> Ya, as written today, KVM doesn't assign a VMID when the VM is created, and instead
> allocates VMIDs on-demand when a vCPU is run.
> 
> The KVM changes in this series allow "pinning" the currently assigned VMID, i.e.
> tries to address that further complication.  But because of the on-demand
> allocation, there might not be a currently assigned VMID for VM, or the VMID might
> be stale, i.e. re-assigned to a different VM.
> 
> Thus, kvm_arm_pinned_vmid_get() can effectively trigger VMID allocations, and
> thus cgroup charging and failure.
> 
> If I'm reading the ARM code correctly, the intent is to cycle through VMIDs as  
> necessary so that it's possible for every actively running VM to have a VMID.
> And maybe also to also minimize the number of TLB + I$ invalidations?

The commentary about TLBIs + I$ invalidations is in relation to how
rollover is handled. The kernel's ASID allocator does some deferred
invalidation, the VMID allocator does some eager invalidation at rollover
because it is believed to be a rarer occurrence.

But generally speaking, it's what you expect, we have some structures in
hardware that use VMID to form a tag, and we wnat to avoid blasting them
if at all possible.

> > > 
> > > > Hmm, kvm_arm_pinned_vmid_get() doesn't fail, it just falls back to VMID=0.  Which
> > > > seems odd.
> > 
> > This is bleeding a bit of implementation detail where VMID=0 is known to
> > be reserved (thus invalid), it'd probably be better if the
> > implementation actually just returned an error.
> 
> Oof, I assumed using VMID=0 just caused a loss of performance, but this makes
> it sound like the IOMMU mappings will fault?

Bit worse than that :)

Having the SMMU participate in broadcast TLB maintenance means TLBI
instructions on the CPU invalidate translations in the SMMU, which is
useful for SVA usecases. However, if the host is using different VMIDs
for the CPU and SMMU, then guest TLBIs no longer match the guest's SMMU
context...

Pinning / sharing the VMID between CPU and SMMU is a hard requirement if
you advertise BTM support to the guest.
Jason Gunthorpe June 24, 2024, 10:35 p.m. UTC | #10
On Mon, Jun 24, 2024 at 12:51:34PM -0700, Oliver Upton wrote:

> Having the SMMU participate in broadcast TLB maintenance means TLBI
> instructions on the CPU invalidate translations in the SMMU, which is
> useful for SVA usecases. However, if the host is using different VMIDs
> for the CPU and SMMU, then guest TLBIs no longer match the guest's SMMU
> context...
> 
> Pinning / sharing the VMID between CPU and SMMU is a hard requirement if
> you advertise BTM support to the guest.

Yes, and the extra detail that the SMMU *must* always have a unique
VMID for the specific VM, we can't leave it in a state without one.

Meaning there is little purpose to changing the VMID at runtime as you
can't multiplex more VM's than you have IDs once the SMMU is using it
anyhow.

Jason
Tian, Kevin June 25, 2024, 1:53 a.m. UTC | #11
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Thursday, February 8, 2024 11:42 PM
> 
> On Thu, Feb 08, 2024 at 03:18:34PM +0000, Shameer Kolothum wrote:
> 
> > diff --git a/drivers/iommu/iommufd/iommufd_private.h
> b/drivers/iommu/iommufd/iommufd_private.h
> > index 991f864d1f9b..28ede82bb1a6 100644
> > --- a/drivers/iommu/iommufd/iommufd_private.h
> > +++ b/drivers/iommu/iommufd/iommufd_private.h
> > @@ -16,6 +16,7 @@ struct iommu_domain;
> >  struct iommu_group;
> >  struct iommu_option;
> >  struct iommufd_device;
> > +struct kvm;
> >
> >  struct iommufd_ctx {
> >  	struct file *file;
> > @@ -27,6 +28,8 @@ struct iommufd_ctx {
> >  	/* Compatibility with VFIO no iommu */
> >  	u8 no_iommu_mode;
> >  	struct iommufd_ioas *vfio_ioas;
> > +	/* Associated KVM pointer */
> > +	struct kvm *kvm;
> >  };
> 
> Associating the KVM with the entire iommufd is a big hammer, is this
> what we want to do?
> 
> I know it has to be linked to domain allocation and the coming
> "viommu" object, and it is already linked to VFIO.
> 
> It means we support one KVM per iommufd (which doesn't seem
> unreasonable, but also the first time we've had such a limitation)
> 
> The other option would be to pass in the kvm to the individual sub
> objects.
> 
> Kevin?
> 

I prefer to not imposing such limitation. Passing it in optionally
to the individual sub objects e.g. hwpt or viommu makes more
sense to me.
Tian, Kevin June 25, 2024, 2:21 a.m. UTC | #12
> From: Sean Christopherson <seanjc@google.com>
> Sent: Tuesday, June 25, 2024 3:29 AM
> 
> On Mon, Jun 24, 2024, Oliver Upton wrote:
> > VMID=0 is associated with the host's MMU context, which is relevant when
> > running {n,h}VHE mode, as the VMID tags TLB entries even if stage-2
> > translation is disabled (HCR_EL2.VM = 0).
> 
> Heh, I assumed VMID=0 is the host MMU, Intel and AMD have the same
> effective
> behavior :-)

side note - Intel VT-d spec recommends stage-1 translations using a
VMID different from those associated with stage-2, but not necessarily
to be 0. intel-iommu driver uses '1' so far:

/*
 * Domain ID reserved for pasid entries programmed for first-level
 * only and pass-through transfer modes.
 */
#define FLPT_DEFAULT_DID                1
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..28ede82bb1a6 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -16,6 +16,7 @@  struct iommu_domain;
 struct iommu_group;
 struct iommu_option;
 struct iommufd_device;
+struct kvm;
 
 struct iommufd_ctx {
 	struct file *file;
@@ -27,6 +28,8 @@  struct iommufd_ctx {
 	/* Compatibility with VFIO no iommu */
 	u8 no_iommu_mode;
 	struct iommufd_ioas *vfio_ioas;
+	/* Associated KVM pointer */
+	struct kvm *kvm;
 };
 
 /*
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 39b32932c61e..28272510fba4 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -495,6 +495,20 @@  void iommufd_ctx_put(struct iommufd_ctx *ictx)
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
 
+/**
+ * iommufd_ctx_set_kvm - Called to set a KVM pointer to iommufd context
+ * @ictx: Context to operate on
+ * @kvm: KVM pointer with a reference taken using kvm_get_kvm_safe()
+ */
+void iommufd_ctx_set_kvm(struct iommufd_ctx *ictx, struct kvm *kvm)
+{
+	xa_lock(&ictx->objects);
+	if (!ictx->kvm)
+		ictx->kvm = kvm;
+	xa_unlock(&ictx->objects);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_set_kvm, IOMMUFD);
+
 static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_ACCESS] = {
 		.destroy = iommufd_access_destroy_object,
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index e75da0a70d1f..e75e96fb57cb 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -101,6 +101,9 @@  long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
 	 */
 	vfio_df_get_kvm_safe(df);
 
+	if (df->kvm)
+		iommufd_ctx_set_kvm(df->iommufd, df->kvm);
+
 	ret = vfio_df_open(df);
 	if (ret)
 		goto out_put_kvm;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index ffc3a949f837..7408b620d0b8 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -17,6 +17,7 @@  struct iommufd_ctx;
 struct iommufd_access;
 struct file;
 struct iommu_group;
+struct kvm;
 
 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 					   struct device *dev, u32 *id);
@@ -59,6 +60,7 @@  struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
 struct iommufd_ctx *iommufd_ctx_from_fd(int fd);
 void iommufd_ctx_put(struct iommufd_ctx *ictx);
 bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group);
+void iommufd_ctx_set_kvm(struct iommufd_ctx *ictx, struct kvm *kvm);
 
 int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
 			     unsigned long length, struct page **out_pages,
@@ -80,6 +82,11 @@  static inline void iommufd_ctx_put(struct iommufd_ctx *ictx)
 {
 }
 
+static inline void iommufd_ctx_set_kvm(struct iommufd_ctx *ictx,
+				       struct kvm *kvm)
+{
+}
+
 static inline int iommufd_access_pin_pages(struct iommufd_access *access,
 					   unsigned long iova,
 					   unsigned long length,