diff mbox series

[v2] KVM: x86/xen: Implement hvm_op/HVMOP_flush_tlbs hypercall

Message ID 20230417122206.34647-2-metikaya@amazon.co.uk (mailing list archive)
State New, archived
Headers show
Series [v2] KVM: x86/xen: Implement hvm_op/HVMOP_flush_tlbs hypercall | expand

Commit Message

Kaya, Metin April 17, 2023, 12:22 p.m. UTC
HVMOP_flush_tlbs suboperation of hvm_op hypercall allows a guest to
flush all vCPU TLBs. There is no way for the VMM to flush TLBs from
userspace. Hence, this patch adds support for flushing vCPU TLBs to KVM
by making a KVM_REQ_TLB_FLUSH_GUEST request for all guest vCPUs.

Signed-off-by: Metin Kaya <metikaya@amazon.co.uk>

---
 arch/x86/kvm/xen.c                 | 31 ++++++++++++++++++++++++++++++
 include/xen/interface/hvm/hvm_op.h |  3 +++
 2 files changed, 34 insertions(+)

Comments

Sean Christopherson April 17, 2023, 4:31 p.m. UTC | #1
On Mon, Apr 17, 2023, Metin Kaya wrote:
> HVMOP_flush_tlbs suboperation of hvm_op hypercall allows a guest to
> flush all vCPU TLBs. There is no way for the VMM to flush TLBs from
> userspace.

Ah, took me a minute to connect the dots.  Monday morning is definitely partly
to blame, but it would be helpful to expand this sentence to be more explicit as
to why userspace's inability to efficiently flush TLBs.

And strictly speaking, userspace _can_ flush TLBs, just not in a precise, efficient
way.

> Hence, this patch adds support for flushing vCPU TLBs to KVM

Don't use "this patch", just state what the patch does as a command.

> by making a KVM_REQ_TLB_FLUSH_GUEST request for all guest vCPUs.

E.g.

  Implement in-KVM support for Xen's HVMOP_flush_tlbs hypercall, which
  allows the guest to flush all vCPU's TLBs.  KVM doesn't provide an ioctl()
  to precisely flush guest TLBs, and punting to userspace would likely
  negate the performance benefits of avoiding a TLB shootdown in the guest.

> Signed-off-by: Metin Kaya <metikaya@amazon.co.uk>
> 
> ---

Regarding the cover letter, first and foremost, make sure the cover letter has
a subject.  `git format-patch --cover-letter` will generate what you want, i.e.
there's no need hand generate it (or however you ended up with a nearly empty
mail with no subjection.

Second, there's no need to provide a cover letter for a standalone patch just to
capture the version information.  This area of the patch, between the three "---"
and the diff, is ignored by `git am`, and can be used for version info.

>  arch/x86/kvm/xen.c                 | 31 ++++++++++++++++++++++++++++++
>  include/xen/interface/hvm/hvm_op.h |  3 +++

Modifications to uapi headers is conspicuously missing.  I.e. there likely needs
to be a capability so that userspace can query support.

>  2 files changed, 34 insertions(+)
> 
> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index 40edf4d1974c..78fa6d08bebc 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -21,6 +21,7 @@
>  #include <xen/interface/vcpu.h>
>  #include <xen/interface/version.h>
>  #include <xen/interface/event_channel.h>
> +#include <xen/interface/hvm/hvm_op.h>
>  #include <xen/interface/sched.h>
>  
>  #include <asm/xen/cpuid.h>
> @@ -1330,6 +1331,32 @@ static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
>  	return false;
>  }
>  
> +static void kvm_xen_hvmop_flush_tlbs(struct kvm_vcpu *vcpu, bool longmode,

Passing @longmode all the way down here just to ignore just screams copy+paste :-)

> +				     u64 arg, u64 *r)
> +{
> +	if (arg) {
> +		*r = -EINVAL;
> +		return;
> +	}
> +
> +	kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);

This doesn't even compile.  I'll give you one guess as to how much confidence I
have that this was properly tested.

Aha!  And QEMU appears to have Xen emulation support.  That means KVM-Unit-Tests
is an option.  Specifically, extend the "access" test to use this hypercall instead
of INVLPG.  That'll verify that the flush is actually being performed as expteced.

> +	*r = 0;

Using a out param in a void function is silly.  But that's a moot point because
this whole function is silly, just open code it in the caller.

> +}
> +
> +static bool kvm_xen_hcall_hvm_op(struct kvm_vcpu *vcpu, bool longmode,

There's no need for @longmode.

> +				 int cmd, u64 arg, u64 *r)
> +{
> +	switch (cmd) {
> +	case HVMOP_flush_tlbs:
> +		kvm_xen_hvmop_flush_tlbs(vcpu, longmode, arg, r);

This can simply be:

		if (arg) {
			*r = -EINVAL;
		} else {
			kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_TLB_FLUSH_GUEST);
			*r = 0;
		}
		return true;

> +		return true;
> +	default:
> +		break;
> +	}
> +
> +	return false;
> +}
> +
>  struct compat_vcpu_set_singleshot_timer {
>      uint64_t timeout_abs_ns;
>      uint32_t flags;
> @@ -1501,6 +1528,10 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
>  			timeout |= params[1] << 32;
>  		handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
>  		break;
> +	case __HYPERVISOR_hvm_op:
> +		handled = kvm_xen_hcall_hvm_op(vcpu, longmode, params[0], params[1],
> +					       &r);

It'll be a moot point, but in the future let code like this poke out past 80 chars.
The 80 char soft limit exists to make code more readable, wrapping a one-char variable
at the tail end of a function call does more harm than good.

> +		break;
>  	}
>  	default:
>  		break;
> diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h
> index 03134bf3cec1..373123226c6f 100644
> --- a/include/xen/interface/hvm/hvm_op.h
> +++ b/include/xen/interface/hvm/hvm_op.h
> @@ -16,6 +16,9 @@ struct xen_hvm_param {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param);
>  
> +/* Flushes all VCPU TLBs:

s/VCPU/vCPU

And "all vCPU TLBs" is ambiguous.  It could mean "all TLBs for the target vCPU".
Adding "guest" in there would also be helpful, e.g. to make it clear that this
doesn't flush TDP mappings.

> @arg must be NULL. */

Is the "NULL" part verbatim from Xen?  Because "0" seems like a better description
than "NULL" for a u64.

E.g.

  "Flush guest TLBs for all vCPUs"

> +#define HVMOP_flush_tlbs            5
> +
>  /* Hint from PV drivers for pagetable destruction. */
>  #define HVMOP_pagetable_dying       9
>  struct xen_hvm_pagetable_dying {
> -- 
> 2.39.2
>
David Woodhouse April 18, 2023, 9:14 a.m. UTC | #2
On Mon, 2023-04-17 at 09:31 -0700, Sean Christopherson wrote:
> On Mon, Apr 17, 2023, Metin Kaya wrote:
> > HVMOP_flush_tlbs suboperation of hvm_op hypercall allows a guest to
> > flush all vCPU TLBs. There is no way for the VMM to flush TLBs from
> > userspace.
> 
> Ah, took me a minute to connect the dots.  Monday morning is definitely partly
> to blame, but it would be helpful to expand this sentence to be more explicit as
> to why userspace's inability to efficiently flush TLBs.
> 
> And strictly speaking, userspace _can_ flush TLBs, just not in a precise, efficient
> way.

Hm, how? We should probably implement that in userspace as a fallback,
however much it sucks.

> >  arch/x86/kvm/xen.c                 | 31 ++++++++++++++++++++++++++++++
> >  include/xen/interface/hvm/hvm_op.h |  3 +++
> 
> Modifications to uapi headers is conspicuously missing.  I.e. there likely needs
> to be a capability so that userspace can query support.

Nah, nobody cares. If the kernel "accelerates" this hypercall, so be
it. Userspace will just never get the KVM_EXIT_XEN for that hypercall
because it'll be magically handled, like the others.

> 
> > +                                  u64 arg, u64 *r)
> > +{
> > +     if (arg) {
> > +             *r = -EINVAL;
> > +             return;
> > +     }
> > +
> > +     kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
> 
> This doesn't even compile.  I'll give you one guess as to how much confidence I
> have that this was properly tested.
> 
> Aha!  And QEMU appears to have Xen emulation support. 

As of 8.0, yes :)

>  That means KVM-Unit-Tests
> is an option.  Specifically, extend the "access" test to use this hypercall instead
> of INVLPG.  That'll verify that the flush is actually being performed as expteced.
> 


> > +                              int cmd, u64 arg, u64 *r)
> > +{
> > +     switch (cmd) {
> > +     case HVMOP_flush_tlbs:
> > +             kvm_xen_hvmop_flush_tlbs(vcpu, longmode, arg, r);
> 
> This can simply be:
> 
>                 if (arg) {
>                         *r = -EINVAL;

I don't even know that we care about in-kernel acceleration for the
-EINVAL case. We could just return false for that, and let userspace
(report and) handle it.

It should never happen; Xen has never accepted anything but NULL as the
first argument. For reasons unclear.

    case HVMOP_flush_tlbs:
        rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -EINVAL;


I don't think we'll be adding any more HVMOPs to that switch statement
so we might as well make it:

 if (cmd == HVMOP_flush_tlbs && !arg)


>                 } else {
>                         kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_TLB_FLUSH_GUEST);
>                         *r = 0;
>                 }
>                 return true;

...

> > +/* Flushes all VCPU TLBs:
> 
> s/VCPU/vCPU
> 
> And "all vCPU TLBs" is ambiguous.  It could mean "all TLBs for the target vCPU".
> Adding "guest" in there would also be helpful, e.g. to make it clear that this
> doesn't flush TDP mappings.
> 
> > @arg must be NULL. */
> 
> Is the "NULL" part verbatim from Xen?  Because "0" seems like a better description
> than "NULL" for a u64.

Yes, that's all verbatim from Xen. The first hypercall argument is
typically a pointer.
Sean Christopherson April 18, 2023, 2:44 p.m. UTC | #3
On Tue, Apr 18, 2023, David Woodhouse wrote:
> On Mon, 2023-04-17 at 09:31 -0700, Sean Christopherson wrote:
> > On Mon, Apr 17, 2023, Metin Kaya wrote:
> > > HVMOP_flush_tlbs suboperation of hvm_op hypercall allows a guest to
> > > flush all vCPU TLBs. There is no way for the VMM to flush TLBs from
> > > userspace.
> > 
> > Ah, took me a minute to connect the dots.� Monday morning is definitely partly
> > to blame, but it would be helpful to expand this sentence to be more explicit as
> > to why userspace's inability to efficiently flush TLBs.
> > 
> > And strictly speaking, userspace _can_ flush TLBs, just not in a precise, efficient
> > way.
> 
> Hm, how? We should probably implement that in userspace as a fallback,
> however much it sucks.

Oh, the suckage is high :-)  Use KVM_{G,S}ET_SREGS2 to toggle any CR{0,3,4}/EFER
bit and __set_sregs() will reset the MMU context.  Note that without this fix[*]
that I'm going to squeeze into 6.4, the MMU context reset may result in all TDP
MMU roots being freed and reallocated.

[*] https://lore.kernel.org/all/20230413231251.1481410-1-seanjc@google.com

> 
> > > �arch/x86/kvm/xen.c���������������� | 31 ++++++++++++++++++++++++++++++
> > > �include/xen/interface/hvm/hvm_op.h |� 3 +++
> > 
> > Modifications to uapi headers is conspicuously missing.� I.e. there likely needs
> > to be a capability so that userspace can query support.
> 
> Nah, nobody cares. If the kernel "accelerates" this hypercall, so be
> it. Userspace will just never get the KVM_EXIT_XEN for that hypercall
> because it'll be magically handled, like the others.

Ah, that makes sense, I was thinking userspace would complain if it got the
"unexpected" exit.
David Woodhouse April 18, 2023, 3:47 p.m. UTC | #4
On Tue, 2023-04-18 at 07:44 -0700, Sean Christopherson wrote:
> On Tue, Apr 18, 2023, David Woodhouse wrote:
> > On Mon, 2023-04-17 at 09:31 -0700, Sean Christopherson wrote:
> > > On Mon, Apr 17, 2023, Metin Kaya wrote:
> > > > HVMOP_flush_tlbs suboperation of hvm_op hypercall allows a guest to
> > > > flush all vCPU TLBs. There is no way for the VMM to flush TLBs from
> > > > userspace.
> > > 
> > > Ah, took me a minute to connect the dots.� Monday morning is definitely partly
> > > to blame, but it would be helpful to expand this sentence to be more explicit as
> > > to why userspace's inability to efficiently flush TLBs.
> > > 
> > > And strictly speaking, userspace _can_ flush TLBs, just not in a precise, efficient
> > > way.
> > 
> > Hm, how? We should probably implement that in userspace as a fallback,
> > however much it sucks.
> 
> Oh, the suckage is high :-)  Use KVM_{G,S}ET_SREGS2 to toggle any CR{0,3,4}/EFER
> bit and __set_sregs() will reset the MMU context.

Oh, that suckage is quite high indeed. I'm not actually sure I'll
bother doing this in QEMU. It's quite esoteric; Linux has never used it
and I think that after launching <mumble> million production Xen guests
this way we've only ever seen it used by some FreeBSD variant.

It doesn't seem to be in mainline FreeBSD; there's a hint of it here
https://people.freebsd.org/~dfr/freebsd-6.x-xen-31032009.diff for
example but it's disabled even then:

 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
-		invltlb();
-		smp_invltlb();
+#if defined(XENHVM) && defined(notdef)
+		/*
+		 * As far as I can tell, this makes things slower, at
+		 * least where there are only two physical cpus and
+		 * the host is not overcommitted.
+		 */
+		if (is_running_on_xen()) {
+			HYPERVISOR_hvm_op(HVMOP_flush_tlbs, NULL);
+		} else
+#endif
+		{
+			invltlb();
+			smp_invltlb();
+		}

>  Note that without this fix[*]
> that I'm going to squeeze into 6.4, the MMU context reset may result in all TDP
> MMU roots being freed and reallocated.
> 
> [*] https://lore.kernel.org/all/20230413231251.1481410-1-seanjc@google.com
> 
> > 
> > > > �arch/x86/kvm/xen.c���������������� | 31 ++++++++++++++++++++++++++++++
> > > > �include/xen/interface/hvm/hvm_op.h |� 3 +++
> > > 
> > > Modifications to uapi headers is conspicuously missing.� I.e. there likely needs
> > > to be a capability so that userspace can query support.
> > 
> > Nah, nobody cares. If the kernel "accelerates" this hypercall, so be
> > it. Userspace will just never get the KVM_EXIT_XEN for that hypercall
> > because it'll be magically handled, like the others.
> 
> Ah, that makes sense, I was thinking userspace would complain if it got the
> "unexpected" exit.

I've tried to follow a model where userspace is *always* expected to
implement the hypercall, and if the kernel chooses to intervene to
accelerate it, that's a bonus.

It saves a bunch of complexity in error handling in the kernel when we
can just say "screw this, let userspace cope".
David Woodhouse April 18, 2023, 4:08 p.m. UTC | #5
On Tue, 2023-04-18 at 16:47 +0100, David Woodhouse wrote:
> 
> Oh, that suckage is quite high indeed. I'm not actually sure I'll
> bother doing this in QEMU. It's quite esoteric; Linux has never used it
> and I think that after launching <mumble> million production Xen guests
> this way we've only ever seen it used by some FreeBSD variant.
> 
> It doesn't seem to be in mainline FreeBSD; there's a hint of it here
> https://people.freebsd.org/~dfr/freebsd-6.x-xen-31032009.diff for
> example but it's disabled even then:

For the record, Metin and Paul tell me I lied; it was actually seen
with a Windows guest using the ancient closed-source Citrix drivers.
diff mbox series

Patch

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 40edf4d1974c..78fa6d08bebc 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -21,6 +21,7 @@ 
 #include <xen/interface/vcpu.h>
 #include <xen/interface/version.h>
 #include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
 #include <xen/interface/sched.h>
 
 #include <asm/xen/cpuid.h>
@@ -1330,6 +1331,32 @@  static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
 	return false;
 }
 
+static void kvm_xen_hvmop_flush_tlbs(struct kvm_vcpu *vcpu, bool longmode,
+				     u64 arg, u64 *r)
+{
+	if (arg) {
+		*r = -EINVAL;
+		return;
+	}
+
+	kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
+	*r = 0;
+}
+
+static bool kvm_xen_hcall_hvm_op(struct kvm_vcpu *vcpu, bool longmode,
+				 int cmd, u64 arg, u64 *r)
+{
+	switch (cmd) {
+	case HVMOP_flush_tlbs:
+		kvm_xen_hvmop_flush_tlbs(vcpu, longmode, arg, r);
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
 struct compat_vcpu_set_singleshot_timer {
     uint64_t timeout_abs_ns;
     uint32_t flags;
@@ -1501,6 +1528,10 @@  int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 			timeout |= params[1] << 32;
 		handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
 		break;
+	case __HYPERVISOR_hvm_op:
+		handled = kvm_xen_hcall_hvm_op(vcpu, longmode, params[0], params[1],
+					       &r);
+		break;
 	}
 	default:
 		break;
diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h
index 03134bf3cec1..373123226c6f 100644
--- a/include/xen/interface/hvm/hvm_op.h
+++ b/include/xen/interface/hvm/hvm_op.h
@@ -16,6 +16,9 @@  struct xen_hvm_param {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param);
 
+/* Flushes all VCPU TLBs: @arg must be NULL. */
+#define HVMOP_flush_tlbs            5
+
 /* Hint from PV drivers for pagetable destruction. */
 #define HVMOP_pagetable_dying       9
 struct xen_hvm_pagetable_dying {