[05/37] KVM: Record the executing ioctl number on the vcpu struct

Message ID 20171012104141.26902-6-christoffer.dall@linaro.org (mailing list archive)
State New, archived

Commit Message

Christoffer Dall Oct. 12, 2017, 10:41 a.m. UTC
Some architectures may decide to do different things during
kvm_arch_vcpu_load depending on the ioctl being executed.  For example,
arm64 is about to do significant work in vcpu load/put when running a
vcpu, but not when doing things like KVM_SET_ONE_REG or
KVM_SET_MP_STATE.

Therefore, store the ioctl number that we are executing on the VCPU
during the first vcpu_load() which succeeds in getting the vcpu->mutex
and set the ioctl number to 0 when exiting kvm_vcpu_ioctl() after
successfully loading the vcpu.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/x86/kvm/vmx.c       | 2 +-
 arch/x86/kvm/x86.c       | 8 ++++----
 include/linux/kvm_host.h | 3 ++-
 virt/kvm/kvm_main.c      | 6 ++++--
 4 files changed, 11 insertions(+), 8 deletions(-)
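
For context, a minimal sketch -- not part of this patch -- of the kind
of check the recorded ioctl enables in an architecture's load hook; the
KVM_RUN test is an assumption about the arm64 patches later in this
series:

 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
     /* Sketch only: skip the expensive state switch unless the
      * vcpu is actually about to run. */
     if (vcpu->ioctl != KVM_RUN)
         return;

     /* ... load sysregs, timer, and other heavy vcpu state ... */
 }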

Comments

Radim Krčmář Oct. 13, 2017, 5:13 p.m. UTC | #1
2017-10-12 12:41+0200, Christoffer Dall:
> Some architectures may decide to do different things during
> kvm_arch_vcpu_load depending on the ioctl being executed.  For example,
> arm64 is about to do significant work in vcpu load/put when running a
> vcpu, but not when doing things like KVM_SET_ONE_REG or
> KVM_SET_MP_STATE.
> 
> Therefore, store the ioctl number that we are executing on the VCPU
> during the first vcpu_load() which succeeds in getting the vcpu->mutex
> and set the ioctl number to 0 when exiting kvm_vcpu_ioctl() after
> successfully loading the vcpu.
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> ---
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> @@ -147,12 +147,13 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
>  /*
>   * Switches to specified vcpu, until a matching vcpu_put()
>   */
> -int vcpu_load(struct kvm_vcpu *vcpu)
> +int vcpu_load(struct kvm_vcpu *vcpu, unsigned int ioctl)
>  {
>  	int cpu;
>  
>  	if (mutex_lock_killable(&vcpu->mutex))
>  		return -EINTR;
> +	vcpu->ioctl = ioctl;

This seems to prevent races by protecting the store with a mutex, but

>  	cpu = get_cpu();
>  	preempt_notifier_register(&vcpu->preempt_notifier);
>  	kvm_arch_vcpu_load(vcpu, cpu);
> @@ -2529,7 +2530,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
>  #endif
>  
>  
> -	r = vcpu_load(vcpu);
> +	r = vcpu_load(vcpu, ioctl);
>  	if (r)
>  		return r;
>  	switch (ioctl) {
> @@ -2704,6 +2705,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
>  	}
>  out:
>  	vcpu_put(vcpu);
> +	vcpu->ioctl = 0;

we should still have a race as we clear ioctl only after releasing the
lock.  For example malicious userspace could break KVM terms of use and
issue !KVM_RUN followed by KVM_RUN, so we would have these races:

   !KVM_RUN                             |   KVM_RUN

   mutex_lock_killable(&vcpu->mutex);   |
   vcpu->ioctl = !KVM_RUN;              |
   ...                                  | mutex_lock_killable(&vcpu->mutex);
   mutex_unlock(&vcpu->mutex);          |
                                        | vcpu->ioctl = KVM_RUN;
                                        | kvm_arch_vcpu_load() // variant 1
   vcpu->ioctl = 0;                     | ...
                                        | kvm_arch_vcpu_load() // variant 2
                                        | vcpu_put()

where the observed value of vcpu->ioctl in vcpu_put() would not
correctly pair with vcpu_load() or worse, kvm_arch_vcpu_load() in
KVM_RUN would execute with vcpu->ioctl = 0.

I think that other (special) callsites of vcpu_load()/vcpu_put() have a
well defined IOCTL that can be used instead of vcpu->ioctl, so we could
just pass the ioctl value all the way to arch code and never save it
anywhere,

thanks.

>  	kfree(fpu);
>  	kfree(kvm_sregs);
>  	return r;
Christoffer Dall Oct. 13, 2017, 5:31 p.m. UTC | #2
On Fri, Oct 13, 2017 at 07:13:07PM +0200, Radim Krčmář wrote:
> 2017-10-12 12:41+0200, Christoffer Dall:
> > Some architectures may decide to do different things during
> > kvm_arch_vcpu_load depending on the ioctl being executed.  For example,
> > arm64 is about to do significant work in vcpu load/put when running a
> > vcpu, but not when doing things like KVM_SET_ONE_REG or
> > KVM_SET_MP_STATE.
> > 
> > Therefore, store the ioctl number that we are executing on the VCPU
> > during the first vcpu_load() which succeeds in getting the vcpu->mutex
> > and set the ioctl number to 0 when exiting kvm_vcpu_ioctl() after
> > successfully loading the vcpu.
> > 
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> > ---
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > @@ -147,12 +147,13 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
> >  /*
> >   * Switches to specified vcpu, until a matching vcpu_put()
> >   */
> > -int vcpu_load(struct kvm_vcpu *vcpu)
> > +int vcpu_load(struct kvm_vcpu *vcpu, unsigned int ioctl)
> >  {
> >  	int cpu;
> >  
> >  	if (mutex_lock_killable(&vcpu->mutex))
> >  		return -EINTR;
> > +	vcpu->ioctl = ioctl;
> 
> This seems to prevent races by protecting the store with a mutex, but
> 
> >  	cpu = get_cpu();
> >  	preempt_notifier_register(&vcpu->preempt_notifier);
> >  	kvm_arch_vcpu_load(vcpu, cpu);
> > @@ -2529,7 +2530,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
> >  #endif
> >  
> >  
> > -	r = vcpu_load(vcpu);
> > +	r = vcpu_load(vcpu, ioctl);
> >  	if (r)
> >  		return r;
> >  	switch (ioctl) {
> > @@ -2704,6 +2705,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
> >  	}
> >  out:
> >  	vcpu_put(vcpu);
> > +	vcpu->ioctl = 0;
> 
> we should still have a race as we clear ioctl only after releasing the
> lock.  For example malicious userspace could break KVM terms of use and
> issue !KVM_RUN followed by KVM_RUN, so we would have these races:
> 
>    !KVM_RUN                             |   KVM_RUN
> 
>    mutex_lock_killable(&vcpu->mutex);   |
>    vcpu->ioctl = !KVM_RUN;              |
>    ...                                  | mutex_lock_killable(&vcpu->mutex);
>    mutex_unlock(&vcpu->mutex);          |
>                                         | vcpu->ioctl = KVM_RUN;
>                                         | kvm_arch_vcpu_load() // variant 1
>    vcpu->ioctl = 0;                     | ...
>                                         | kvm_arch_vcpu_load() // variant 2
>                                         | vcpu_put()
> 
> where the observed value of vcpu->ioctl in vcpu_put() would not
> correctly pair with vcpu_load() or worse, kvm_arch_vcpu_load() in
> KVM_RUN would execute with vcpu->ioctl = 0.

Yeah, this is super racy, thanks for pointing that out.

> 
> I think that other (special) callsites of vcpu_load()/vcpu_put() have a
> well defined IOCTL that can be used instead of vcpu->ioctl, so we could
> just pass the ioctl value all the way to arch code and never save it
> anywhere,

I don't think that works; what would you do with preempt notifier calls?
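
For reference, a sketch of the preempt notifier path in kvm_main.c,
which makes the problem concrete -- this hook has no ioctl argument to
forward:

 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
 {
     struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

     /* Called on any reschedule back onto a CPU; only something
      * like vcpu->ioctl can tell the arch code which ioctl is in
      * flight at this point. */
     kvm_arch_vcpu_load(vcpu, cpu);
 }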

One solution is to add a parameter to vcpu_put(), like for vcpu_load(),
which also sets the ioctl: callers other than the final vcpu_put() in
kvm_vcpu_ioctl() just pass the existing value, while the kvm_vcpu_ioctl()
call passes 0, which gets set before releasing the mutex.

Can you think of a more elegant solution?

Thanks,
-Christoffer
Radim Krčmář Oct. 13, 2017, 6:38 p.m. UTC | #3
2017-10-13 19:31+0200, Christoffer Dall:
> On Fri, Oct 13, 2017 at 07:13:07PM +0200, Radim Krčmář wrote:
> > I think that other (special) callsites of vcpu_load()/vcpu_put() have a
> > well defined IOCTL that can be used instead of vcpu->ioctl, so we could
> > just pass the ioctl value all the way to arch code and never save it
> > anywhere,
> 
> I don't think that works; what would you do with preempt notifier calls?

Right, BUG :), I didn't consider them before and they need to know.

> One solution is to add a parameter to vcpu_put(), like for vcpu_load(),
> which also sets the ioctl: callers other than the final vcpu_put() in
> kvm_vcpu_ioctl() just pass the existing value, while the kvm_vcpu_ioctl()
> call passes 0, which gets set before releasing the mutex.
> 
> Can you think of a more elegant solution?

Not really, I only thought of touching preempt notifiers and it seems to
be more complicated.

I think we shouldn't restore ioctl on vcpu_put() at all -- the value
isn't well defined outside of the mutex, so there is no point in looking
at it and we can just zero the ioctl.

Actually, I wouldn't rely on the existing value at all because of that.
The need for load/put depends on the current code path, not on the one
we race with.

x86 seems to be the only user of vcpu_load() outside of kvm_vcpu_ioctl()
and the callers are either under a VM ioctl or under VM destruction
paths (invalid IOCTL) and we can just hardcode that.

Passing 0 to all other vcpu_load()s and unconditionally zeroing ioctl
before mutex_unlock() should work.
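
A minimal sketch of that scheme (illustrative, not the final patch;
vcpu_put() is assumed to otherwise keep its current body):

 int vcpu_load(struct kvm_vcpu *vcpu, unsigned int ioctl)
 {
     int cpu;

     if (mutex_lock_killable(&vcpu->mutex))
         return -EINTR;
     vcpu->ioctl = ioctl;    /* meaningful only while the mutex is held */
     cpu = get_cpu();
     preempt_notifier_register(&vcpu->preempt_notifier);
     kvm_arch_vcpu_load(vcpu, cpu);
     put_cpu();
     return 0;
 }

 void vcpu_put(struct kvm_vcpu *vcpu)
 {
     preempt_disable();
     kvm_arch_vcpu_put(vcpu);
     preempt_notifier_unregister(&vcpu->preempt_notifier);
     preempt_enable();
     vcpu->ioctl = 0;        /* zero before dropping the mutex */
     mutex_unlock(&vcpu->mutex);
 }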
Christoffer Dall Oct. 13, 2017, 6:51 p.m. UTC | #4
On Fri, Oct 13, 2017 at 8:38 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> 2017-10-13 19:31+0200, Christoffer Dall:
>> On Fri, Oct 13, 2017 at 07:13:07PM +0200, Radim Krčmář wrote:
>> > I think that other (special) callsites of vcpu_load()/vcpu_put() have a
>> > well defined IOCTL that can be used instead of vcpu->ioctl, so we could
>> > just pass the ioctl value all the way to arch code and never save it
>> > anywhere,
>>
>> I don't think that works; what would you do with preempt notifier calls?
>
> Right, BUG :), I didn't consider them before and they need to know.
>
>> One solution is to add a parameter to vcpu_put(), like for vcpu_load(),
>> which also sets the ioctl: callers other than the final vcpu_put() in
>> kvm_vcpu_ioctl() just pass the existing value, while the kvm_vcpu_ioctl()
>> call passes 0, which gets set before releasing the mutex.
>>
>> Can you think of a more elegant solution?
>
> Not really, I only thought of touching preempt notifiers and it seems to
> be more complicated.
>
> I think we shouldn't restore ioctl on vcpu_put() at all -- the value
> isn't well defined outside of the mutex, so there is no point in looking
> at it and we can just zero the ioctl.

I agree, that fits with the semantics of vcpu_put() quite nicely, actually.

>
> Actually, I wouldn't rely on the existing value at all because of that.
> The need for load/put depends on the current code path, not on the one
> we race with.
>
> x86 seems to be the only user of vcpu_load() outside of kvm_vcpu_ioctl()
> and the callers are either under a VM ioctl or under VM destruction
> paths (invalid IOCTL) and we can just hardcode that.

Yes, we should.  I honestly just wanted feedback on this before
tracing through all call paths to the vcpu_load() calls on the x86
side.

>
> Passing 0 to all other vcpu_load()s and unconditionally zeroing ioctl
> before mutex_unlock() should work.


Agreed.  I'll do that for the next version.

Thanks for having a look!

-Christoffer
Andrew Jones Nov. 7, 2017, 10:45 a.m. UTC | #5
On Thu, Oct 12, 2017 at 12:41:09PM +0200, Christoffer Dall wrote:
> Some architectures may decide to do different things during
> kvm_arch_vcpu_load depending on the ioctl being executed.  For example,
> arm64 is about to do significant work in vcpu load/put when running a
> vcpu, but not when doing things like KVM_SET_ONE_REG or
> KVM_SET_MP_STATE.
> 
> Therefore, store the ioctl number that we are executing on the VCPU
> during the first vcpu_load() which succeeds in getting the vcpu->mutex
> and set the ioctl number to 0 when exiting kvm_vcpu_ioctl() after
> successfully loading the vcpu.
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> ---
>  arch/x86/kvm/vmx.c       | 2 +-
>  arch/x86/kvm/x86.c       | 8 ++++----
>  include/linux/kvm_host.h | 3 ++-
>  virt/kvm/kvm_main.c      | 6 ++++--
>  4 files changed, 11 insertions(+), 8 deletions(-)
>

I wonder if enough other architectures would be able to benefit from this
for most/all of their non-RUN VCPU ioctls. If so, then maybe we should
consider doing something like

 int __vcpu_load(struct kvm_vcpu *vcpu)
 {
    int cpu;

    cpu = get_cpu();
    preempt_notifier_register(&vcpu->preempt_notifier);
    kvm_arch_vcpu_load(vcpu, cpu);
    put_cpu();
    return 0;
 }
 int vcpu_load(struct kvm_vcpu *vcpu)
 {
    if (mutex_lock_killable(&vcpu->mutex))
        return -EINTR;
    return __vcpu_load(vcpu);
 }

and the equivalent for vcpu_put.
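
The "equivalent for vcpu_put" would presumably split the same way (a
sketch, assuming vcpu_put() otherwise keeps its current body):

 void __vcpu_put(struct kvm_vcpu *vcpu)
 {
     preempt_disable();
     kvm_arch_vcpu_put(vcpu);
     preempt_notifier_unregister(&vcpu->preempt_notifier);
     preempt_enable();
 }

 void vcpu_put(struct kvm_vcpu *vcpu)
 {
     __vcpu_put(vcpu);
     mutex_unlock(&vcpu->mutex);
 }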

Then just take the lock in kvm_vcpu_ioctl and leave it to the
kvm_arch_vcpu_ioctl_* functions to call __vcpu_load/__vcpu_put
if necessary.

Thanks,
drew
Christoffer Dall Nov. 22, 2017, 8:28 p.m. UTC | #6
On Tue, Nov 07, 2017 at 11:45:01AM +0100, Andrew Jones wrote:
> On Thu, Oct 12, 2017 at 12:41:09PM +0200, Christoffer Dall wrote:
> > Some architectures may decide to do different things during
> > kvm_arch_vcpu_load depending on the ioctl being executed.  For example,
> > arm64 is about to do significant work in vcpu load/put when running a
> > vcpu, but not when doing things like KVM_SET_ONE_REG or
> > KVM_SET_MP_STATE.
> > 
> > Therefore, store the ioctl number that we are executing on the VCPU
> > during the first vcpu_load() which succeeds in getting the vcpu->mutex
> > and set the ioctl number to 0 when exiting kvm_vcpu_ioctl() after
> > successfully loading the vcpu.
> > 
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> > ---
> >  arch/x86/kvm/vmx.c       | 2 +-
> >  arch/x86/kvm/x86.c       | 8 ++++----
> >  include/linux/kvm_host.h | 3 ++-
> >  virt/kvm/kvm_main.c      | 6 ++++--
> >  4 files changed, 11 insertions(+), 8 deletions(-)
> >
> 
> I wonder if enough other architectures would be able to benefit from this
> for most/all of their non-RUN VCPU ioctls. If so, then maybe we should
> consider doing something like
> 
>  int __vcpu_load(struct kvm_vcpu *vcpu)
>  {
>     int cpu;
> 
>     cpu = get_cpu();
>     preempt_notifier_register(&vcpu->preempt_notifier);
>     kvm_arch_vcpu_load(vcpu, cpu);
>     put_cpu();
>     return 0;
>  }
>  int vcpu_load(struct kvm_vcpu *vcpu)
>  {
>     if (mutex_lock_killable(&vcpu->mutex))
>         return -EINTR;
>     return __vcpu_load(vcpu);
>  }
> 
> and the equivalent for vcpu_put.
> 
> Then just take the lock in kvm_vcpu_ioctl and leave it to the
> kvm_arch_vcpu_ioctl_* functions to call __vcpu_load/__vcpu_put
> if necessary.
> 
That seems like it would be a pretty invasive change, because we change
the semantics of when kvm_arch_vcpu_load() is called, and I'm not really
sure when it then does/doesn't make sense to call __vcpu_load/__vcpu_put
for each architecture.

But on ARM we only need preempt_notifiers for KVM_RUN, so if it's
similar on other architectures, it might not be that bad.

Was your intention for vcpu_load(), then, that x86 could remain
unmodified?

Thanks,
-Christoffer

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6970249..d729889 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9493,7 +9493,7 @@  static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int r;
 
-       r = vcpu_load(vcpu);
+       r = vcpu_load(vcpu, vcpu->ioctl);
        BUG_ON(r);
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
        free_nested(vmx);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd17b7d..68d9c95 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7723,7 +7723,7 @@  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	int r;
 
 	kvm_vcpu_mtrr_init(vcpu);
-	r = vcpu_load(vcpu);
+	r = vcpu_load(vcpu, vcpu->ioctl);
 	if (r)
 		return r;
 	kvm_vcpu_reset(vcpu, false);
@@ -7739,7 +7739,7 @@  void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 	kvm_hv_vcpu_postcreate(vcpu);
 
-	if (vcpu_load(vcpu))
+	if (vcpu_load(vcpu, vcpu->ioctl))
 		return;
 	msr.data = 0x0;
 	msr.index = MSR_IA32_TSC;
@@ -7759,7 +7759,7 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	int r;
 	vcpu->arch.apf.msr_val = 0;
 
-	r = vcpu_load(vcpu);
+	r = vcpu_load(vcpu, vcpu->ioctl);
 	BUG_ON(r);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
@@ -8116,7 +8116,7 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
 	int r;
-	r = vcpu_load(vcpu);
+	r = vcpu_load(vcpu, vcpu->ioctl);
 	BUG_ON(r);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6882538..da0acc0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -274,6 +274,7 @@  struct kvm_vcpu {
 	bool preempted;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
+	unsigned int ioctl; /* ioctl currently executing or 0 */
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -533,7 +534,7 @@  static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 
-int __must_check vcpu_load(struct kvm_vcpu *vcpu);
+int __must_check vcpu_load(struct kvm_vcpu *vcpu, unsigned int ioctl);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_HAVE_IOAPIC
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9deb5a2..1911ef0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -147,12 +147,13 @@  bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
-int vcpu_load(struct kvm_vcpu *vcpu)
+int vcpu_load(struct kvm_vcpu *vcpu, unsigned int ioctl)
 {
 	int cpu;
 
 	if (mutex_lock_killable(&vcpu->mutex))
 		return -EINTR;
+	vcpu->ioctl = ioctl;
 	cpu = get_cpu();
 	preempt_notifier_register(&vcpu->preempt_notifier);
 	kvm_arch_vcpu_load(vcpu, cpu);
@@ -2529,7 +2530,7 @@  static long kvm_vcpu_ioctl(struct file *filp,
 #endif
 
 
-	r = vcpu_load(vcpu);
+	r = vcpu_load(vcpu, ioctl);
 	if (r)
 		return r;
 	switch (ioctl) {
@@ -2704,6 +2705,7 @@  static long kvm_vcpu_ioctl(struct file *filp,
 	}
 out:
 	vcpu_put(vcpu);
+	vcpu->ioctl = 0;
 	kfree(fpu);
 	kfree(kvm_sregs);
 	return r;