
KVM: Fix error path in kvm_vm_ioctl_create_vcpu() on xa_store() failure

Message ID 20240730155646.1687-1-will@kernel.org (mailing list archive)
State New, archived

Commit Message

Will Deacon July 30, 2024, 3:56 p.m. UTC
If the xa_store() fails in kvm_vm_ioctl_create_vcpu() then we shouldn't
drop the reference to the 'struct kvm' because the vCPU fd has been
installed and will take care of the refcounting.

This was found by inspection, but forcing the xa_store() to fail
confirms the problem:

 | Unable to handle kernel paging request at virtual address ffff800080ecd960
 | Call trace:
 |  _raw_spin_lock_irq+0x2c/0x70
 |  kvm_irqfd_release+0x24/0xa0
 |  kvm_vm_release+0x1c/0x38
 |  __fput+0x88/0x2ec
 |  ____fput+0x10/0x1c
 |  task_work_run+0xb0/0xd4
 |  do_exit+0x210/0x854
 |  do_group_exit+0x70/0x98
 |  get_signal+0x6b0/0x73c
 |  do_signal+0xa4/0x11e8
 |  do_notify_resume+0x60/0x12c
 |  el0_svc+0x64/0x68
 |  el0t_64_sync_handler+0x84/0xfc
 |  el0t_64_sync+0x190/0x194
 | Code: b9000909 d503201f 2a1f03e1 52800028 (88e17c08)

Add a new label to the error path so that we can branch directly to the
xa_release() if the xa_store() fails.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Michal Luczaj <mhal@rbox.co>
Cc: Alexander Potapenko <glider@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 virt/kvm/kvm_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Comments

Michal Luczaj July 30, 2024, 6:55 p.m. UTC | #1
On 7/30/24 17:56, Will Deacon wrote:
> If the xa_store() fails in kvm_vm_ioctl_create_vcpu() then we shouldn't
> drop the reference to the 'struct kvm' because the vCPU fd has been
> installed and will take care of the refcounting.
> 
> This was found by inspection, but forcing the xa_store() to fail
> confirms the problem:
> 
>  | Unable to handle kernel paging request at virtual address ffff800080ecd960
>  | Call trace:
>  |  _raw_spin_lock_irq+0x2c/0x70
>  |  kvm_irqfd_release+0x24/0xa0
>  |  kvm_vm_release+0x1c/0x38
>  |  __fput+0x88/0x2ec
>  |  ____fput+0x10/0x1c
>  |  task_work_run+0xb0/0xd4
>  |  do_exit+0x210/0x854
>  |  do_group_exit+0x70/0x98
>  |  get_signal+0x6b0/0x73c
>  |  do_signal+0xa4/0x11e8
>  |  do_notify_resume+0x60/0x12c
>  |  el0_svc+0x64/0x68
>  |  el0t_64_sync_handler+0x84/0xfc
>  |  el0t_64_sync+0x190/0x194
>  | Code: b9000909 d503201f 2a1f03e1 52800028 (88e17c08)
> 
> Add a new label to the error path so that we can branch directly to the
> xa_release() if the xa_store() fails.
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Michal Luczaj <mhal@rbox.co>
> Cc: Alexander Potapenko <glider@google.com>
> Cc: Marc Zyngier <maz@kernel.org>
> Signed-off-by: Will Deacon <will@kernel.org>
> ---
>  virt/kvm/kvm_main.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d0788d0a72cc..b80dd8cead8c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4293,7 +4293,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>  
>  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
>  		r = -EINVAL;
> -		goto kvm_put_xa_release;
> +		goto err_xa_release;
>  	}
>  
>  	/*
> @@ -4310,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>  
>  kvm_put_xa_release:
>  	kvm_put_kvm_no_destroy(kvm);
> +err_xa_release:
>  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
>  unlock_vcpu_destroy:
>  	mutex_unlock(&kvm->lock);

My bad for neglecting the "impossible" path. Thanks for the fix.

I wonder if it's complete. If we really want to consider the possibility of
this xa_store() failing, then keeping vCPU fd installed and calling
kmem_cache_free(kvm_vcpu_cache, vcpu) on the error path looks wrong.
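
For reference, a rough sketch of the error path's tail being discussed (labels
paraphrased from virt/kvm/kvm_main.c of that era, so treat it as illustrative):

unlock_vcpu_destroy:
	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_destroy(vcpu);		/* tears down arch state */
vcpu_free_run_page:
	free_page((unsigned long)vcpu->run);
vcpu_free:
	kmem_cache_free(kvm_vcpu_cache, vcpu);	/* frees a vCPU that userspace
						 * can still reach via the
						 * installed fd */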
Sean Christopherson July 30, 2024, 11:31 p.m. UTC | #2
On Tue, Jul 30, 2024, Michal Luczaj wrote:
> On 7/30/24 17:56, Will Deacon wrote:
> > If the xa_store() fails in kvm_vm_ioctl_create_vcpu() then we shouldn't
> > drop the reference to the 'struct kvm' because the vCPU fd has been
> > installed and will take care of the refcounting.
> > 
> > This was found by inspection, but forcing the xa_store() to fail
> > confirms the problem:
> > 
> >  | Unable to handle kernel paging request at virtual address ffff800080ecd960
> >  | Call trace:
> >  |  _raw_spin_lock_irq+0x2c/0x70
> >  |  kvm_irqfd_release+0x24/0xa0
> >  |  kvm_vm_release+0x1c/0x38
> >  |  __fput+0x88/0x2ec
> >  |  ____fput+0x10/0x1c
> >  |  task_work_run+0xb0/0xd4
> >  |  do_exit+0x210/0x854
> >  |  do_group_exit+0x70/0x98
> >  |  get_signal+0x6b0/0x73c
> >  |  do_signal+0xa4/0x11e8
> >  |  do_notify_resume+0x60/0x12c
> >  |  el0_svc+0x64/0x68
> >  |  el0t_64_sync_handler+0x84/0xfc
> >  |  el0t_64_sync+0x190/0x194
> >  | Code: b9000909 d503201f 2a1f03e1 52800028 (88e17c08)
> > 
> > Add a new label to the error path so that we can branch directly to the
> > xa_release() if the xa_store() fails.
> > 
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Michal Luczaj <mhal@rbox.co>
> > Cc: Alexander Potapenko <glider@google.com>
> > Cc: Marc Zyngier <maz@kernel.org>
> > Signed-off-by: Will Deacon <will@kernel.org>
> > ---
> >  virt/kvm/kvm_main.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index d0788d0a72cc..b80dd8cead8c 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -4293,7 +4293,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> >  
> >  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
> >  		r = -EINVAL;
> > -		goto kvm_put_xa_release;
> > +		goto err_xa_release;
> >  	}
> >  
> >  	/*
> > @@ -4310,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> >  
> >  kvm_put_xa_release:
> >  	kvm_put_kvm_no_destroy(kvm);
> > +err_xa_release:
> >  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
> >  unlock_vcpu_destroy:
> >  	mutex_unlock(&kvm->lock);
> 
> My bad for neglecting the "impossible" path. Thanks for the fix.
> 
> I wonder if it's complete. If we really want to consider the possibility of
> this xa_store() failing, then keeping vCPU fd installed and calling
> kmem_cache_free(kvm_vcpu_cache, vcpu) on the error path looks wrong.

Yeah, the vCPU is exposed to userspace, freeing its assets will just cause
different problems.  KVM_BUG_ON() will prevent _new_ vCPU ioctl() calls (and kick
running vCPUs out of the guest), but it doesn't interrupt other CPUs, e.g. if
userspace is being sneaky and has already invoked a vCPU ioctl(), KVM will hit
a use-after-free (several of them).

As Michal alluded to, it should be impossible for xa_store() to fail since KVM
pre-allocates/reserves memory.  Given that, deliberately leaking the vCPU seems
like the least awful "solution".
Will Deacon July 31, 2024, 1:31 p.m. UTC | #3
On Tue, Jul 30, 2024 at 04:31:08PM -0700, Sean Christopherson wrote:
> On Tue, Jul 30, 2024, Michal Luczaj wrote:
> > On 7/30/24 17:56, Will Deacon wrote:
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index d0788d0a72cc..b80dd8cead8c 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -4293,7 +4293,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> > >  
> > >  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
> > >  		r = -EINVAL;
> > > -		goto kvm_put_xa_release;
> > > +		goto err_xa_release;
> > >  	}
> > >  
> > >  	/*
> > > @@ -4310,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> > >  
> > >  kvm_put_xa_release:
> > >  	kvm_put_kvm_no_destroy(kvm);
> > > +err_xa_release:
> > >  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
> > >  unlock_vcpu_destroy:
> > >  	mutex_unlock(&kvm->lock);
> > 
> > My bad for neglecting the "impossible" path. Thanks for the fix.
> > 
> > I wonder if it's complete. If we really want to consider the possibility of
> > this xa_store() failing, then keeping vCPU fd installed and calling
> > kmem_cache_free(kvm_vcpu_cache, vcpu) on the error path looks wrong.
> 
> Yeah, the vCPU is exposed to userspace, freeing its assets will just cause
> different problems.  KVM_BUG_ON() will prevent _new_ vCPU ioctl() calls (and kick
> running vCPUs out of the guest), but it doesn't interrupt other CPUs, e.g. if
> userspace is being sneaky and has already invoked a vCPU ioctl(), KVM will hit
> a use-after-free (several of them).

Damn, yes. Just because we haven't returned the fd yet, doesn't mean
userspace can't make use of it.

> As Michal alluded to, it should be impossible for xa_store() to fail since KVM
> pre-allocates/reserves memory.  Given that, deliberately leaking the vCPU seems
> like the least awful "solution".

Could we actually just move the xa_store() before the fd creation? I
can't immediately see any issues with that...

Will
Michal Luczaj July 31, 2024, 3:49 p.m. UTC | #4
On 7/31/24 15:31, Will Deacon wrote:
> On Tue, Jul 30, 2024 at 04:31:08PM -0700, Sean Christopherson wrote:
>> On Tue, Jul 30, 2024, Michal Luczaj wrote:
>>> On 7/30/24 17:56, Will Deacon wrote:
>>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>>> index d0788d0a72cc..b80dd8cead8c 100644
>>>> --- a/virt/kvm/kvm_main.c
>>>> +++ b/virt/kvm/kvm_main.c
>>>> @@ -4293,7 +4293,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>>>>  
>>>>  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
>>>>  		r = -EINVAL;
>>>> -		goto kvm_put_xa_release;
>>>> +		goto err_xa_release;
>>>>  	}
>>>>  
>>>>  	/*
>>>> @@ -4310,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>>>>  
>>>>  kvm_put_xa_release:
>>>>  	kvm_put_kvm_no_destroy(kvm);
>>>> +err_xa_release:
>>>>  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
>>>>  unlock_vcpu_destroy:
>>>>  	mutex_unlock(&kvm->lock);
>>>
>>> My bad for neglecting the "impossible" path. Thanks for the fix.
>>>
>>> I wonder if it's complete. If we really want to consider the possibility of
>>> this xa_store() failing, then keeping vCPU fd installed and calling
>>> kmem_cache_free(kvm_vcpu_cache, vcpu) on the error path looks wrong.
>>
>> Yeah, the vCPU is exposed to userspace, freeing its assets will just cause
>> different problems.  KVM_BUG_ON() will prevent _new_ vCPU ioctl() calls (and kick
>> running vCPUs out of the guest), but it doesn't interrupt other CPUs, e.g. if
>> userspace is being sneaky and has already invoked a vCPU ioctl(), KVM will hit
>> a use-after-free (several of them).
> 
> Damn, yes. Just because we haven't returned the fd yet, doesn't mean
> userspace can't make use of it.
>
>> As Michal alluded to, it should be impossible for xa_store() to fail since KVM
>> pre-allocates/reserves memory.  Given that, deliberately leaking the vCPU seems
>> like the least awful "solution".
> 
> Could we actually just move the xa_store() before the fd creation? I
> can't immediately see any issues with that...

Hah, please see commit afb2acb2e3a3 :) Long story short: create_vcpu_fd()
can legally fail, which must be handled gracefully, which would involve
destruction of an already xa_store()ed vCPU, which is racy.
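
A sketch of the ordering that commit reverted (illustrative only, not the
exact code):

	/* vCPU published via vcpu_array before the fd exists: */
	xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0);

	r = create_vcpu_fd(vcpu);	/* can legitimately fail, e.g. -EMFILE */
	if (r < 0) {
		/*
		 * Racy: another CPU may already have looked the vCPU up via
		 * kvm_get_vcpu() and still be using it when it is erased
		 * and freed below.
		 */
		xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
		goto unlock_vcpu_destroy;
	}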
Sean Christopherson July 31, 2024, 4:18 p.m. UTC | #5
On Wed, Jul 31, 2024, Michal Luczaj wrote:
> On 7/31/24 15:31, Will Deacon wrote:
> > On Tue, Jul 30, 2024 at 04:31:08PM -0700, Sean Christopherson wrote:
> >> On Tue, Jul 30, 2024, Michal Luczaj wrote:
> >>> On 7/30/24 17:56, Will Deacon wrote:
> >>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> >>>> index d0788d0a72cc..b80dd8cead8c 100644
> >>>> --- a/virt/kvm/kvm_main.c
> >>>> +++ b/virt/kvm/kvm_main.c
> >>>> @@ -4293,7 +4293,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> >>>>  
> >>>>  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
> >>>>  		r = -EINVAL;
> >>>> -		goto kvm_put_xa_release;
> >>>> +		goto err_xa_release;
> >>>>  	}
> >>>>  
> >>>>  	/*
> >>>> @@ -4310,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> >>>>  
> >>>>  kvm_put_xa_release:
> >>>>  	kvm_put_kvm_no_destroy(kvm);
> >>>> +err_xa_release:
> >>>>  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
> >>>>  unlock_vcpu_destroy:
> >>>>  	mutex_unlock(&kvm->lock);
> >>>
> >>> My bad for neglecting the "impossible" path. Thanks for the fix.
> >>>
> >>> I wonder if it's complete. If we really want to consider the possibility of
> >>> this xa_store() failing, then keeping vCPU fd installed and calling
> >>> kmem_cache_free(kvm_vcpu_cache, vcpu) on the error path looks wrong.
> >>
> >> Yeah, the vCPU is exposed to userspace, freeing its assets will just cause
> >> different problems.  KVM_BUG_ON() will prevent _new_ vCPU ioctl() calls (and kick
> >> running vCPUs out of the guest), but it doesn't interrupt other CPUs, e.g. if
> >> userspace is being sneaky and has already invoked a vCPU ioctl(), KVM will hit
> >> a use-after-free (several of them).
> > 
> > Damn, yes. Just because we haven't returned the fd yet, doesn't mean
> > userspace can't make use of it.
> >
> >> As Michal alluded to, it should be impossible for xa_store() to fail since KVM
> >> pre-allocates/reserves memory.  Given that, deliberately leaking the vCPU seems
> >> like the least awful "solution".
> > 
> > Could we actually just move the xa_store() before the fd creation? I
> > can't immediately see any issues with that...
> 
> Hah, please see commit afb2acb2e3a3 :) Long story short: create_vcpu_fd()
> can legally fail, which must be handled gracefully, which would involve
> destruction of an already xa_store()ed vCPU, which is racy.

Ya, the basic problem is that we have two ways of publishing the vCPU, fd and
vcpu_array, with no way of setting both atomically.  Given that xa_store() should
never fail, I vote we do the simple thing and deliberately leak the memory.
Michal Luczaj July 31, 2024, 7:27 p.m. UTC | #6
On 7/31/24 18:18, Sean Christopherson wrote:
> On Wed, Jul 31, 2024, Michal Luczaj wrote:
>> On 7/31/24 15:31, Will Deacon wrote:
>>> On Tue, Jul 30, 2024 at 04:31:08PM -0700, Sean Christopherson wrote:
>>>> On Tue, Jul 30, 2024, Michal Luczaj wrote:
>>>>> On 7/30/24 17:56, Will Deacon wrote:
>>>>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>>>>> index d0788d0a72cc..b80dd8cead8c 100644
>>>>>> --- a/virt/kvm/kvm_main.c
>>>>>> +++ b/virt/kvm/kvm_main.c
>>>>>> @@ -4293,7 +4293,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>>>>>>  
>>>>>>  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
>>>>>>  		r = -EINVAL;
>>>>>> -		goto kvm_put_xa_release;
>>>>>> +		goto err_xa_release;
>>>>>>  	}
>>>>>>  
>>>>>>  	/*
>>>>>> @@ -4310,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>>>>>>  
>>>>>>  kvm_put_xa_release:
>>>>>>  	kvm_put_kvm_no_destroy(kvm);
>>>>>> +err_xa_release:
>>>>>>  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
>>>>>>  unlock_vcpu_destroy:
>>>>>>  	mutex_unlock(&kvm->lock);
>>>>>
>>>>> My bad for neglecting the "impossible" path. Thanks for the fix.
>>>>>
>>>>> I wonder if it's complete. If we really want to consider the possibility of
>>>>> this xa_store() failing, then keeping vCPU fd installed and calling
>>>>> kmem_cache_free(kvm_vcpu_cache, vcpu) on the error path looks wrong.
>>>>
>>>> Yeah, the vCPU is exposed to userspace, freeing its assets will just cause
>>>> different problems.  KVM_BUG_ON() will prevent _new_ vCPU ioctl() calls (and kick
>>>> running vCPUs out of the guest), but it doesn't interrupt other CPUs, e.g. if
>>>> userspace is being sneaky and has already invoked a vCPU ioctl(), KVM will hit
>>>> a use-after-free (several of them).
>>>
>>> Damn, yes. Just because we haven't returned the fd yet, doesn't mean
>>> userspace can't make use of it.
>>>
>>>> As Michal alluded to, it should be impossible for xa_store() to fail since KVM
>>>> pre-allocates/reserves memory.  Given that, deliberately leaking the vCPU seems
>>>> like the least awful "solution".
>>>
>>> Could we actually just move the xa_store() before the fd creation? I
>>> can't immediately see any issues with that...
>>
>> Hah, please see commit afb2acb2e3a3 :) Long story short: create_vcpu_fd()
>> can legally fail, which must be handled gracefully, which would involve
>> destruction of an already xa_store()ed vCPU, which is racy.
> 
> Ya, the basic problem is that we have two ways of publishing the vCPU, fd and
> vcpu_array, with no way of setting both atomically.  Given that xa_store() should
> never fail, I vote we do the simple thing and deliberately leak the memory.

I agree it's a good idea. So for a failed xa_store(), just drop the goto?
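
I.e., something like this sketch (note the kvm->lock unlock still has to
happen before bailing out):

	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
		/*
		 * Deliberately leak the vCPU: the fd is already installed, so
		 * userspace may reach the vCPU, and the fd's release path now
		 * owns the reference to 'kvm'.
		 */
		mutex_unlock(&kvm->lock);
		return -EINVAL;
	}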
Will Deacon Aug. 1, 2024, 12:41 p.m. UTC | #7
On Wed, Jul 31, 2024 at 09:18:56AM -0700, Sean Christopherson wrote:
> On Wed, Jul 31, 2024, Michal Luczaj wrote:
> > On 7/31/24 15:31, Will Deacon wrote:
> > > On Tue, Jul 30, 2024 at 04:31:08PM -0700, Sean Christopherson wrote:
> > >> On Tue, Jul 30, 2024, Michal Luczaj wrote:
> > >>> On 7/30/24 17:56, Will Deacon wrote:
> > >>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > >>>> index d0788d0a72cc..b80dd8cead8c 100644
> > >>>> --- a/virt/kvm/kvm_main.c
> > >>>> +++ b/virt/kvm/kvm_main.c
> > >>>> @@ -4293,7 +4293,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> > >>>>  
> > >>>>  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
> > >>>>  		r = -EINVAL;
> > >>>> -		goto kvm_put_xa_release;
> > >>>> +		goto err_xa_release;
> > >>>>  	}
> > >>>>  
> > >>>>  	/*
> > >>>> @@ -4310,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> > >>>>  
> > >>>>  kvm_put_xa_release:
> > >>>>  	kvm_put_kvm_no_destroy(kvm);
> > >>>> +err_xa_release:
> > >>>>  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
> > >>>>  unlock_vcpu_destroy:
> > >>>>  	mutex_unlock(&kvm->lock);
> > >>>
> > >>> My bad for neglecting the "impossible" path. Thanks for the fix.
> > >>>
> > >>> I wonder if it's complete. If we really want to consider the possibility of
> > >>> this xa_store() failing, then keeping vCPU fd installed and calling
> > >>> kmem_cache_free(kvm_vcpu_cache, vcpu) on the error path looks wrong.
> > >>
> > >> Yeah, the vCPU is exposed to userspace, freeing its assets will just cause
> > >> different problems.  KVM_BUG_ON() will prevent _new_ vCPU ioctl() calls (and kick
> > >> running vCPUs out of the guest), but it doesn't interrupt other CPUs, e.g. if
> > >> userspace is being sneaky and has already invoked a vCPU ioctl(), KVM will hit
> > >> a use-after-free (several of them).
> > > 
> > > Damn, yes. Just because we haven't returned the fd yet, doesn't mean
> > > userspace can't make use of it.
> > >
> > >> As Michal alluded to, it should be impossible for xa_store() to fail since KVM
> > >> pre-allocates/reserves memory.  Given that, deliberately leaking the vCPU seems
> > >> like the least awful "solution".
> > > 
> > > Could we actually just move the xa_store() before the fd creation? I
> > > can't immediately see any issues with that...
> > 
> > Hah, please see commit afb2acb2e3a3 :) Long story short: create_vcpu_fd()
> > can legally fail, which must be handled gracefully, which would involve
> > destruction of an already xa_store()ed vCPU, which is racy.
> 
> Ya, the basic problem is that we have two ways of publishing the vCPU, fd and
> vcpu_array, with no way of setting both atomically.  Given that xa_store() should
> never fail, I vote we do the simple thing and deliberately leak the memory.

I'm inclined to agree. This conversation did momentarily get me worried
about the window between the successful create_vcpu_fd() and the
xa_store(), but it looks like 'kvm->online_vcpus' protects that.

I'll spin a v2 leaking the vCPU, then.

Will
Michal Luczaj Aug. 4, 2024, 9:05 p.m. UTC | #8
On 8/1/24 14:41, Will Deacon wrote:
> On Wed, Jul 31, 2024 at 09:18:56AM -0700, Sean Christopherson wrote:
>> [...]
>> Ya, the basic problem is that we have two ways of publishing the vCPU, fd and
>> vcpu_array, with no way of setting both atomically.  Given that xa_store() should
>> never fail, I vote we do the simple thing and deliberately leak the memory.
> 
> I'm inclined to agree. This conversation did momentarily get me worried
> about the window between the successful create_vcpu_fd() and the
> xa_store(), but it looks like 'kvm->online_vcpus' protects that.
> 
> I'll spin a v2 leaking the vCPU, then.

But perhaps you're right. The window you've described may be an issue.
For example:

static u64 get_time_ref_counter(struct kvm *kvm)
{
	...
	vcpu = kvm_get_vcpu(kvm, 0); // may still be NULL
	tsc = kvm_read_l1_tsc(vcpu, rdtsc());
	return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
		+ hv->tsc_ref.tsc_offset;
}

u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
	return vcpu->arch.l1_tsc_offset +
		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
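
For context, kvm_get_vcpu() is roughly the following (paraphrased from
include/linux/kvm_host.h), which is why index 0 can resolve to NULL in the
window before vcpu_array is populated:

static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
{
	int num_vcpus = atomic_read(&kvm->online_vcpus);

	i = array_index_nospec(i, num_vcpus);	/* clamps i to 0 while num_vcpus == 0 */
	return xa_load(&kvm->vcpu_array, i);	/* NULL until xa_store() runs */
}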

After stuffing msleep() between fd install and vcpu_array store:

[  125.296110] BUG: kernel NULL pointer dereference, address: 0000000000000b38
[  125.296203] #PF: supervisor read access in kernel mode
[  125.296266] #PF: error_code(0x0000) - not-present page
[  125.296327] PGD 12539e067 P4D 12539e067 PUD 12539d067 PMD 0
[  125.296392] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[  125.296454] CPU: 12 UID: 1000 PID: 1179 Comm: a.out Not tainted 6.11.0-rc1nokasan+ #19
[  125.296521] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[  125.296585] RIP: 0010:kvm_read_l1_tsc+0x6/0x50 [kvm]
[  125.297376] Call Trace:
[  125.297430]  <TASK>
[  125.297919]  get_time_ref_counter+0x70/0x90 [kvm]
[  125.298039]  kvm_hv_get_msr_common+0xc1/0x7d0 [kvm]
[  125.298150]  __kvm_get_msr+0x72/0xf0 [kvm]
[  125.298421]  do_get_msr+0x16/0x50 [kvm]
[  125.298531]  msr_io+0x9d/0x110 [kvm]
[  125.298626]  kvm_arch_vcpu_ioctl+0xdc5/0x19c0 [kvm]
[  125.299345]  kvm_vcpu_ioctl+0x6cc/0x920 [kvm]
[  125.299540]  __x64_sys_ioctl+0x90/0xd0
[  125.299582]  do_syscall_64+0x93/0x180
[  125.300206]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  125.300243] RIP: 0033:0x7f2d64aded2d

So, is get_time_ref_counter() broken (with a trivial fix) or should it be
considered a regression after commit afb2acb2e3a3
("KVM: Fix vcpu_array[0] races")?

Note that KASAN build, after null ptr oops, reports:

[ 3528.449742] BUG: KASAN: slab-use-after-free in mutex_can_spin_on_owner+0x18b/0x1b0
[ 3528.449884] Read of size 4 at addr ffff88814a040034 by task a.out/1240
[ 3528.450135] CPU: 6 UID: 1000 PID: 1240 Comm: a.out Tainted: G      D            6.11.0-rc1+ #20
[ 3528.450289] Tainted: [D]=DIE
[ 3528.450412] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[ 3528.450551] Call Trace:
[ 3528.450677]  <TASK>
[ 3528.450802]  dump_stack_lvl+0x68/0x90
[ 3528.450940]  print_report+0x174/0x4f6
[ 3528.451074]  ? __virt_addr_valid+0x208/0x410
[ 3528.451205]  ? mutex_can_spin_on_owner+0x18b/0x1b0
[ 3528.451337]  kasan_report+0xb9/0x190
[ 3528.451469]  ? mutex_can_spin_on_owner+0x18b/0x1b0
[ 3528.451606]  mutex_can_spin_on_owner+0x18b/0x1b0
[ 3528.451737]  __mutex_lock+0x1e3/0x1010
[ 3528.451871]  ? kvm_arch_vcpu_postcreate+0x3c/0x150 [kvm]
[ 3528.452321]  ? __pfx___mutex_lock+0x10/0x10
[ 3528.452456]  ? __pfx_lock_release+0x10/0x10
[ 3528.452642]  ? __pfx___mutex_unlock_slowpath+0x10/0x10
[ 3528.452794]  ? __pfx_do_raw_spin_lock+0x10/0x10
[ 3528.452928]  ? kvm_arch_vcpu_postcreate+0x3c/0x150 [kvm]
[ 3528.453303]  kvm_arch_vcpu_postcreate+0x3c/0x150 [kvm]
[ 3528.453663]  kvm_vm_ioctl+0x1b73/0x21b0 [kvm]
[ 3528.454025]  ? mark_lock+0xe2/0x1530
[ 3528.454160]  ? __pfx_kvm_vm_ioctl+0x10/0x10 [kvm]
[ 3528.454543]  ? __pfx_mark_lock+0x10/0x10
[ 3528.454686]  ? __pfx_lock_release+0x10/0x10
[ 3528.454826]  ? schedule+0xe8/0x3b0
[ 3528.454970]  ? __lock_acquire+0xd68/0x5e20
[ 3528.455114]  ? futex_wait_setup+0xb2/0x190
[ 3528.455252]  ? __entry_text_end+0x1543/0x10260d
[ 3528.455385]  ? __pfx___lock_acquire+0x10/0x10
[ 3528.455542]  ? __pfx_futex_wake_mark+0x10/0x10
[ 3528.455676]  ? __pfx_do_vfs_ioctl+0x10/0x10
[ 3528.455826]  ? find_held_lock+0x2d/0x110
[ 3528.455959]  ? lock_release+0x44b/0x770
[ 3528.456090]  ? __pfx_futex_wait+0x10/0x10
[ 3528.456222]  ? __pfx_ioctl_has_perm.constprop.0.isra.0+0x10/0x10
[ 3528.456368]  ? __fget_files+0x1d6/0x340
[ 3528.456508]  __x64_sys_ioctl+0x130/0x1a0
[ 3528.456641]  do_syscall_64+0x93/0x180
[ 3528.456772]  ? lockdep_hardirqs_on_prepare+0x16d/0x400
[ 3528.456907]  ? do_syscall_64+0x9f/0x180
[ 3528.457034]  ? lockdep_hardirqs_on+0x78/0x100
[ 3528.457164]  ? do_syscall_64+0x9f/0x180
[ 3528.457292]  ? lock_release+0x44b/0x770
[ 3528.457425]  ? __pfx_lock_release+0x10/0x10
[ 3528.457560]  ? lockdep_hardirqs_on_prepare+0x16d/0x400
[ 3528.457694]  ? do_syscall_64+0x9f/0x180
[ 3528.457821]  ? lockdep_hardirqs_on+0x78/0x100
[ 3528.457950]  ? do_syscall_64+0x9f/0x180
[ 3528.458081]  ? clear_bhb_loop+0x45/0xa0
[ 3528.458210]  ? clear_bhb_loop+0x45/0xa0
[ 3528.458341]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 3528.458475] RIP: 0033:0x7f2457d4ed2d
[ 3528.458609] Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
[ 3528.458783] RSP: 002b:00007ffd616de5d0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 3528.458927] RAX: ffffffffffffffda RBX: 00007ffd616de778 RCX: 00007f2457d4ed2d
[ 3528.459062] RDX: 0000000000000000 RSI: 000000000000ae41 RDI: 0000000000000004
[ 3528.459195] RBP: 00007ffd616de620 R08: 00000000004040c0 R09: 0000000000000001
[ 3528.459327] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[ 3528.459459] R13: 0000000000000000 R14: 00007f2457e77000 R15: 0000000000403e00
[ 3528.459604]  </TASK>

[ 3528.459843] Allocated by task 1240:
[ 3528.459968]  kasan_save_stack+0x1e/0x40
[ 3528.460100]  kasan_save_track+0x10/0x30
[ 3528.460230]  __kasan_slab_alloc+0x85/0x90
[ 3528.460357]  kmem_cache_alloc_node_noprof+0x12c/0x360
[ 3528.460488]  copy_process+0x372/0x8470
[ 3528.460617]  kernel_clone+0xa6/0x620
[ 3528.460744]  __do_sys_clone3+0x109/0x140
[ 3528.460873]  do_syscall_64+0x93/0x180
[ 3528.460998]  entry_SYSCALL_64_after_hwframe+0x76/0x7e

[ 3528.461246] Freed by task 0:
[ 3528.461368]  kasan_save_stack+0x1e/0x40
[ 3528.461499]  kasan_save_track+0x10/0x30
[ 3528.461628]  kasan_save_free_info+0x37/0x70
[ 3528.461757]  poison_slab_object+0x109/0x180
[ 3528.461875]  __kasan_slab_free+0x2e/0x50
[ 3528.461988]  kmem_cache_free+0x17d/0x450
[ 3528.462108]  delayed_put_task_struct+0x16a/0x1f0
[ 3528.462226]  rcu_do_batch+0x368/0xd50
[ 3528.462342]  rcu_core+0x6d5/0xb60
[ 3528.462458]  handle_softirqs+0x1b4/0x770
[ 3528.462572]  __irq_exit_rcu+0xbb/0x1c0
[ 3528.462683]  irq_exit_rcu+0xa/0x30
[ 3528.462786]  sysvec_apic_timer_interrupt+0x9d/0xc0
[ 3528.462891]  asm_sysvec_apic_timer_interrupt+0x16/0x20

[ 3528.463095] Last potentially related work creation:
[ 3528.463198]  kasan_save_stack+0x1e/0x40
[ 3528.463305]  __kasan_record_aux_stack+0xad/0xc0
[ 3528.463411]  __call_rcu_common.constprop.0+0xae/0xe80
[ 3528.463519]  __schedule+0xfd8/0x5ee0
[ 3528.463622]  schedule_idle+0x52/0x80
[ 3528.463725]  do_idle+0x25e/0x3d0
[ 3528.463828]  cpu_startup_entry+0x50/0x60
[ 3528.463925]  start_secondary+0x201/0x280
[ 3528.464023]  common_startup_64+0x13e/0x141

[ 3528.464209] Second to last potentially related work creation:
[ 3528.464306]  kasan_save_stack+0x1e/0x40
[ 3528.464396]  __kasan_record_aux_stack+0xad/0xc0
[ 3528.464485]  task_work_add+0x1bd/0x270
[ 3528.464574]  sched_tick+0x2c0/0x9d0
[ 3528.464662]  update_process_times+0xd5/0x130
[ 3528.464753]  tick_nohz_handler+0x1ae/0x4a0
[ 3528.464839]  __hrtimer_run_queues+0x164/0x880
[ 3528.464923]  hrtimer_interrupt+0x2f1/0x7f0
[ 3528.465007]  __sysvec_apic_timer_interrupt+0xfd/0x3d0
[ 3528.465091]  sysvec_apic_timer_interrupt+0x98/0xc0
[ 3528.465173]  asm_sysvec_apic_timer_interrupt+0x16/0x20

[ 3528.465352] The buggy address belongs to the object at ffff88814a040000
                which belongs to the cache task_struct of size 13128
[ 3528.465457] The buggy address is located 52 bytes inside of
                freed 13128-byte region [ffff88814a040000, ffff88814a043348)

[ 3528.465629] The buggy address belongs to the physical page:
[ 3528.465712] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14a040
[ 3528.465802] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[ 3528.465888] memcg:ffff88812c785601
[ 3528.465963] flags: 0x17ffffc0000040(head|node=0|zone=2|lastcpupid=0x1fffff)
[ 3528.466045] page_type: 0xfdffffff(slab)
[ 3528.466120] raw: 0017ffffc0000040 ffff8881002cea00 ffffea00041e4000 dead000000000004
[ 3528.466198] raw: 0000000000000000 0000000080020002 00000001fdffffff ffff88812c785601
[ 3528.466275] head: 0017ffffc0000040 ffff8881002cea00 ffffea00041e4000 dead000000000004
[ 3528.466351] head: 0000000000000000 0000000080020002 00000001fdffffff ffff88812c785601
[ 3528.466428] head: 0017ffffc0000003 ffffea0005281001 ffffffffffffffff 0000000000000000
[ 3528.466504] head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000
[ 3528.466579] page dumped because: kasan: bad access detected

[ 3528.466717] Memory state around the buggy address:
[ 3528.466789]  ffff88814a03ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 3528.466860]  ffff88814a03ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 3528.466931] >ffff88814a040000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 3528.467001]                                      ^
[ 3528.467069]  ffff88814a040080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 3528.467135]  ffff88814a040100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 3528.467201] ==================================================================
Sean Christopherson Aug. 5, 2024, 10:56 p.m. UTC | #9
On Sun, Aug 04, 2024, Michal Luczaj wrote:
> On 8/1/24 14:41, Will Deacon wrote:
> > On Wed, Jul 31, 2024 at 09:18:56AM -0700, Sean Christopherson wrote:
> >> [...]
> >> Ya, the basic problem is that we have two ways of publishing the vCPU, fd and
> >> vcpu_array, with no way of setting both atomically.  Given that xa_store() should
> >> never fail, I vote we do the simple thing and deliberately leak the memory.
> > 
> > I'm inclined to agree. This conversation did momentarily get me worried
> > about the window between the successful create_vcpu_fd() and the
> > xa_store(), but it looks like 'kvm->online_vcpus' protects that.
> > 
> > I'll spin a v2 leaking the vCPU, then.
> 
> But perhaps you're right. The window you've described may be an issue.
> For example:
> 
> static u64 get_time_ref_counter(struct kvm *kvm)
> {
> 	...
> 	vcpu = kvm_get_vcpu(kvm, 0); // may still be NULL
> 	tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> 	return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
> 		+ hv->tsc_ref.tsc_offset;
> }
> 
> u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
> {
> 	return vcpu->arch.l1_tsc_offset +
> 		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
> }
> 
> After stuffing msleep() between fd install and vcpu_array store:
> 
> [  125.296110] BUG: kernel NULL pointer dereference, address: 0000000000000b38
> [  125.296203] #PF: supervisor read access in kernel mode
> [  125.296266] #PF: error_code(0x0000) - not-present page
> [  125.296327] PGD 12539e067 P4D 12539e067 PUD 12539d067 PMD 0
> [  125.296392] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
> [  125.296454] CPU: 12 UID: 1000 PID: 1179 Comm: a.out Not tainted 6.11.0-rc1nokasan+ #19
> [  125.296521] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
> [  125.296585] RIP: 0010:kvm_read_l1_tsc+0x6/0x50 [kvm]
> [  125.297376] Call Trace:
> [  125.297430]  <TASK>
> [  125.297919]  get_time_ref_counter+0x70/0x90 [kvm]
> [  125.298039]  kvm_hv_get_msr_common+0xc1/0x7d0 [kvm]
> [  125.298150]  __kvm_get_msr+0x72/0xf0 [kvm]
> [  125.298421]  do_get_msr+0x16/0x50 [kvm]
> [  125.298531]  msr_io+0x9d/0x110 [kvm]
> [  125.298626]  kvm_arch_vcpu_ioctl+0xdc5/0x19c0 [kvm]
> [  125.299345]  kvm_vcpu_ioctl+0x6cc/0x920 [kvm]
> [  125.299540]  __x64_sys_ioctl+0x90/0xd0
> [  125.299582]  do_syscall_64+0x93/0x180
> [  125.300206]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [  125.300243] RIP: 0033:0x7f2d64aded2d
> 
> So, is get_time_ref_counter() broken (with a trivial fix) or should it be
> considered a regression after commit afb2acb2e3a3
> ("KVM: Fix vcpu_array[0] races")?

The latter, though arguably afb2acb2e3a3 isn't really a regression since it
essentially just reverts back to the pre-Xarray code, i.e. the bug was always
there, it was just temporarily masked by a worse bug.

I don't think we want to go down the path of declaring get_time_ref_counter()
broken, because that is going to result in an impossible programming model.

Ha!  We can kill two birds with one stone.  If we take vcpu->mutex before installing
the file descriptor, and hold it until online_vcpus is bumped, userspace can't
invoke vCPU ioctls until the vCPU is fully online.

Argh, so close, kvm_arch_vcpu_async_ioctl() throws a wrench in that idea.  Double
argh, whether or not an ioctl is async is buried in arch code.

I still think it makes sense to grab vcpu->mutex for synchronous ioctls.  That
way there's no visible change to userspace, and we can lean on that code to reject
the async ioctls, as I can't imagine there's a practical use case for emitting
an async ioctl without first doing a synchronous ioctl.  E.g. in addition to the
below patch, plus changes to add kvm_arch_is_async_vcpu_ioctl():

	/*
	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
	 * execution; mutex_lock() would break them.  Disallow asynchronous
	 * ioctls until the vCPU is fully online.  This can only happen if
	 * userspace has *never* done a synchronous ioctl, as acquiring the
	 * vCPU's mutex ensures the vCPU is online, i.e. isn't a restriction
	 * for any practical use case.
	 */
	if (kvm_arch_is_async_vcpu_ioctl(ioctl)) {
		if (vcpu->vcpu_idx >= atomic_read(&kvm->online_vcpus))
			return -EINVAL;
		return kvm_vcpu_async_ioctl(filp, ioctl, arg);
	}

Alternatively, we could go for the super simple change and cross our fingers that
no "real" VMM emits vCPU ioctls before KVM_CREATE_VCPU returns.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d0788d0a72cc..9ae9022a015f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4450,6 +4450,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
        if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
                return -EINVAL;
 
+       if (unlikely(vcpu->vcpu_idx >= atomic_read(&kvm->online_vcpus)))
+               return -EINVAL;
+
        /*
         * Some architectures have vcpu ioctls that are asynchronous to vcpu
         * execution; mutex_lock() would break them.



The mutex approach, sans async ioctl support:

---
 virt/kvm/kvm_main.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d0788d0a72cc..0a9c390b18a3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4269,12 +4269,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 
 	mutex_lock(&kvm->lock);
 
-#ifdef CONFIG_LOCKDEP
-	/* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
-	mutex_lock(&vcpu->mutex);
-	mutex_unlock(&vcpu->mutex);
-#endif
-
 	if (kvm_get_vcpu_by_id(kvm, id)) {
 		r = -EEXIST;
 		goto unlock_vcpu_destroy;
@@ -4285,15 +4279,29 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	if (r)
 		goto unlock_vcpu_destroy;
 
-	/* Now it's all set up, let userspace reach it */
+	/*
+	 * Now it's all set up, let userspace reach it.  Grab the vCPU's mutex
+	 * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
+	 * visibile (per online_vcpus), e.g. so that KVM doesn't get tricked
+	 * into a NULL-pointer dereference because KVM thinks the _current_
+	 * vCPU doesn't exist.  As a bonus, taking vcpu->mutex ensures lockdep
+	 * knows it's taken *inside* kvm->lock.
+	 */
+	mutex_lock(&vcpu->mutex);
 	kvm_get_kvm(kvm);
 	r = create_vcpu_fd(vcpu);
 	if (r < 0)
 		goto kvm_put_xa_release;
 
+	/*
+	 * xa_store() should never fail, see xa_reserve() above.  Leak the vCPU
+	 * if the impossible happens, as userspace already has access to the
+	 * vCPU, i.e. freeing the vCPU before userspace puts its file reference
+	 * would trigger a use-after-free.
+	 */
 	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
-		r = -EINVAL;
-		goto kvm_put_xa_release;
+		mutex_unlock(&vcpu->mutex);
+		return -EINVAL;
 	}
 
 	/*
@@ -4302,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	 */
 	smp_wmb();
 	atomic_inc(&kvm->online_vcpus);
+	mutex_unlock(&vcpu->mutex);
 
 	mutex_unlock(&kvm->lock);
 	kvm_arch_vcpu_postcreate(vcpu);
@@ -4309,6 +4318,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	return r;
 
 kvm_put_xa_release:
+	mutex_unlock(&vcpu->mutex);
 	kvm_put_kvm_no_destroy(kvm);
 	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
 unlock_vcpu_destroy:

base-commit: 332d2c1d713e232e163386c35a3ba0c1b90df83f
--
Paolo Bonzini Aug. 5, 2024, 11:02 p.m. UTC | #10
On Tue, Aug 6, 2024 at 12:56 AM Sean Christopherson <seanjc@google.com> wrote:
> +       if (unlikely(vcpu->vcpu_idx >= atomic_read(&kvm->online_vcpus)))
> +               return -EINVAL;

+1 to having the test _somewhere_ for async ioctls, there's so much
that can go wrong if a vcpu is not reachable by for_each_vcpu.

>         /*
>          * Some architectures have vcpu ioctls that are asynchronous to vcpu
>          * execution; mutex_lock() would break them.
>
> The mutex approach, sans async ioctl support:

Async ioctls can be handled as you suggested above by checking
vcpu_idx against online_vcpus. This mutex approach also removes the
funky lock/unlock to inform lockdep, which is a nice plus.

Paolo

> ---
>  virt/kvm/kvm_main.c | 28 +++++++++++++++++++---------
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d0788d0a72cc..0a9c390b18a3 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4269,12 +4269,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>
>         mutex_lock(&kvm->lock);
>
> -#ifdef CONFIG_LOCKDEP
> -       /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
> -       mutex_lock(&vcpu->mutex);
> -       mutex_unlock(&vcpu->mutex);
> -#endif
> -
>         if (kvm_get_vcpu_by_id(kvm, id)) {
>                 r = -EEXIST;
>                 goto unlock_vcpu_destroy;
> @@ -4285,15 +4279,29 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>         if (r)
>                 goto unlock_vcpu_destroy;
>
> -       /* Now it's all set up, let userspace reach it */
> +       /*
> +        * Now it's all set up, let userspace reach it.  Grab the vCPU's mutex
> +        * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
> +        * visible (per online_vcpus), e.g. so that KVM doesn't get tricked
> +        * into a NULL-pointer dereference because KVM thinks the _current_
> +        * vCPU doesn't exist.  As a bonus, taking vcpu->mutex ensures lockdep
> +        * knows it's taken *inside* kvm->lock.
> +        */
> +       mutex_lock(&vcpu->mutex);
>         kvm_get_kvm(kvm);
>         r = create_vcpu_fd(vcpu);
>         if (r < 0)
>                 goto kvm_put_xa_release;
>
> +       /*
> +        * xa_store() should never fail, see xa_reserve() above.  Leak the vCPU
> +        * if the impossible happens, as userspace already has access to the
> +        * vCPU, i.e. freeing the vCPU before userspace puts its file reference
> +        * would trigger a use-after-free.
> +        */
>         if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
> -               r = -EINVAL;
> -               goto kvm_put_xa_release;
> +               mutex_unlock(&vcpu->mutex);
> +               return -EINVAL;
>         }
>
>         /*
> @@ -4302,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>          */
>         smp_wmb();
>         atomic_inc(&kvm->online_vcpus);
> +       mutex_unlock(&vcpu->mutex);
>
>         mutex_unlock(&kvm->lock);
>         kvm_arch_vcpu_postcreate(vcpu);
> @@ -4309,6 +4318,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>         return r;
>
>  kvm_put_xa_release:
> +       mutex_unlock(&vcpu->mutex);
>         kvm_put_kvm_no_destroy(kvm);
>         xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
>  unlock_vcpu_destroy:
>
> base-commit: 332d2c1d713e232e163386c35a3ba0c1b90df83f
> --
>
Michal Luczaj Aug. 6, 2024, 4:59 p.m. UTC | #11
On 8/6/24 00:56, Sean Christopherson wrote:
> [...]
> +	/*
> +	 * xa_store() should never fail, see xa_reserve() above.  Leak the vCPU
> +	 * if the impossible happens, as userspace already has access to the
> +	 * vCPU, i.e. freeing the vCPU before userspace puts its file reference
> +	 * would trigger a use-after-free.
> +	 */
>  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
> -		r = -EINVAL;
> -		goto kvm_put_xa_release;
> +		mutex_unlock(&vcpu->mutex);
> +		return -EINVAL;
>  	}
>  
>  	/*
> @@ -4302,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>  	 */
>  	smp_wmb();
>  	atomic_inc(&kvm->online_vcpus);
> +	mutex_unlock(&vcpu->mutex);
>  
>  	mutex_unlock(&kvm->lock);
>  	kvm_arch_vcpu_postcreate(vcpu);
> @@ -4309,6 +4318,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
>  	return r;
>  
>  kvm_put_xa_release:
> +	mutex_unlock(&vcpu->mutex);
>  	kvm_put_kvm_no_destroy(kvm);
>  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);

Since we're handling the impossible, isn't the BUG_ON part missing
mutex_unlock(&kvm->lock)?
Sean Christopherson Aug. 7, 2024, 9:58 p.m. UTC | #12
On Tue, Aug 06, 2024, Michal Luczaj wrote:
> On 8/6/24 00:56, Sean Christopherson wrote:
> > [...]
> > +	/*
> > +	 * xa_store() should never fail, see xa_reserve() above.  Leak the vCPU
> > +	 * if the impossible happens, as userspace already has access to the
> > +	 * vCPU, i.e. freeing the vCPU before userspace puts its file reference
> > +	 * would trigger a use-after-free.
> > +	 */
> >  	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
> > -		r = -EINVAL;
> > -		goto kvm_put_xa_release;
> > +		mutex_unlock(&vcpu->mutex);
> > +		return -EINVAL;
> >  	}
> >  
> >  	/*
> > @@ -4302,6 +4310,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> >  	 */
> >  	smp_wmb();
> >  	atomic_inc(&kvm->online_vcpus);
> > +	mutex_unlock(&vcpu->mutex);
> >  
> >  	mutex_unlock(&kvm->lock);
> >  	kvm_arch_vcpu_postcreate(vcpu);
> > @@ -4309,6 +4318,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
> >  	return r;
> >  
> >  kvm_put_xa_release:
> > +	mutex_unlock(&vcpu->mutex);
> >  	kvm_put_kvm_no_destroy(kvm);
> >  	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
> 
> Since we're handling the impossible, isn't the BUG_ON part missing
> mutex_unlock(&kvm->lock)?

Doh, yes.
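
Presumably the branch then becomes something like (a sketch, releasing both
locks before bailing):

	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
		/* Deliberately leak the vCPU, see the comment above. */
		mutex_unlock(&vcpu->mutex);
		mutex_unlock(&kvm->lock);
		return -EINVAL;
	}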

Patch

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d0788d0a72cc..b80dd8cead8c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4293,7 +4293,7 @@  static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 
 	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
 		r = -EINVAL;
-		goto kvm_put_xa_release;
+		goto err_xa_release;
 	}
 
 	/*
@@ -4310,6 +4310,7 @@  static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 
 kvm_put_xa_release:
 	kvm_put_kvm_no_destroy(kvm);
+err_xa_release:
 	xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
 unlock_vcpu_destroy:
 	mutex_unlock(&kvm->lock);