mbox series

[PATCHv3,0/2]

Message ID 20250227230631.303431-1-kbusch@meta.com (mailing list archive)
Headers show
Series [PATCHv3,1/2] vhost: return task creation error instead of NULL | expand

Message

Keith Busch Feb. 27, 2025, 11:06 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

changes from v2:

  Fixed up the logical error in vhost on the new failure criteria

Keith Busch (1):
  vhost: return task creation error instead of NULL

Sean Christopherson (1):
  kvm: retry nx_huge_page_recovery_thread creation

 arch/x86/kvm/mmu/mmu.c    | 12 +++++-------
 drivers/vhost/vhost.c     |  2 +-
 include/linux/call_once.h | 16 +++++++++++-----
 kernel/vhost_task.c       |  4 ++--
 4 files changed, 19 insertions(+), 15 deletions(-)

Comments

Lei Yang Feb. 28, 2025, 8:07 a.m. UTC | #1
Hi Keith

V3 introduced a new bug, the following error messages from qemu output
after applying this patch to boot up a guest.
Error messages:
error: kvm run failed Invalid argument
error: kvm run failed Invalid argument
EAX=00000000 EBX=00000000 ECX=00000000 EDX=000806f4
ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000
EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 00000000 0000ffff 00009300
CS =f000 ffff0000 0000ffff 00009b00
SS =0000 00000000 0000ffff 00009300
DS =0000 00000000 0000ffff 00009300
FS =0000 00000000 0000ffff 00009300
GS =0000 00000000 0000ffff 00009300
LDT=0000 00000000 0000ffff 00008200error: kvm run failed Invalid argument

TR =0000 00000000 0000ffff 00008b00
GDT=     00000000 0000ffff
IDT=     00000000 0000ffff
CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000
DR3=0000000000000000
DR6=00000000ffff0ff0 DR7=0000000000000400
EFER=0000000000000000
Code=c5 5a 08 2d 00 00 00 00 00 00 00 00 00 00 00 00 56 54 46 00 <0f>
20 c0 a8 01 74 05 e9 2c ff ff ff e9 11 ff 90 00 00 00 00 00 00 00 00
00 00 00 00 00 00
EAX=00000000 EBX=00000000 ECX=00000000 EDX=000806f4
ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000
EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 00000000 0000ffff 00009300
CS =f000 ffff0000 0000ffff 00009b00
SS =0000 00000000 0000ffff 00009300
DS =0000 00000000 0000ffff 00009300
FS =0000 00000000 0000ffff 00009300
GS =0000 00000000 0000ffff 00009300
LDT=0000 00000000 0000ffff 00008200
TR =0000 00000000 0000ffff 00008b00
GDT=     00000000 0000ffff
IDT=     00000000 0000ffff
CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000
DR3=0000000000000000
DR6=00000000ffff0ff0 DR7=0000000000000400
EFER=0000000000000000
Code=c5 5a 08 2d 00 00 00 00 00 00 00 00 00 00 00 00 56 54 46 00 <0f>
20 c0 a8 01 74 05 e9 2c ff ff ff e9 11 ff 90 00 00 00 00 00 00 00 00
00 00 00 00 00 00
EAX=00000000 EBX=00000000 ECX=00000000 EDX=000806f4
ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000
EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 00000000 0000ffff 00009300
CS =f000 ffff0000 0000ffff 00009b00
SS =0000 00000000 0000ffff 00009300
DS =0000 00000000 0000ffff 00009300
FS =0000 00000000 0000ffff 00009300
GS =0000 00000000 0000ffff 00009300
LDT=0000 00000000 0000ffff 00008200
TR =0000 00000000 0000ffff 00008b00
GDT=     00000000 0000ffff
IDT=     00000000 0000ffff
CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000
DR3=0000000000000000
DR6=00000000ffff0ff0 DR7=0000000000000400
EFER=0000000000000000
Code=c5 5a 08 2d 00 00 00 00 00 00 00 00 00 00 00 00 56 54 46 00 <0f>
20 c0 a8 01 74 05 e9 2c ff ff ff e9 11 ff 90 00 00 00 00 00 00 00 00
00 00 00 00 00 00

Thanks
Lei

On Fri, Feb 28, 2025 at 7:06 AM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> changes from v2:
>
>   Fixed up the logical error in vhost on the new failure criteria
>
> Keith Busch (1):
>   vhost: return task creation error instead of NULL
>
> Sean Christopherson (1):
>   kvm: retry nx_huge_page_recovery_thread creation
>
>  arch/x86/kvm/mmu/mmu.c    | 12 +++++-------
>  drivers/vhost/vhost.c     |  2 +-
>  include/linux/call_once.h | 16 +++++++++++-----
>  kernel/vhost_task.c       |  4 ++--
>  4 files changed, 19 insertions(+), 15 deletions(-)
>
> --
> 2.43.5
>
Sean Christopherson Feb. 28, 2025, 2:15 p.m. UTC | #2
On Fri, Feb 28, 2025, Lei Yang wrote:
> Hi Keith
> 
> V3 introduced a new bug, the following error messages from qemu output
> after applying this patch to boot up a guest.

Doh, my bug.  Not yet tested, but this should fix things.  Assuming it does, I'll
post a v3 so I can add my SoB.

diff --git a/include/linux/call_once.h b/include/linux/call_once.h
index ddcfd91493ea..b053f4701c94 100644
--- a/include/linux/call_once.h
+++ b/include/linux/call_once.h
@@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *))
                return 0;
 
         guard(mutex)(&once->lock);
-        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
-        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
+        if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING))
                 return -EINVAL;
 
+        if (atomic_read(&once->state) == ONCE_COMPLETED)
+                return 0;
+
         atomic_set(&once->state, ONCE_RUNNING);
        r = cb(once);
        if (r)
Sean Christopherson Feb. 28, 2025, 2:32 p.m. UTC | #3
On Fri, Feb 28, 2025, Sean Christopherson wrote:
> On Fri, Feb 28, 2025, Lei Yang wrote:
> > Hi Keith
> > 
> > V3 introduced a new bug, the following error messages from qemu output
> > after applying this patch to boot up a guest.
> 
> Doh, my bug.  Not yet tested, but this should fix things.  Assuming it does, I'll
> post a v3 so I can add my SoB.
         v4

Confirmed that it worked, but deleting the pre-mutex check for ONCE_COMPLETED.
Will post v4 later today.

> diff --git a/include/linux/call_once.h b/include/linux/call_once.h
> index ddcfd91493ea..b053f4701c94 100644
> --- a/include/linux/call_once.h
> +++ b/include/linux/call_once.h
> @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *))
>                 return 0;
>  
>          guard(mutex)(&once->lock);
> -        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
> -        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
> +        if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING))
>                  return -EINVAL;
>  
> +        if (atomic_read(&once->state) == ONCE_COMPLETED)
> +                return 0;
> +
>          atomic_set(&once->state, ONCE_RUNNING);
>         r = cb(once);
>         if (r)
Keith Busch Feb. 28, 2025, 2:58 p.m. UTC | #4
On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote:
> On Fri, Feb 28, 2025, Sean Christopherson wrote:
> > On Fri, Feb 28, 2025, Lei Yang wrote:
> > > Hi Keith
> > > 
> > > V3 introduced a new bug, the following error messages from qemu output
> > > after applying this patch to boot up a guest.
> > 
> > Doh, my bug.  Not yet tested, but this should fix things.  Assuming it does, I'll
> > post a v3 so I can add my SoB.
>          v4
> 
> Confirmed that it worked, but deleting the pre-mutex check for ONCE_COMPLETED.
> Will post v4 later today.
> 
> > diff --git a/include/linux/call_once.h b/include/linux/call_once.h
> > index ddcfd91493ea..b053f4701c94 100644
> > --- a/include/linux/call_once.h
> > +++ b/include/linux/call_once.h
> > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *))
> >                 return 0;
> >  
> >          guard(mutex)(&once->lock);
> > -        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
> > -        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
> > +        if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING))
> >                  return -EINVAL;
> >  
> > +        if (atomic_read(&once->state) == ONCE_COMPLETED)
> > +                return 0;
> > +
> >          atomic_set(&once->state, ONCE_RUNNING);
> >         r = cb(once);
> >         if (r)

Possible suggestion since it seems odd to do an atomic_read twice on the
same value. Maybe make this a switch:

	switch (atomic_read(&once->state) {
	case ONCE_NOT_STARTED:
		atomic_set(&once->state, ONCE_RUNNING);
		break;
	case ONCE_COMPLETED:
		return 0;
	case ONCE_RUNNING:
	default:
		WARN_ON(1);
		return -EINVAL;
	}
Sean Christopherson Feb. 28, 2025, 3:29 p.m. UTC | #5
On Fri, Feb 28, 2025, Keith Busch wrote:
> On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote:
> > > diff --git a/include/linux/call_once.h b/include/linux/call_once.h
> > > index ddcfd91493ea..b053f4701c94 100644
> > > --- a/include/linux/call_once.h
> > > +++ b/include/linux/call_once.h
> > > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *))
> > >                 return 0;
> > >  
> > >          guard(mutex)(&once->lock);
> > > -        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
> > > -        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
> > > +        if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING))
> > >                  return -EINVAL;
> > >  
> > > +        if (atomic_read(&once->state) == ONCE_COMPLETED)
> > > +                return 0;
> > > +
> > >          atomic_set(&once->state, ONCE_RUNNING);
> > >         r = cb(once);
> > >         if (r)
> 
> Possible suggestion since it seems odd to do an atomic_read twice on the
> same value.

Yeah, good call.  At the risk of getting too cute, how about this?

static inline int call_once(struct once *once, int (*cb)(struct once *))
{
	int r, state;

	/* Pairs with atomic_set_release() below.  */
	if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
		return 0;

	guard(mutex)(&once->lock);
	state = atomic_read(&once->state);
	if (unlikely(state != ONCE_NOT_STARTED))
		return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0;

	atomic_set(&once->state, ONCE_RUNNING);
	r = cb(once);
	if (r)
		atomic_set(&once->state, ONCE_NOT_STARTED);
	else
		atomic_set_release(&once->state, ONCE_COMPLETED);
	return r;
}
Keith Busch Feb. 28, 2025, 3:36 p.m. UTC | #6
On Fri, Feb 28, 2025 at 07:29:45AM -0800, Sean Christopherson wrote:
> On Fri, Feb 28, 2025, Keith Busch wrote:
> > On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote:
> > > > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *))
> > > >                 return 0;
> > > >  
> > > >          guard(mutex)(&once->lock);
> > > > -        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
> > > > -        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
> > > > +        if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING))
> > > >                  return -EINVAL;
> > > >  
> > > > +        if (atomic_read(&once->state) == ONCE_COMPLETED)
> > > > +                return 0;
> > > > +
> > > >          atomic_set(&once->state, ONCE_RUNNING);
> > > >         r = cb(once);
> > > >         if (r)
> > 
> > Possible suggestion since it seems odd to do an atomic_read twice on the
> > same value.
> 
> Yeah, good call.  At the risk of getting too cute, how about this?

Sure, that also looks good to me.
Paolo Bonzini Feb. 28, 2025, 4:43 p.m. UTC | #7
On 2/28/25 16:36, Keith Busch wrote:
> On Fri, Feb 28, 2025 at 07:29:45AM -0800, Sean Christopherson wrote:
>> On Fri, Feb 28, 2025, Keith Busch wrote:
>>> On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote:
>>>>> @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *))
>>>>>                  return 0;
>>>>>   
>>>>>           guard(mutex)(&once->lock);
>>>>> -        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
>>>>> -        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
>>>>> +        if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING))
>>>>>                   return -EINVAL;
>>>>>   
>>>>> +        if (atomic_read(&once->state) == ONCE_COMPLETED)
>>>>> +                return 0;
>>>>> +
>>>>>           atomic_set(&once->state, ONCE_RUNNING);
>>>>>          r = cb(once);
>>>>>          if (r)
>>>
>>> Possible suggestion since it seems odd to do an atomic_read twice on the
>>> same value.
>>
>> Yeah, good call.  At the risk of getting too cute, how about this?
> 
> Sure, that also looks good to me.

Just to overthink it a bit more, I'm changing "if (r)" to "if (r < 0)". 
Not because it's particularly useful to return a meaningful nonzero 
value on the first initialization, but more because 0+ for success and 
-errno for failure is a more common.

Queued with this change, thanks.

(Keith, I haven't forgotten about AVX by the way).

Paolo
Sean Christopherson Feb. 28, 2025, 5:03 p.m. UTC | #8
On Fri, Feb 28, 2025, Paolo Bonzini wrote:
> On 2/28/25 16:36, Keith Busch wrote:
> > On Fri, Feb 28, 2025 at 07:29:45AM -0800, Sean Christopherson wrote:
> > > On Fri, Feb 28, 2025, Keith Busch wrote:
> > > > On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote:
> > > > > > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *))
> > > > > >                  return 0;
> > > > > >           guard(mutex)(&once->lock);
> > > > > > -        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
> > > > > > -        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
> > > > > > +        if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING))
> > > > > >                   return -EINVAL;
> > > > > > +        if (atomic_read(&once->state) == ONCE_COMPLETED)
> > > > > > +                return 0;
> > > > > > +
> > > > > >           atomic_set(&once->state, ONCE_RUNNING);
> > > > > >          r = cb(once);
> > > > > >          if (r)
> > > > 
> > > > Possible suggestion since it seems odd to do an atomic_read twice on the
> > > > same value.
> > > 
> > > Yeah, good call.  At the risk of getting too cute, how about this?
> > 
> > Sure, that also looks good to me.
> 
> Just to overthink it a bit more, I'm changing "if (r)" to "if (r < 0)". Not
> because it's particularly useful to return a meaningful nonzero value on the
> first initialization, but more because 0+ for success and -errno for failure
> is a more common.
> 
> Queued with this change, thanks.

If it's not too late, the first patch can/should use ERR_CAST() instead of a
PTR_ERR() => ERR_PTR():

	tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
	if (IS_ERR(tsk)) {
		kfree(vtsk);
		return ERR_CAST(tsk);
	}

And I was going to get greedy and replace spaces with tabs in call_once.

The changelog for this patch is also misleading.  KVM_RUN doesn't currently return
-ERESTARTNOINTR, it only ever returns -ENOMEN.  copy_process() is what returns
-ERESTARTNOINTR.

I also think it's worth calling out that it's a non-fatal signal.

--
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 27 Feb 2025 15:06:31 -0800
Subject: [PATCH] KVM: x86/mmu: Allow retry of nx_huge_page_recovery_thread
 creation

A VMM may send a non-fatal signal to its threads, including vCPU tasks,
at any time, and thus may signal vCPU tasks during KVM_RUN.  If a vCPU
task receives the signal while its trying to spawn the huge page recovery
vhost task, then KVM_RUN will fail due to copy_process() returning
-ERESTARTNOINTR.

Rework call_once() to mark the call complete if and only if the called
function succeeds, and plumb the function's true error code back to the
call_once() invoker.  This provides userspace with the correct, non-fatal
error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows
subsequent KVM_RUN a succeed by virtue of retrying creation of the NX huge
page task.

Opportunistically replace spaces with tabs in call_once.h.

Fixes: 931656b9e2ff ("kvm: defer huge page recovery vhost task to later")
Co-developed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c    | 10 ++++------
 include/linux/call_once.h | 36 +++++++++++++++++++++---------------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 70af12b693a3..63bb77ee1bb1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7633,7 +7633,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
 	return true;
 }
 
-static void kvm_mmu_start_lpage_recovery(struct once *once)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
 {
 	struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
 	struct kvm *kvm = container_of(ka, struct kvm, arch);
@@ -7645,12 +7645,13 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
 				      kvm, "kvm-nx-lpage-recovery");
 
 	if (IS_ERR(nx_thread))
-		return;
+		return PTR_ERR(nx_thread);
 
 	vhost_task_start(nx_thread);
 
 	/* Make the task visible only once it is fully started. */
 	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+	return 0;
 }
 
 int kvm_mmu_post_init_vm(struct kvm *kvm)
@@ -7658,10 +7659,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
 	if (nx_hugepage_mitigation_hard_disabled)
 		return 0;
 
-	call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
-	if (!kvm->arch.nx_huge_page_recovery_thread)
-		return -ENOMEM;
-	return 0;
+	return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
 }
 
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
diff --git a/include/linux/call_once.h b/include/linux/call_once.h
index 6261aa0b3fb0..56cb9625b48b 100644
--- a/include/linux/call_once.h
+++ b/include/linux/call_once.h
@@ -9,15 +9,15 @@
 #define ONCE_COMPLETED   2
 
 struct once {
-        atomic_t state;
-        struct mutex lock;
+	atomic_t state;
+	struct mutex lock;
 };
 
 static inline void __once_init(struct once *once, const char *name,
 			       struct lock_class_key *key)
 {
-        atomic_set(&once->state, ONCE_NOT_STARTED);
-        __mutex_init(&once->lock, name, key);
+	atomic_set(&once->state, ONCE_NOT_STARTED);
+	__mutex_init(&once->lock, name, key);
 }
 
 #define once_init(once)							\
@@ -26,20 +26,26 @@ do {									\
 	__once_init((once), #once, &__key);				\
 } while (0)
 
-static inline void call_once(struct once *once, void (*cb)(struct once *))
+static inline int call_once(struct once *once, int (*cb)(struct once *))
 {
-        /* Pairs with atomic_set_release() below.  */
-        if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
-                return;
+	int r, state;
 
-        guard(mutex)(&once->lock);
-        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
-        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
-                return;
+	/* Pairs with atomic_set_release() below.  */
+	if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
+		return 0;
 
-        atomic_set(&once->state, ONCE_RUNNING);
-        cb(once);
-        atomic_set_release(&once->state, ONCE_COMPLETED);
+	guard(mutex)(&once->lock);
+	state = atomic_read(&once->state);
+	if (unlikely(state != ONCE_NOT_STARTED))
+		return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0;
+
+	atomic_set(&once->state, ONCE_RUNNING);
+	r = cb(once);
+	if (r)
+		atomic_set(&once->state, ONCE_NOT_STARTED);
+	else
+		atomic_set_release(&once->state, ONCE_COMPLETED);
+	return r;
 }
 
 #endif /* _LINUX_CALL_ONCE_H */

base-commit: 7d2322b0472eb402e8a206ba9b332b6c75f6f130
--