Message ID | 20250227230631.303431-1-kbusch@meta.com (mailing list archive) |
---|---|
Headers | show |
Series | [PATCHv3,1/2] vhost: return task creation error instead of NULL | expand |
Hi Keith V3 introduced a new bug, the following error messages from qemu output after applying this patch to boot up a guest. Error messages: error: kvm run failed Invalid argument error: kvm run failed Invalid argument EAX=00000000 EBX=00000000 ECX=00000000 EDX=000806f4 ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000 EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0000 00000000 0000ffff 00009300 CS =f000 ffff0000 0000ffff 00009b00 SS =0000 00000000 0000ffff 00009300 DS =0000 00000000 0000ffff 00009300 FS =0000 00000000 0000ffff 00009300 GS =0000 00000000 0000ffff 00009300 LDT=0000 00000000 0000ffff 00008200error: kvm run failed Invalid argument TR =0000 00000000 0000ffff 00008b00 GDT= 00000000 0000ffff IDT= 00000000 0000ffff CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 EFER=0000000000000000 Code=c5 5a 08 2d 00 00 00 00 00 00 00 00 00 00 00 00 56 54 46 00 <0f> 20 c0 a8 01 74 05 e9 2c ff ff ff e9 11 ff 90 00 00 00 00 00 00 00 00 00 00 00 00 00 00 EAX=00000000 EBX=00000000 ECX=00000000 EDX=000806f4 ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000 EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0000 00000000 0000ffff 00009300 CS =f000 ffff0000 0000ffff 00009b00 SS =0000 00000000 0000ffff 00009300 DS =0000 00000000 0000ffff 00009300 FS =0000 00000000 0000ffff 00009300 GS =0000 00000000 0000ffff 00009300 LDT=0000 00000000 0000ffff 00008200 TR =0000 00000000 0000ffff 00008b00 GDT= 00000000 0000ffff IDT= 00000000 0000ffff CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 EFER=0000000000000000 Code=c5 5a 08 2d 00 00 00 00 00 00 00 00 00 00 00 00 56 54 46 00 <0f> 20 c0 a8 01 74 05 e9 2c ff ff ff e9 11 ff 90 00 00 00 00 00 00 00 00 00 00 00 00 00 00 EAX=00000000 
EBX=00000000 ECX=00000000 EDX=000806f4 ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000 EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0000 00000000 0000ffff 00009300 CS =f000 ffff0000 0000ffff 00009b00 SS =0000 00000000 0000ffff 00009300 DS =0000 00000000 0000ffff 00009300 FS =0000 00000000 0000ffff 00009300 GS =0000 00000000 0000ffff 00009300 LDT=0000 00000000 0000ffff 00008200 TR =0000 00000000 0000ffff 00008b00 GDT= 00000000 0000ffff IDT= 00000000 0000ffff CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 EFER=0000000000000000 Code=c5 5a 08 2d 00 00 00 00 00 00 00 00 00 00 00 00 56 54 46 00 <0f> 20 c0 a8 01 74 05 e9 2c ff ff ff e9 11 ff 90 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Thanks Lei On Fri, Feb 28, 2025 at 7:06 AM Keith Busch <kbusch@meta.com> wrote: > > From: Keith Busch <kbusch@kernel.org> > > changes from v2: > > Fixed up the logical error in vhost on the new failure criteria > > Keith Busch (1): > vhost: return task creation error instead of NULL > > Sean Christopherson (1): > kvm: retry nx_huge_page_recovery_thread creation > > arch/x86/kvm/mmu/mmu.c | 12 +++++------- > drivers/vhost/vhost.c | 2 +- > include/linux/call_once.h | 16 +++++++++++----- > kernel/vhost_task.c | 4 ++-- > 4 files changed, 19 insertions(+), 15 deletions(-) > > -- > 2.43.5 >
On Fri, Feb 28, 2025, Lei Yang wrote: > Hi Keith > > V3 introduced a new bug, the following error messages from qemu output > after applying this patch to boot up a guest. Doh, my bug. Not yet tested, but this should fix things. Assuming it does, I'll post a v3 so I can add my SoB. diff --git a/include/linux/call_once.h b/include/linux/call_once.h index ddcfd91493ea..b053f4701c94 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *)) return 0; guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) + if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING)) return -EINVAL; + if (atomic_read(&once->state) == ONCE_COMPLETED) + return 0; + atomic_set(&once->state, ONCE_RUNNING); r = cb(once); if (r)
On Fri, Feb 28, 2025, Sean Christopherson wrote: > On Fri, Feb 28, 2025, Lei Yang wrote: > > Hi Keith > > > > V3 introduced a new bug, the following error messages from qemu output > > after applying this patch to boot up a guest. > > Doh, my bug. Not yet tested, but this should fix things. Assuming it does, I'll > post a v3 so I can add my SoB. v4 Confirmed that it worked, but deleting the pre-mutex check for ONCE_COMPLETED. Will post v4 later today. > diff --git a/include/linux/call_once.h b/include/linux/call_once.h > index ddcfd91493ea..b053f4701c94 100644 > --- a/include/linux/call_once.h > +++ b/include/linux/call_once.h > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *)) > return 0; > > guard(mutex)(&once->lock); > - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); > - if (atomic_read(&once->state) != ONCE_NOT_STARTED) > + if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING)) > return -EINVAL; > > + if (atomic_read(&once->state) == ONCE_COMPLETED) > + return 0; > + > atomic_set(&once->state, ONCE_RUNNING); > r = cb(once); > if (r)
On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote: > On Fri, Feb 28, 2025, Sean Christopherson wrote: > > On Fri, Feb 28, 2025, Lei Yang wrote: > > > Hi Keith > > > > > > V3 introduced a new bug, the following error messages from qemu output > > > after applying this patch to boot up a guest. > > > > Doh, my bug. Not yet tested, but this should fix things. Assuming it does, I'll > > post a v3 so I can add my SoB. > v4 > > Confirmed that it worked, but deleting the pre-mutex check for ONCE_COMPLETED. > Will post v4 later today. > > > diff --git a/include/linux/call_once.h b/include/linux/call_once.h > > index ddcfd91493ea..b053f4701c94 100644 > > --- a/include/linux/call_once.h > > +++ b/include/linux/call_once.h > > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *)) > > return 0; > > > > guard(mutex)(&once->lock); > > - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); > > - if (atomic_read(&once->state) != ONCE_NOT_STARTED) > > + if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING)) > > return -EINVAL; > > > > + if (atomic_read(&once->state) == ONCE_COMPLETED) > > + return 0; > > + > > atomic_set(&once->state, ONCE_RUNNING); > > r = cb(once); > > if (r) Possible suggestion since it seems odd to do an atomic_read twice on the same value. Maybe make this a switch: switch (atomic_read(&once->state)) { case ONCE_NOT_STARTED: atomic_set(&once->state, ONCE_RUNNING); break; case ONCE_COMPLETED: return 0; case ONCE_RUNNING: default: WARN_ON(1); return -EINVAL; }
On Fri, Feb 28, 2025, Keith Busch wrote: > On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote: > > > diff --git a/include/linux/call_once.h b/include/linux/call_once.h > > > index ddcfd91493ea..b053f4701c94 100644 > > > --- a/include/linux/call_once.h > > > +++ b/include/linux/call_once.h > > > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *)) > > > return 0; > > > > > > guard(mutex)(&once->lock); > > > - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); > > > - if (atomic_read(&once->state) != ONCE_NOT_STARTED) > > > + if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING)) > > > return -EINVAL; > > > > > > + if (atomic_read(&once->state) == ONCE_COMPLETED) > > > + return 0; > > > + > > > atomic_set(&once->state, ONCE_RUNNING); > > > r = cb(once); > > > if (r) > > Possible suggestion since it seems odd to do an atomic_read twice on the > same value. Yeah, good call. At the risk of getting too cute, how about this? static inline int call_once(struct once *once, int (*cb)(struct once *)) { int r, state; /* Pairs with atomic_set_release() below. */ if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) return 0; guard(mutex)(&once->lock); state = atomic_read(&once->state); if (unlikely(state != ONCE_NOT_STARTED)) return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0; atomic_set(&once->state, ONCE_RUNNING); r = cb(once); if (r) atomic_set(&once->state, ONCE_NOT_STARTED); else atomic_set_release(&once->state, ONCE_COMPLETED); return r; }
On Fri, Feb 28, 2025 at 07:29:45AM -0800, Sean Christopherson wrote: > On Fri, Feb 28, 2025, Keith Busch wrote: > > On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote: > > > > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *)) > > > > return 0; > > > > > > > > guard(mutex)(&once->lock); > > > > - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); > > > > - if (atomic_read(&once->state) != ONCE_NOT_STARTED) > > > > + if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING)) > > > > return -EINVAL; > > > > > > > > + if (atomic_read(&once->state) == ONCE_COMPLETED) > > > > + return 0; > > > > + > > > > atomic_set(&once->state, ONCE_RUNNING); > > > > r = cb(once); > > > > if (r) > > > > Possible suggestion since it seems odd to do an atomic_read twice on the > > same value. > > Yeah, good call. At the risk of getting too cute, how about this? Sure, that also looks good to me.
On 2/28/25 16:36, Keith Busch wrote: > On Fri, Feb 28, 2025 at 07:29:45AM -0800, Sean Christopherson wrote: >> On Fri, Feb 28, 2025, Keith Busch wrote: >>> On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote: >>>>> @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *)) >>>>> return 0; >>>>> >>>>> guard(mutex)(&once->lock); >>>>> - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); >>>>> - if (atomic_read(&once->state) != ONCE_NOT_STARTED) >>>>> + if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING)) >>>>> return -EINVAL; >>>>> >>>>> + if (atomic_read(&once->state) == ONCE_COMPLETED) >>>>> + return 0; >>>>> + >>>>> atomic_set(&once->state, ONCE_RUNNING); >>>>> r = cb(once); >>>>> if (r) >>> >>> Possible suggestion since it seems odd to do an atomic_read twice on the >>> same value. >> >> Yeah, good call. At the risk of getting too cute, how about this? > > Sure, that also looks good to me. Just to overthink it a bit more, I'm changing "if (r)" to "if (r < 0)". Not because it's particularly useful to return a meaningful nonzero value on the first initialization, but more because 0+ for success and -errno for failure is more common. Queued with this change, thanks. (Keith, I haven't forgotten about AVX by the way). Paolo
On Fri, Feb 28, 2025, Paolo Bonzini wrote: > On 2/28/25 16:36, Keith Busch wrote: > > On Fri, Feb 28, 2025 at 07:29:45AM -0800, Sean Christopherson wrote: > > > On Fri, Feb 28, 2025, Keith Busch wrote: > > > > On Fri, Feb 28, 2025 at 06:32:47AM -0800, Sean Christopherson wrote: > > > > > > @@ -35,10 +35,12 @@ static inline int call_once(struct once *once, int (*cb)(struct once *)) > > > > > > return 0; > > > > > > guard(mutex)(&once->lock); > > > > > > - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); > > > > > > - if (atomic_read(&once->state) != ONCE_NOT_STARTED) > > > > > > + if (WARN_ON(atomic_read(&once->state) == ONCE_RUNNING)) > > > > > > return -EINVAL; > > > > > > + if (atomic_read(&once->state) == ONCE_COMPLETED) > > > > > > + return 0; > > > > > > + > > > > > > atomic_set(&once->state, ONCE_RUNNING); > > > > > > r = cb(once); > > > > > > if (r) > > > > > > > > Possible suggestion since it seems odd to do an atomic_read twice on the > > > > same value. > > > > > > Yeah, good call. At the risk of getting too cute, how about this? > > > > Sure, that also looks good to me. > > Just to overthink it a bit more, I'm changing "if (r)" to "if (r < 0)". Not > because it's particularly useful to return a meaningful nonzero value on the > first initialization, but more because 0+ for success and -errno for failure > is a more common. > > Queued with this change, thanks. If it's not too late, the first patch can/should use ERR_CAST() instead of a PTR_ERR() => ERR_PTR(): tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); return ERR_CAST(tsk); } And I was going to get greedy and replace spaces with tabs in call_once. The changelog for this patch is also misleading. KVM_RUN doesn't currently return -ERESTARTNOINTR, it only ever returns -ENOMEM. copy_process() is what returns -ERESTARTNOINTR. I also think it's worth calling out that it's a non-fatal signal. 
-- From: Sean Christopherson <seanjc@google.com> Date: Thu, 27 Feb 2025 15:06:31 -0800 Subject: [PATCH] KVM: x86/mmu: Allow retry of nx_huge_page_recovery_thread creation A VMM may send a non-fatal signal to its threads, including vCPU tasks, at any time, and thus may signal vCPU tasks during KVM_RUN. If a vCPU task receives the signal while it's trying to spawn the huge page recovery vhost task, then KVM_RUN will fail due to copy_process() returning -ERESTARTNOINTR. Rework call_once() to mark the call complete if and only if the called function succeeds, and plumb the function's true error code back to the call_once() invoker. This provides userspace with the correct, non-fatal error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows a subsequent KVM_RUN to succeed by virtue of retrying creation of the NX huge page task. Opportunistically replace spaces with tabs in call_once.h. Fixes: 931656b9e2ff ("kvm: defer huge page recovery vhost task to later") Co-developed-by: Keith Busch <kbusch@kernel.org> Signed-off-by: Keith Busch <kbusch@kernel.org> Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 10 ++++------ include/linux/call_once.h | 36 +++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 70af12b693a3..63bb77ee1bb1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7633,7 +7633,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); @@ -7645,12 +7645,13 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm, "kvm-nx-lpage-recovery"); if (IS_ERR(nx_thread)) - return; + return PTR_ERR(nx_thread); vhost_task_start(nx_thread); 
/* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7658,10 +7659,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) diff --git a/include/linux/call_once.h b/include/linux/call_once.h index 6261aa0b3fb0..56cb9625b48b 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -9,15 +9,15 @@ #define ONCE_COMPLETED 2 struct once { - atomic_t state; - struct mutex lock; + atomic_t state; + struct mutex lock; }; static inline void __once_init(struct once *once, const char *name, struct lock_class_key *key) { - atomic_set(&once->state, ONCE_NOT_STARTED); - __mutex_init(&once->lock, name, key); + atomic_set(&once->state, ONCE_NOT_STARTED); + __mutex_init(&once->lock, name, key); } #define once_init(once) \ @@ -26,20 +26,26 @@ do { \ __once_init((once), #once, &__key); \ } while (0) -static inline void call_once(struct once *once, void (*cb)(struct once *)) +static inline int call_once(struct once *once, int (*cb)(struct once *)) { - /* Pairs with atomic_set_release() below. */ - if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) - return; + int r, state; - guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) - return; + /* Pairs with atomic_set_release() below. 
*/ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return 0; - atomic_set(&once->state, ONCE_RUNNING); - cb(once); - atomic_set_release(&once->state, ONCE_COMPLETED); + guard(mutex)(&once->lock); + state = atomic_read(&once->state); + if (unlikely(state != ONCE_NOT_STARTED)) + return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0; + + atomic_set(&once->state, ONCE_RUNNING); + r = cb(once); + if (r) + atomic_set(&once->state, ONCE_NOT_STARTED); + else + atomic_set_release(&once->state, ONCE_COMPLETED); + return r; } #endif /* _LINUX_CALL_ONCE_H */ base-commit: 7d2322b0472eb402e8a206ba9b332b6c75f6f130 --
From: Keith Busch <kbusch@kernel.org> changes from v2: Fixed up the logical error in vhost on the new failure criteria Keith Busch (1): vhost: return task creation error instead of NULL Sean Christopherson (1): kvm: retry nx_huge_page_recovery_thread creation arch/x86/kvm/mmu/mmu.c | 12 +++++------- drivers/vhost/vhost.c | 2 +- include/linux/call_once.h | 16 +++++++++++----- kernel/vhost_task.c | 4 ++-- 4 files changed, 19 insertions(+), 15 deletions(-)