[3/5] KVM: Add a common kvm_run flag to communicate an exit needs completion

Message ID 20250111012450.1262638-4-seanjc@google.com (mailing list archive)
State New, archived
Series KVM: Add a kvm_run flag to signal need for completion

Commit Message

Sean Christopherson Jan. 11, 2025, 1:24 a.m. UTC
Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
that KVM_RUN needs to be re-executed prior to save/restore in order to
complete the instruction/operation that triggered the userspace exit.

KVM's current approach of adding notes in the Documentation is beyond
brittle, e.g. there is at least one known case where a KVM developer added
a new userspace exit type, and then that same developer forgot to handle
completion when adding userspace support.

On x86, there are multiple exits that need completion, but they are all
conveniently funneled through a single callback, i.e. in theory, this is a
one-time thing for KVM x86 (and other architectures could follow suit with
additional refactoring).

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 Documentation/virt/kvm/api.rst    | 48 ++++++++++++++++++++++---------
 arch/powerpc/kvm/book3s_emulate.c |  1 +
 arch/powerpc/kvm/book3s_hv.c      |  1 +
 arch/powerpc/kvm/book3s_pr.c      |  2 ++
 arch/powerpc/kvm/booke.c          |  1 +
 arch/x86/include/uapi/asm/kvm.h   |  7 +++--
 arch/x86/kvm/x86.c                |  2 ++
 include/uapi/linux/kvm.h          |  3 ++
 virt/kvm/kvm_main.c               |  1 +
 9 files changed, 49 insertions(+), 17 deletions(-)

Comments

Marc Zyngier Jan. 11, 2025, 11:01 a.m. UTC | #1
On Sat, 11 Jan 2025 01:24:48 +0000,
Sean Christopherson <seanjc@google.com> wrote:
> 
> Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
> that KVM_RUN needs to be re-executed prior to save/restore in order to
> complete the instruction/operation that triggered the userspace exit.
> 
> KVM's current approach of adding notes in the Documentation is beyond
> brittle, e.g. there is at least one known case where a KVM developer added
> a new userspace exit type, and then that same developer forgot to handle
> completion when adding userspace support.

Is this going to fix anything? If they couldn't be bothered to read
the documentation, let alone update it, how is that going to be
improved by extra rules and regulations?

I don't see how someone ignoring the documented behaviour of a given
exit reason will, all of a sudden, have an epiphany and take a *new*
flag into account.

> 
> On x86, there are multiple exits that need completion, but they are all
> conveniently funneled through a single callback, i.e. in theory, this is a
> one-time thing for KVM x86 (and other architectures could follow suit with
> additional refactoring).
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  Documentation/virt/kvm/api.rst    | 48 ++++++++++++++++++++++---------
>  arch/powerpc/kvm/book3s_emulate.c |  1 +
>  arch/powerpc/kvm/book3s_hv.c      |  1 +
>  arch/powerpc/kvm/book3s_pr.c      |  2 ++
>  arch/powerpc/kvm/booke.c          |  1 +
>  arch/x86/include/uapi/asm/kvm.h   |  7 +++--
>  arch/x86/kvm/x86.c                |  2 ++
>  include/uapi/linux/kvm.h          |  3 ++
>  virt/kvm/kvm_main.c               |  1 +
>  9 files changed, 49 insertions(+), 17 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index c92c8d4e8779..8e172675d8d6 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6505,7 +6505,7 @@ local APIC is not used.
>  
>  	__u16 flags;
>  
> -More architecture-specific flags detailing state of the VCPU that may
> +Common and architecture-specific flags detailing state of the VCPU that may
>  affect the device's behavior. Current defined flags::
>  
>    /* x86, set if the VCPU is in system management mode */
> @@ -6518,6 +6518,8 @@ affect the device's behavior. Current defined flags::
>    /* arm64, set for KVM_EXIT_DEBUG */
>    #define KVM_DEBUG_ARCH_HSR_HIGH_VALID  (1 << 0)
>  
> +  /* all architectures, set when the exit needs completion (via KVM_RUN) */
> +  #define KVM_RUN_NEEDS_COMPLETION  (1 << 15)
>  ::
>  
>  	/* in (pre_kvm_run), out (post_kvm_run) */
> @@ -6632,19 +6634,10 @@ requires a guest to interact with host userspace.
>  
>  .. note::
>  
> -      For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
> -      KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR, KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL
> -      the corresponding operations are complete (and guest state is consistent)
> -      only after userspace has re-entered the kernel with KVM_RUN.  The kernel
> -      side will first finish incomplete operations and then check for pending
> -      signals.
> +      For some exits, userspace must re-enter the kernel with KVM_RUN to
> +      complete the exit and ensure guest state is consistent.
>  
> -      The pending state of the operation is not preserved in state which is
> -      visible to userspace, thus userspace should ensure that the operation is
> -      completed before performing a live migration.  Userspace can re-enter the
> -      guest with an unmasked signal pending or with the immediate_exit field set
> -      to complete pending operations without allowing any further instructions
> -      to be executed.
> +      See KVM_CAP_NEEDS_COMPLETION for details.
>  
>  ::
>  
> @@ -8239,7 +8232,7 @@ Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the
>  core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest.
>  
>  7.36 KVM_CAP_X86_GUEST_MODE
> -------------------------------
> +---------------------------
>  
>  :Architectures: x86
>  :Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
> @@ -8252,6 +8245,33 @@ KVM exits with the register state of either the L1 or L2 guest
>  depending on which executed at the time of an exit. Userspace must
>  take care to differentiate between these cases.
>  
> +7.37 KVM_CAP_NEEDS_COMPLETION
> +-----------------------------
> +
> +:Architectures: all
> +:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
> +
> +The presence of this capability indicates that KVM_RUN will set
> +KVM_RUN_NEEDS_COMPLETION in kvm_run.flags if KVM requires userspace to re-enter
> +the kernel KVM_RUN to complete the exit.
> +
> +For select exits, userspace must re-enter the kernel with KVM_RUN to complete
> +the corresponding operation, only after which is guest state guaranteed to be
> +consistent.  On such a KVM_RUN, the kernel side will first finish incomplete
> +operations and then check for pending signals.
> +
> +The pending state of the operation for such exits is not preserved in state
> +which is visible to userspace, thus userspace should ensure that the operation
> +is completed before performing state save/restore, e.g. for live migration.
> +Userspace can re-enter the guest with an unmasked signal pending or with the
> +immediate_exit field set to complete pending operations without allowing any
> +further instructions to be executed.
> +
> +Without KVM_CAP_NEEDS_COMPLETION, KVM_RUN_NEEDS_COMPLETION will never be set
> +and userspace must assume that exits of type KVM_EXIT_IO, KVM_EXIT_MMIO,
> +KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR,
> +KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL require completion.

So once you advertise KVM_CAP_NEEDS_COMPLETION, the completion flag
must be present for all of these exits, right? And from what I can
tell, this capability is unconditionally advertised.

Yet, you don't amend arm64 to publish that flag. Not that I think this
causes any issue (even if you save the state at that point without
reentering the guest, it will still be consistent), but that
directly contradicts the documentation (isn't that ironic? ;-).

Or is your intent to *relax* the requirements on arm64 (and anything
else but x86 and POWER)?

	M.
Chao Gao Jan. 13, 2025, 2:09 a.m. UTC | #2
On Fri, Jan 10, 2025 at 05:24:48PM -0800, Sean Christopherson wrote:
>Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
>that KVM_RUN needs to be re-executed prior to save/restore in order to
>complete the instruction/operation that triggered the userspace exit.
>
>KVM's current approach of adding notes in the Documentation is beyond
>brittle, e.g. there is at least one known case where a KVM developer added
>a new userspace exit type, and then that same developer forgot to handle
>completion when adding userspace support.

This answers one question I had:
https://lore.kernel.org/kvm/Z1bmUCEdoZ87wIMn@intel.com/

So, it is the VMM's (i.e., QEMU's) responsibility to re-execute KVM_RUN in this
case.

Btw, can this flag be used to address the issue [*] with steal time accounting?
We can set the new flag for each vCPU in the PM notifier and we need to change
the re-execution to handle steal time accounting (not just IO completion).

[*]: https://lore.kernel.org/kvm/Z36XJl1OAahVkxhl@google.com/

one nit below,

>--- a/arch/x86/include/uapi/asm/kvm.h
>+++ b/arch/x86/include/uapi/asm/kvm.h
>@@ -104,9 +104,10 @@ struct kvm_ioapic_state {
> #define KVM_IRQCHIP_IOAPIC       2
> #define KVM_NR_IRQCHIPS          3
> 
>-#define KVM_RUN_X86_SMM		 (1 << 0)
>-#define KVM_RUN_X86_BUS_LOCK     (1 << 1)
>-#define KVM_RUN_X86_GUEST_MODE   (1 << 2)
>+#define KVM_RUN_X86_SMM			(1 << 0)
>+#define KVM_RUN_X86_BUS_LOCK		(1 << 1)
>+#define KVM_RUN_X86_GUEST_MODE		(1 << 2)
>+#define KVM_RUN_X86_NEEDS_COMPLETION	(1 << 2)

This X86_NEEDS_COMPLETION should be dropped. It is never used.
Binbin Wu Jan. 13, 2025, 9:01 a.m. UTC | #3
On 1/13/2025 10:09 AM, Chao Gao wrote:
> On Fri, Jan 10, 2025 at 05:24:48PM -0800, Sean Christopherson wrote:
>> Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
>> that KVM_RUN needs to be re-executed prior to save/restore in order to
>> complete the instruction/operation that triggered the userspace exit.
>>
>> KVM's current approach of adding notes in the Documentation is beyond
>> brittle, e.g. there is at least one known case where a KVM developer added
>> a new userspace exit type, and then that same developer forgot to handle
>> completion when adding userspace support.
> This answers one question I had:
> https://lore.kernel.org/kvm/Z1bmUCEdoZ87wIMn@intel.com/
In the current QEMU code, QEMU always returns to KVM via KVM_RUN after it
has successfully handled a KVM exit reason, no matter what the exit reason is.
The complete_userspace_io() callback will be called if it has been set up.
So if a new KVM exit reason is added in QEMU, it seems QEMU doesn't need
special handling to make the complete_userspace_io() callback be called.
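
For reference, a minimal sketch of that pattern (the shape of the loop only,
not QEMU's actual code; handle_exit() is an illustrative stand-in for the
per-exit-reason dispatch):

  #include <errno.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Provided by the VMM elsewhere; declared here only so the sketch is
   * self-contained. */
  int handle_exit(struct kvm_run *run);

  /*
   * After an exit is handled, control unconditionally goes back into
   * KVM_RUN, so any pending complete_userspace_io() callback runs on the
   * next entry regardless of the exit reason.
   */
  static void vcpu_loop(int vcpu_fd, struct kvm_run *run)
  {
          for (;;) {
                  if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno != EINTR)
                          break;                  /* real error */

                  if (handle_exit(run) < 0)       /* per-exit-reason dispatch */
                          break;
          }
  }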

However, QEMU is not the only userspace VMM that supports KVM, it makes
sense to make the solution generic and clear for different userspace VMMs.

Regarding the support of MapGPA for TDX when live migration is considered:
since a big range will be split into 2MB chunks, in order for the status to be
correct after TD live migration, the complete_userspace_io() callback needs to
set the return code to retry with the next_gpa if vcpu->wants_to_run is false
or vcpu->run->immediate_exit__unsafe is set; otherwise, the TDX guest will see
a successful return code and think the whole range has been converted
successfully.

@@ -1093,7 +1093,8 @@ static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
          * immediately after STI or MOV/POP SS.
          */
         if (pi_has_pending_interrupt(vcpu) ||
-           kvm_test_request(KVM_REQ_NMI, vcpu) || vcpu->arch.nmi_pending) {
+           kvm_test_request(KVM_REQ_NMI, vcpu) || vcpu->arch.nmi_pending ||
+           !vcpu->wants_to_run) {
                 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
                 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
                 return 1;

Of course, it can be addressed later when TD live migration is supported.


>
> So, it is the VMM's (i.e., QEMU's) responsibility to re-execute KVM_RUN in this
> case.
>
> Btw, can this flag be used to address the issue [*] with steal time accounting?
> We can set the new flag for each vCPU in the PM notifier and we need to change
> the re-execution to handle steal time accounting (not just IO completion).
>
> [*]: https://lore.kernel.org/kvm/Z36XJl1OAahVkxhl@google.com/
>
> one nit below,
>
>> --- a/arch/x86/include/uapi/asm/kvm.h
>> +++ b/arch/x86/include/uapi/asm/kvm.h
>> @@ -104,9 +104,10 @@ struct kvm_ioapic_state {
>> #define KVM_IRQCHIP_IOAPIC       2
>> #define KVM_NR_IRQCHIPS          3
>>
>> -#define KVM_RUN_X86_SMM		 (1 << 0)
>> -#define KVM_RUN_X86_BUS_LOCK     (1 << 1)
>> -#define KVM_RUN_X86_GUEST_MODE   (1 << 2)
>> +#define KVM_RUN_X86_SMM			(1 << 0)
>> +#define KVM_RUN_X86_BUS_LOCK		(1 << 1)
>> +#define KVM_RUN_X86_GUEST_MODE		(1 << 2)
>> +#define KVM_RUN_X86_NEEDS_COMPLETION	(1 << 2)
> This X86_NEEDS_COMPLETION should be dropped. It is never used.
>
Sean Christopherson Jan. 13, 2025, 3:44 p.m. UTC | #4
On Sat, Jan 11, 2025, Marc Zyngier wrote:
> On Sat, 11 Jan 2025 01:24:48 +0000,
> Sean Christopherson <seanjc@google.com> wrote:
> > 
> > Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
> > that KVM_RUN needs to be re-executed prior to save/restore in order to
> > complete the instruction/operation that triggered the userspace exit.
> > 
> > KVM's current approach of adding notes in the Documentation is beyond
> > brittle, e.g. there is at least one known case where a KVM developer added
> > a new userspace exit type, and then that same developer forgot to handle
> > completion when adding userspace support.
> 
> Is this going to fix anything? If they couldn't be bothered to read
> the documentation, let alone update it, how is that going to be
> improved by extra rules and regulations?
> 
> I don't see how someone ignoring the documented behaviour of a given
> exit reason is, all of a sudden, have an epiphany and take a *new*
> flag into account.

The idea is to reduce the probability of introducing bugs, in KVM or userspace,
every time KVM attaches a completion callback.  Yes, userspace would need to be
updated to handle KVM_RUN_NEEDS_COMPLETION, but once that flag is merged, neither
KVM's documentation nor userspace would ever need to be updated again.  And if
all architectures took an approach of handling completion via function callback,
I'm pretty sure we'd never need to manually update KVM itself either.

> > +7.37 KVM_CAP_NEEDS_COMPLETION
> > +-----------------------------
> > +
> > +:Architectures: all
> > +:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
> > +
> > +The presence of this capability indicates that KVM_RUN will set
> > +KVM_RUN_NEEDS_COMPLETION in kvm_run.flags if KVM requires userspace to re-enter
> > +the kernel KVM_RUN to complete the exit.
> > +
> > +For select exits, userspace must re-enter the kernel with KVM_RUN to complete
> > +the corresponding operation, only after which is guest state guaranteed to be
> > +consistent.  On such a KVM_RUN, the kernel side will first finish incomplete
> > +operations and then check for pending signals.
> > +
> > +The pending state of the operation for such exits is not preserved in state
> > +which is visible to userspace, thus userspace should ensure that the operation
> > +is completed before performing state save/restore, e.g. for live migration.
> > +Userspace can re-enter the guest with an unmasked signal pending or with the
> > +immediate_exit field set to complete pending operations without allowing any
> > +further instructions to be executed.
> > +
> > +Without KVM_CAP_NEEDS_COMPLETION, KVM_RUN_NEEDS_COMPLETION will never be set
> > +and userspace must assume that exits of type KVM_EXIT_IO, KVM_EXIT_MMIO,
> > +KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR,
> > +KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL require completion.
> 
> So once you advertise KVM_CAP_NEEDS_COMPLETION, the completion flag
> must be present for all of these exits, right? And from what I can
> tell, this capability is unconditionally advertised.
> 
> Yet, you don't amend arm64 to publish that flag. Not that I think this
> causes any issue (even if you save the state at that point without
> reentering the guest, it will be still be consistent), but that
> directly contradicts the documentation (isn't that ironic? ;-).

It does cause issues, I missed this code in kvm_arch_vcpu_ioctl_run():

	if (run->exit_reason == KVM_EXIT_MMIO) {
		ret = kvm_handle_mmio_return(vcpu);
		if (ret <= 0)
			return ret;
	}

> Or is your intent to *relax* the requirements on arm64 (and anything
> else but x86 and POWER)?
Sean Christopherson Jan. 13, 2025, 4:59 p.m. UTC | #5
On Mon, Jan 13, 2025, Chao Gao wrote:
> On Fri, Jan 10, 2025 at 05:24:48PM -0800, Sean Christopherson wrote:
> >Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
> >that KVM_RUN needs to be re-executed prior to save/restore in order to
> >complete the instruction/operation that triggered the userspace exit.
> >
> >KVM's current approach of adding notes in the Documentation is beyond
> >brittle, e.g. there is at least one known case where a KVM developer added
> >a new userspace exit type, and then that same developer forgot to handle
> >completion when adding userspace support.
> 
> This answers one question I had:
> https://lore.kernel.org/kvm/Z1bmUCEdoZ87wIMn@intel.com/
> 
> So, it is the VMM's (i.e., QEMU's) responsibility to re-execute KVM_RUN in this
> case.

Yep.

> Btw, can this flag be used to address the issue [*] with steal time accounting?
> We can set the new flag for each vCPU in the PM notifier and we need to change
> the re-execution to handle steal time accounting (not just IO completion).
> 
> [*]: https://lore.kernel.org/kvm/Z36XJl1OAahVkxhl@google.com/

Uh, hmm.  Partially?  And not without creating new, potentially worse problems.

I like the idea, but (a) there's no guarantee a vCPU would be "in" KVM_RUN at
the time of suspend, and (b) KVM would need to take vcpu->mutex in the PM notifier
in order to avoid clobbering the current completion callback, which is definitely
a net negative (hello, deadlocks).

E.g. if a vCPU task is in userspace processing emulated MMIO at the time of
suspend+resume, KVM's completion callback will be non-zero and must be preserved.
And if a vCPU task is in userspace processing an exit that _doesn't_ require
completion, setting KVM_RUN_NEEDS_COMPLETION would likely be missed by userspace,
e.g. if userspace checks the flag only after regaining control from KVM_RUN.

In general, I think setting KVM_RUN_NEEDS_COMPLETION outside of KVM_RUN would add
too much complexity.

> one nit below,
> 
> >--- a/arch/x86/include/uapi/asm/kvm.h
> >+++ b/arch/x86/include/uapi/asm/kvm.h
> >@@ -104,9 +104,10 @@ struct kvm_ioapic_state {
> > #define KVM_IRQCHIP_IOAPIC       2
> > #define KVM_NR_IRQCHIPS          3
> > 
> >-#define KVM_RUN_X86_SMM		 (1 << 0)
> >-#define KVM_RUN_X86_BUS_LOCK     (1 << 1)
> >-#define KVM_RUN_X86_GUEST_MODE   (1 << 2)
> >+#define KVM_RUN_X86_SMM			(1 << 0)
> >+#define KVM_RUN_X86_BUS_LOCK		(1 << 1)
> >+#define KVM_RUN_X86_GUEST_MODE		(1 << 2)
> >+#define KVM_RUN_X86_NEEDS_COMPLETION	(1 << 2)
> 
> This X86_NEEDS_COMPLETION should be dropped. It is never used.

Gah, thanks!
Marc Zyngier Jan. 13, 2025, 5:58 p.m. UTC | #6
On Mon, 13 Jan 2025 15:44:28 +0000,
Sean Christopherson <seanjc@google.com> wrote:
> 
> On Sat, Jan 11, 2025, Marc Zyngier wrote:
> > On Sat, 11 Jan 2025 01:24:48 +0000,
> > Sean Christopherson <seanjc@google.com> wrote:
> > > 
> > > Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
> > > that KVM_RUN needs to be re-executed prior to save/restore in order to
> > > complete the instruction/operation that triggered the userspace exit.
> > > 
> > > KVM's current approach of adding notes in the Documentation is beyond
> > > brittle, e.g. there is at least one known case where a KVM developer added
> > > a new userspace exit type, and then that same developer forgot to handle
> > > completion when adding userspace support.
> > 
> > Is this going to fix anything? If they couldn't be bothered to read
> > the documentation, let alone update it, how is that going to be
> > improved by extra rules and regulations?
> > 
> > I don't see how someone ignoring the documented behaviour of a given
> > exit reason is, all of a sudden, have an epiphany and take a *new*
> > flag into account.
> 
> The idea is to reduce the probability of introducing bugs, in KVM or userspace,
> every time KVM attaches a completion callback.  Yes, userspace would need to be
> updated to handle KVM_RUN_NEEDS_COMPLETION, but once that flag is merged, neither
> KVM's documentation nor userspace would never need to be updated again.  And if
> all architectures took an approach of handling completion via function callback,
> I'm pretty sure we'd never need to manually update KVM itself either.

You are assuming that we need this completion, and I dispute this
assertion.

>
> > > +7.37 KVM_CAP_NEEDS_COMPLETION
> > > +-----------------------------
> > > +
> > > +:Architectures: all
> > > +:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
> > > +
> > > +The presence of this capability indicates that KVM_RUN will set
> > > +KVM_RUN_NEEDS_COMPLETION in kvm_run.flags if KVM requires userspace to re-enter
> > > +the kernel KVM_RUN to complete the exit.
> > > +
> > > +For select exits, userspace must re-enter the kernel with KVM_RUN to complete
> > > +the corresponding operation, only after which is guest state guaranteed to be
> > > +consistent.  On such a KVM_RUN, the kernel side will first finish incomplete
> > > +operations and then check for pending signals.
> > > +
> > > +The pending state of the operation for such exits is not preserved in state
> > > +which is visible to userspace, thus userspace should ensure that the operation
> > > +is completed before performing state save/restore, e.g. for live migration.
> > > +Userspace can re-enter the guest with an unmasked signal pending or with the
> > > +immediate_exit field set to complete pending operations without allowing any
> > > +further instructions to be executed.
> > > +
> > > +Without KVM_CAP_NEEDS_COMPLETION, KVM_RUN_NEEDS_COMPLETION will never be set
> > > +and userspace must assume that exits of type KVM_EXIT_IO, KVM_EXIT_MMIO,
> > > +KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR,
> > > +KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL require completion.
> > 
> > So once you advertise KVM_CAP_NEEDS_COMPLETION, the completion flag
> > must be present for all of these exits, right? And from what I can
> > tell, this capability is unconditionally advertised.
> > 
> > Yet, you don't amend arm64 to publish that flag. Not that I think this
> > causes any issue (even if you save the state at that point without
> > reentering the guest, it will be still be consistent), but that
> > directly contradicts the documentation (isn't that ironic? ;-).
> 
> It does cause issues, I missed this code in kvm_arch_vcpu_ioctl_run():
> 
> 	if (run->exit_reason == KVM_EXIT_MMIO) {
> 		ret = kvm_handle_mmio_return(vcpu);
> 		if (ret <= 0)
> 			return ret;
> 	}

That's satisfying a load from the guest forwarded to userspace. If the
VMM did a save of the guest at this stage, restored and resumed it,
*nothing* bad would happen, as PC still points to the instruction that
got forwarded. You'll see the same load again.

As for all arm64 synchronous exceptions, they are idempotent, and can
be repeated as often as you want without side effects.

	M.
Sean Christopherson Jan. 13, 2025, 6:58 p.m. UTC | #7
On Mon, Jan 13, 2025, Marc Zyngier wrote:
> On Mon, 13 Jan 2025 15:44:28 +0000,
> Sean Christopherson <seanjc@google.com> wrote:
> > 
> > On Sat, Jan 11, 2025, Marc Zyngier wrote:
> > > On Sat, 11 Jan 2025 01:24:48 +0000,
> > > Sean Christopherson <seanjc@google.com> wrote:
> > > > 
> > > > Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
> > > > that KVM_RUN needs to be re-executed prior to save/restore in order to
> > > > complete the instruction/operation that triggered the userspace exit.
> > > > 
> > > > KVM's current approach of adding notes in the Documentation is beyond
> > > > brittle, e.g. there is at least one known case where a KVM developer added
> > > > a new userspace exit type, and then that same developer forgot to handle
> > > > completion when adding userspace support.
> > > 
> > > Is this going to fix anything? If they couldn't be bothered to read
> > > the documentation, let alone update it, how is that going to be
> > > improved by extra rules and regulations?
> > > 
> > > I don't see how someone ignoring the documented behaviour of a given
> > > exit reason is, all of a sudden, have an epiphany and take a *new*
> > > flag into account.
> > 
> > The idea is to reduce the probability of introducing bugs, in KVM or userspace,
> > every time KVM attaches a completion callback.  Yes, userspace would need to be
> > updated to handle KVM_RUN_NEEDS_COMPLETION, but once that flag is merged, neither
> > KVM's documentation nor userspace would never need to be updated again.  And if
> > all architectures took an approach of handling completion via function callback,
> > I'm pretty sure we'd never need to manually update KVM itself either.
> 
> You are assuming that we need this completion, and I dispute this
> assertion.

Ah, gotcha.

> > > > +The pending state of the operation for such exits is not preserved in state
> > > > +which is visible to userspace, thus userspace should ensure that the operation
> > > > +is completed before performing state save/restore, e.g. for live migration.
> > > > +Userspace can re-enter the guest with an unmasked signal pending or with the
> > > > +immediate_exit field set to complete pending operations without allowing any
> > > > +further instructions to be executed.
> > > > +
> > > > +Without KVM_CAP_NEEDS_COMPLETION, KVM_RUN_NEEDS_COMPLETION will never be set
> > > > +and userspace must assume that exits of type KVM_EXIT_IO, KVM_EXIT_MMIO,
> > > > +KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR,
> > > > +KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL require completion.
> > > 
> > > So once you advertise KVM_CAP_NEEDS_COMPLETION, the completion flag
> > > must be present for all of these exits, right? And from what I can
> > > tell, this capability is unconditionally advertised.
> > > 
> > > Yet, you don't amend arm64 to publish that flag. Not that I think this
> > > causes any issue (even if you save the state at that point without
> > > reentering the guest, it will be still be consistent), but that
> > > directly contradicts the documentation (isn't that ironic? ;-).
> > 
> > It does cause issues, I missed this code in kvm_arch_vcpu_ioctl_run():
> > 
> > 	if (run->exit_reason == KVM_EXIT_MMIO) {
> > 		ret = kvm_handle_mmio_return(vcpu);
> > 		if (ret <= 0)
> > 			return ret;
> > 	}
> 
> That's satisfying a load from the guest forwarded to userspace.

And MMIO stores, no?  I.e. PC needs to be incremented on stores as well.

> If the VMM did a save of the guest at this stage, restored and resumed it,
> *nothing* bad would happen, as PC still points to the instruction that got
> forwarded. You'll see the same load again.

But replaying an MMIO store could cause all kinds of problems, and even MMIO
loads could theoretically be problematic, e.g. if there are side effects in the
device that trigger on access to a device register.
Marc Zyngier Jan. 13, 2025, 7:38 p.m. UTC | #8
On Mon, 13 Jan 2025 18:58:45 +0000,
Sean Christopherson <seanjc@google.com> wrote:
> 
> On Mon, Jan 13, 2025, Marc Zyngier wrote:
> > On Mon, 13 Jan 2025 15:44:28 +0000,
> > Sean Christopherson <seanjc@google.com> wrote:
> > > 
> > > On Sat, Jan 11, 2025, Marc Zyngier wrote:
> > > > On Sat, 11 Jan 2025 01:24:48 +0000,
> > > > Sean Christopherson <seanjc@google.com> wrote:
> > > > > 
> > > > > Add a kvm_run flag, KVM_RUN_NEEDS_COMPLETION, to communicate to userspace
> > > > > that KVM_RUN needs to be re-executed prior to save/restore in order to
> > > > > complete the instruction/operation that triggered the userspace exit.
> > > > > 
> > > > > KVM's current approach of adding notes in the Documentation is beyond
> > > > > brittle, e.g. there is at least one known case where a KVM developer added
> > > > > a new userspace exit type, and then that same developer forgot to handle
> > > > > completion when adding userspace support.
> > > > 
> > > > Is this going to fix anything? If they couldn't be bothered to read
> > > > the documentation, let alone update it, how is that going to be
> > > > improved by extra rules and regulations?
> > > > 
> > > > I don't see how someone ignoring the documented behaviour of a given
> > > > exit reason is, all of a sudden, have an epiphany and take a *new*
> > > > flag into account.
> > > 
> > > The idea is to reduce the probability of introducing bugs, in KVM or userspace,
> > > every time KVM attaches a completion callback.  Yes, userspace would need to be
> > > updated to handle KVM_RUN_NEEDS_COMPLETION, but once that flag is merged, neither
> > > KVM's documentation nor userspace would never need to be updated again.  And if
> > > all architectures took an approach of handling completion via function callback,
> > > I'm pretty sure we'd never need to manually update KVM itself either.
> > 
> > You are assuming that we need this completion, and I dispute this
> > assertion.
> 
> Ah, gotcha.
> 
> > > > > +The pending state of the operation for such exits is not preserved in state
> > > > > +which is visible to userspace, thus userspace should ensure that the operation
> > > > > +is completed before performing state save/restore, e.g. for live migration.
> > > > > +Userspace can re-enter the guest with an unmasked signal pending or with the
> > > > > +immediate_exit field set to complete pending operations without allowing any
> > > > > +further instructions to be executed.
> > > > > +
> > > > > +Without KVM_CAP_NEEDS_COMPLETION, KVM_RUN_NEEDS_COMPLETION will never be set
> > > > > +and userspace must assume that exits of type KVM_EXIT_IO, KVM_EXIT_MMIO,
> > > > > +KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR,
> > > > > +KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL require completion.
> > > > 
> > > > So once you advertise KVM_CAP_NEEDS_COMPLETION, the completion flag
> > > > must be present for all of these exits, right? And from what I can
> > > > tell, this capability is unconditionally advertised.
> > > > 
> > > > Yet, you don't amend arm64 to publish that flag. Not that I think this
> > > > causes any issue (even if you save the state at that point without
> > > > reentering the guest, it will be still be consistent), but that
> > > > directly contradicts the documentation (isn't that ironic? ;-).
> > > 
> > > It does cause issues, I missed this code in kvm_arch_vcpu_ioctl_run():
> > > 
> > > 	if (run->exit_reason == KVM_EXIT_MMIO) {
> > > 		ret = kvm_handle_mmio_return(vcpu);
> > > 		if (ret <= 0)
> > > 			return ret;
> > > 	}
> > 
> > That's satisfying a load from the guest forwarded to userspace.
> 
> And MMIO stores, no?  I.e. PC needs to be incremented on stores as well.

Yes, *after* the store has completed. If you replay the instruction,
the same store comes out.

>
> > If the VMM did a save of the guest at this stage, restored and resumed it,
> > *nothing* bad would happen, as PC still points to the instruction that got
> > forwarded. You'll see the same load again.
> 
> But replaying an MMIO store could cause all kinds of problems, and even MMIO
> loads could theoretically be problematic, e.g. if there are side effects in the
> device that trigger on access to a device register.

But that's the VMM's problem. If it has modified its own state and
doesn't return to the guest to complete the instruction, that's just
as bad as a load, which *does* have side effects as well.

Overall, the guest state exposed by KVM is always correct, and
replaying the instruction is not going to change that. It is if the
VMM is broken that things turn ugly *for the VMM itself*, and I claim
that no amount of flags being added is going to help that.

	M.
Sean Christopherson Jan. 13, 2025, 10:04 p.m. UTC | #9
On Mon, Jan 13, 2025, Marc Zyngier wrote:
> On Mon, 13 Jan 2025 18:58:45 +0000,
> Sean Christopherson <seanjc@google.com> wrote:
> > 
> > On Mon, Jan 13, 2025, Marc Zyngier wrote:
> > > On Mon, 13 Jan 2025 15:44:28 +0000,
> > > Sean Christopherson <seanjc@google.com> wrote:
> > > > 
> > > > On Sat, Jan 11, 2025, Marc Zyngier wrote:
> > > > > Yet, you don't amend arm64 to publish that flag. Not that I think this
> > > > > causes any issue (even if you save the state at that point without
> > > > > reentering the guest, it will be still be consistent), but that
> > > > > directly contradicts the documentation (isn't that ironic? ;-).
> > > > 
> > > > It does cause issues, I missed this code in kvm_arch_vcpu_ioctl_run():
> > > > 
> > > > 	if (run->exit_reason == KVM_EXIT_MMIO) {
> > > > 		ret = kvm_handle_mmio_return(vcpu);
> > > > 		if (ret <= 0)
> > > > 			return ret;
> > > > 	}
> > > 
> > > That's satisfying a load from the guest forwarded to userspace.
> > 
> > And MMIO stores, no?  I.e. PC needs to be incremented on stores as well.
> 
> Yes, *after* the store as completed. If you replay the instruction,
> the same store comes out.
> 
> > > If the VMM did a save of the guest at this stage, restored and resumed it,
> > > *nothing* bad would happen, as PC still points to the instruction that got
> > > forwarded. You'll see the same load again.
> > 
> > But replaying an MMIO store could cause all kinds of problems, and even MMIO
> > loads could theoretically be problematic, e.g. if there are side effects in the
> > device that trigger on access to a device register.
> 
> But that's the VMM's problem. If it has modified its own state and
> doesn't return to the guest to complete the instruction, that's just
> as bad as a load, which *do* have side effects as well.

Agreed, just wanted to make sure I wasn't completely misunderstanding something
about arm64.

> Overall, the guest state exposed by KVM is always correct, and
> replaying the instruction is not going to change that. It is if the
> VMM is broken that things turn ugly *for the VMM itself*, 
> and I claim that no amount of flag being added is going to help that.

On x86 at least, adding KVM_RUN_NEEDS_COMPLETION reduces the chances for human
error.  x86 has had bugs in both KVM (patch 1) and userspace (Google's VMM when
handling MSR exits) that would have been avoided if KVM_RUN_NEEDS_COMPLETION existed.
Unless the VMM is doing something decidedly odd, userspace needs to write code once
(maybe even just once for all architectures).  For KVM, the flag is set based on
whether or not the vCPU has a valid completion callback, i.e. will be correct so
long as the underlying KVM code is correct.

Contrast that with the current approach, where the KVM developer needs to get
the KVM code correct and remember to update KVM's documentation.  Documentation
is especially problematic, because in practice it can't be tested, i.e. is much
more likely to be missed by the developer and the maintainer.  The VMM either
needs to blindly redo KVM_RUN (as selftests do, and apparently as QEMU does), or
the developer adding VMM support needs to be diligent in reading KVM's documentation.
And like KVM documentation, testing that the VMM is implemented to KVM's "spec"
is effectively impossible in practice, because 99.9999% of the time userspace
exits and save/restore will work just fine.

I do agree that the VMM is likely going to run into problems sooner or later if
the developers/maintainers don't fundamentally understand the need to redo KVM_RUN,
but I also think there's significant value in reducing the chances for simple
human error to result in broken VMs.
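
Putting the discussion together, a minimal sketch of the userspace side of
this, assuming headers that contain KVM_CAP_NEEDS_COMPLETION and
KVM_RUN_NEEDS_COMPLETION from this series; save_vcpu_state() and the fd names
are illustrative assumptions, not an existing VMM's API:

  #include <errno.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Illustrative helper, provided by the VMM elsewhere. */
  int save_vcpu_state(int vcpu_fd);

  /*
   * Save path for one vCPU: if the last exit still needs completion,
   * re-enter KVM_RUN with immediate_exit set so KVM finishes the pending
   * operation (and then bails out) before any state is read.
   */
  static int save_vcpu(int kvm_fd, int vcpu_fd, struct kvm_run *run)
  {
          int has_cap = ioctl(kvm_fd, KVM_CHECK_EXTENSION,
                              KVM_CAP_NEEDS_COMPLETION);

          /*
           * Without the capability the flag is never set, so conservatively
           * redo KVM_RUN for any exit, as selftests do today.
           */
          if (!has_cap || (run->flags & KVM_RUN_NEEDS_COMPLETION)) {
                  run->immediate_exit = 1;
                  if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno != EINTR)
                          return -1;
                  run->immediate_exit = 0;
          }

          return save_vcpu_state(vcpu_fd);
  }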

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index c92c8d4e8779..8e172675d8d6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6505,7 +6505,7 @@  local APIC is not used.
 
 	__u16 flags;
 
-More architecture-specific flags detailing state of the VCPU that may
+Common and architecture-specific flags detailing state of the VCPU that may
 affect the device's behavior. Current defined flags::
 
   /* x86, set if the VCPU is in system management mode */
@@ -6518,6 +6518,8 @@  affect the device's behavior. Current defined flags::
   /* arm64, set for KVM_EXIT_DEBUG */
   #define KVM_DEBUG_ARCH_HSR_HIGH_VALID  (1 << 0)
 
+  /* all architectures, set when the exit needs completion (via KVM_RUN) */
+  #define KVM_RUN_NEEDS_COMPLETION  (1 << 15)
 ::
 
 	/* in (pre_kvm_run), out (post_kvm_run) */
@@ -6632,19 +6634,10 @@  requires a guest to interact with host userspace.
 
 .. note::
 
-      For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
-      KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR, KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL
-      the corresponding operations are complete (and guest state is consistent)
-      only after userspace has re-entered the kernel with KVM_RUN.  The kernel
-      side will first finish incomplete operations and then check for pending
-      signals.
+      For some exits, userspace must re-enter the kernel with KVM_RUN to
+      complete the exit and ensure guest state is consistent.
 
-      The pending state of the operation is not preserved in state which is
-      visible to userspace, thus userspace should ensure that the operation is
-      completed before performing a live migration.  Userspace can re-enter the
-      guest with an unmasked signal pending or with the immediate_exit field set
-      to complete pending operations without allowing any further instructions
-      to be executed.
+      See KVM_CAP_NEEDS_COMPLETION for details.
 
 ::
 
@@ -8239,7 +8232,7 @@  Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the
 core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest.
 
 7.36 KVM_CAP_X86_GUEST_MODE
-------------------------------
+---------------------------
 
 :Architectures: x86
 :Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
@@ -8252,6 +8245,33 @@  KVM exits with the register state of either the L1 or L2 guest
 depending on which executed at the time of an exit. Userspace must
 take care to differentiate between these cases.
 
+7.37 KVM_CAP_NEEDS_COMPLETION
+-----------------------------
+
+:Architectures: all
+:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
+
+The presence of this capability indicates that KVM_RUN will set
+KVM_RUN_NEEDS_COMPLETION in kvm_run.flags if KVM requires userspace to re-enter
+the kernel KVM_RUN to complete the exit.
+
+For select exits, userspace must re-enter the kernel with KVM_RUN to complete
+the corresponding operation, only after which is guest state guaranteed to be
+consistent.  On such a KVM_RUN, the kernel side will first finish incomplete
+operations and then check for pending signals.
+
+The pending state of the operation for such exits is not preserved in state
+which is visible to userspace, thus userspace should ensure that the operation
+is completed before performing state save/restore, e.g. for live migration.
+Userspace can re-enter the guest with an unmasked signal pending or with the
+immediate_exit field set to complete pending operations without allowing any
+further instructions to be executed.
+
+Without KVM_CAP_NEEDS_COMPLETION, KVM_RUN_NEEDS_COMPLETION will never be set
+and userspace must assume that exits of type KVM_EXIT_IO, KVM_EXIT_MMIO,
+KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR,
+KVM_EXIT_X86_WRMSR, and KVM_EXIT_HYPERCALL require completion.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index de126d153328..15014a66c167 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -374,6 +374,7 @@  int kvmppc_core_emulate_op_pr(struct kvm_vcpu *vcpu,
 			}
 
 			vcpu->run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			vcpu->run->flags |= KVM_RUN_NEEDS_COMPLETION;
 			vcpu->arch.hcall_needed = 1;
 			emulated = EMULATE_EXIT_USER;
 			break;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b253f7372774..05ad0c3494f1 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1767,6 +1767,7 @@  static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 		for (i = 0; i < 9; ++i)
 			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
 		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		run->flags |= KVM_RUN_NEEDS_COMPLETION;
 		vcpu->arch.hcall_needed = 1;
 		r = RESUME_HOST;
 		break;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 83bcdc80ce51..c45beb64905a 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1310,6 +1310,7 @@  int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr)
 				run->papr_hcall.args[i] = gpr;
 			}
 			run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			run->flags |= KVM_RUN_NEEDS_COMPLETION;
 			vcpu->arch.hcall_needed = 1;
 			r = RESUME_HOST;
 		} else if (vcpu->arch.osi_enabled &&
@@ -1320,6 +1321,7 @@  int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr)
 			int i;
 
 			run->exit_reason = KVM_EXIT_OSI;
+			run->flags |= KVM_RUN_NEEDS_COMPLETION;
 			for (i = 0; i < 32; i++)
 				gprs[i] = kvmppc_get_gpr(vcpu, i);
 			vcpu->arch.osi_needed = 1;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6a5be025a8af..3a0e00178fa5 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -751,6 +751,7 @@  int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
 		vcpu->run->epr.epr = 0;
 		vcpu->arch.epr_needed = true;
 		vcpu->run->exit_reason = KVM_EXIT_EPR;
+		vcpu->run->flags |= KVM_RUN_NEEDS_COMPLETION;
 		r = 0;
 	}
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 88585c1de416..e2ec32a8970c 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -104,9 +104,10 @@  struct kvm_ioapic_state {
 #define KVM_IRQCHIP_IOAPIC       2
 #define KVM_NR_IRQCHIPS          3
 
-#define KVM_RUN_X86_SMM		 (1 << 0)
-#define KVM_RUN_X86_BUS_LOCK     (1 << 1)
-#define KVM_RUN_X86_GUEST_MODE   (1 << 2)
+#define KVM_RUN_X86_SMM			(1 << 0)
+#define KVM_RUN_X86_BUS_LOCK		(1 << 1)
+#define KVM_RUN_X86_GUEST_MODE		(1 << 2)
+#define KVM_RUN_X86_NEEDS_COMPLETION	(1 << 2)
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a8aa12e0911d..67034b089371 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10154,6 +10154,8 @@  static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 		kvm_run->flags |= KVM_RUN_X86_SMM;
 	if (is_guest_mode(vcpu))
 		kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
+	if (vcpu->arch.complete_userspace_io)
+		kvm_run->flags |= KVM_RUN_NEEDS_COMPLETION;
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 343de0a51797..91dbee3ae0bc 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -192,6 +192,8 @@  struct kvm_xen_exit {
 /* Flags that describe what fields in emulation_failure hold valid data. */
 #define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0)
 
+#define KVM_RUN_NEEDS_COMPLETION	(1 << 15)
+
 /*
  * struct kvm_run can be modified by userspace at any time, so KVM must be
  * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM()
@@ -933,6 +935,7 @@  struct kvm_enable_cap {
 #define KVM_CAP_PRE_FAULT_MEMORY 236
 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
 #define KVM_CAP_X86_GUEST_MODE 238
+#define KVM_CAP_NEEDS_COMPLETION 239
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7d2076439081..28aa89e5ba85 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4746,6 +4746,7 @@  static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_HALT_POLL:
+	case KVM_CAP_NEEDS_COMPLETION:
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO: