diff mbox series

[v2,for-8.2?] i386/sev: Avoid SEV-ES crash due to missing MSR_EFER_LMA bit

Message ID 20231205222816.1152720-1-michael.roth@amd.com (mailing list archive)
State New, archived
Headers show
Series [v2,for-8.2?] i386/sev: Avoid SEV-ES crash due to missing MSR_EFER_LMA bit | expand

Commit Message

Michael Roth Dec. 5, 2023, 10:28 p.m. UTC
Commit 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
added error checking for KVM_SET_SREGS/KVM_SET_SREGS2. In doing so, it
exposed a long-running bug in current KVM support for SEV-ES where the
kernel assumes that MSR_EFER_LMA will be set explicitly by the guest
kernel, in which case EFER write traps would result in KVM eventually
seeing MSR_EFER_LMA get set and recording it in such a way that it would
be subsequently visible when accessing it via KVM_GET_SREGS/etc.

However, guests kernels currently rely on MSR_EFER_LMA getting set
automatically when MSR_EFER_LME is set and paging is enabled via
CR0_PG_MASK. As a result, the EFER write traps don't actually expose the
MSR_EFER_LMA even though it is set internally, and when QEMU
subsequently tries to pass this EFER value back to KVM via
KVM_SET_SREGS* it will fail various sanity checks and return -EINVAL,
which is now considered fatal due to the aforementioned QEMU commit.

This can be addressed by inferring the MSR_EFER_LMA bit being set when
paging is enabled and MSR_EFER_LME is set, and synthesizing it to ensure
the expected bits are all present in subsequent handling on the host
side.

Ultimately, this handling will be implemented in the host kernel, but to
avoid breaking QEMU's SEV-ES support when using older host kernels, the
same handling can be done in QEMU just after fetching the register
values via KVM_GET_SREGS*. Implement that here.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
Cc: kvm@vger.kernel.org
Fixes: 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
v2:
  - Add handling for KVM_GET_SREGS, not just KVM_GET_SREGS2

 target/i386/kvm/kvm.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

Comments

Philippe Mathieu-Daudé Dec. 6, 2023, 11:48 a.m. UTC | #1
Hi Michael,

(Cc'ing Lara, Vitaly and Maxim)

On 5/12/23 23:28, Michael Roth wrote:
> Commit 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> added error checking for KVM_SET_SREGS/KVM_SET_SREGS2. In doing so, it
> exposed a long-running bug in current KVM support for SEV-ES where the
> kernel assumes that MSR_EFER_LMA will be set explicitly by the guest
> kernel, in which case EFER write traps would result in KVM eventually
> seeing MSR_EFER_LMA get set and recording it in such a way that it would
> be subsequently visible when accessing it via KVM_GET_SREGS/etc.
> 
> However, guests kernels currently rely on MSR_EFER_LMA getting set
> automatically when MSR_EFER_LME is set and paging is enabled via
> CR0_PG_MASK. As a result, the EFER write traps don't actually expose the
> MSR_EFER_LMA even though it is set internally, and when QEMU
> subsequently tries to pass this EFER value back to KVM via
> KVM_SET_SREGS* it will fail various sanity checks and return -EINVAL,
> which is now considered fatal due to the aforementioned QEMU commit.
> 
> This can be addressed by inferring the MSR_EFER_LMA bit being set when
> paging is enabled and MSR_EFER_LME is set, and synthesizing it to ensure
> the expected bits are all present in subsequent handling on the host
> side.
> 
> Ultimately, this handling will be implemented in the host kernel, but to
> avoid breaking QEMU's SEV-ES support when using older host kernels, the
> same handling can be done in QEMU just after fetching the register
> values via KVM_GET_SREGS*. Implement that here.
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
> Cc: kvm@vger.kernel.org
> Fixes: 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")

This 'Fixes:' tag is misleading, since as you mentioned this commit
only exposes the issue.

Commit d499f196fe ("target/i386: Added consistency checks for EFER")
or around it seems more appropriate.

Is this feature easily testable on our CI, on a x86 runner with KVM
access?

> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
> v2:
>    - Add handling for KVM_GET_SREGS, not just KVM_GET_SREGS2
> 
>   target/i386/kvm/kvm.c | 14 ++++++++++++++
>   1 file changed, 14 insertions(+)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 11b8177eff..8721c1bf8f 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -3610,6 +3610,7 @@ static int kvm_get_sregs(X86CPU *cpu)
>   {
>       CPUX86State *env = &cpu->env;
>       struct kvm_sregs sregs;
> +    target_ulong cr0_old;
>       int ret;
>   
>       ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
> @@ -3637,12 +3638,18 @@ static int kvm_get_sregs(X86CPU *cpu)
>       env->gdt.limit = sregs.gdt.limit;
>       env->gdt.base = sregs.gdt.base;
>   
> +    cr0_old = env->cr[0];
>       env->cr[0] = sregs.cr0;
>       env->cr[2] = sregs.cr2;
>       env->cr[3] = sregs.cr3;
>       env->cr[4] = sregs.cr4;
>   
>       env->efer = sregs.efer;
> +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> +            env->efer |= MSR_EFER_LMA;
> +        }
> +    }
>   
>       /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
>       x86_update_hflags(env);
> @@ -3654,6 +3661,7 @@ static int kvm_get_sregs2(X86CPU *cpu)
>   {
>       CPUX86State *env = &cpu->env;
>       struct kvm_sregs2 sregs;
> +    target_ulong cr0_old;
>       int i, ret;
>   
>       ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
> @@ -3676,12 +3684,18 @@ static int kvm_get_sregs2(X86CPU *cpu)
>       env->gdt.limit = sregs.gdt.limit;
>       env->gdt.base = sregs.gdt.base;
>   
> +    cr0_old = env->cr[0];
>       env->cr[0] = sregs.cr0;
>       env->cr[2] = sregs.cr2;
>       env->cr[3] = sregs.cr3;
>       env->cr[4] = sregs.cr4;
>   
>       env->efer = sregs.efer;
> +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> +            env->efer |= MSR_EFER_LMA;
> +        }
> +    }
>   
>       env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
>
Michael Roth Dec. 6, 2023, 1:12 p.m. UTC | #2
On Wed, Dec 06, 2023 at 12:48:35PM +0100, Philippe Mathieu-Daudé wrote:
> Hi Michael,
> 
> (Cc'ing Lara, Vitaly and Maxim)
> 
> On 5/12/23 23:28, Michael Roth wrote:
> > Commit 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> > added error checking for KVM_SET_SREGS/KVM_SET_SREGS2. In doing so, it
> > exposed a long-running bug in current KVM support for SEV-ES where the
> > kernel assumes that MSR_EFER_LMA will be set explicitly by the guest
> > kernel, in which case EFER write traps would result in KVM eventually
> > seeing MSR_EFER_LMA get set and recording it in such a way that it would
> > be subsequently visible when accessing it via KVM_GET_SREGS/etc.
> > 
> > However, guests kernels currently rely on MSR_EFER_LMA getting set
> > automatically when MSR_EFER_LME is set and paging is enabled via
> > CR0_PG_MASK. As a result, the EFER write traps don't actually expose the
> > MSR_EFER_LMA even though it is set internally, and when QEMU
> > subsequently tries to pass this EFER value back to KVM via
> > KVM_SET_SREGS* it will fail various sanity checks and return -EINVAL,
> > which is now considered fatal due to the aforementioned QEMU commit.
> > 
> > This can be addressed by inferring the MSR_EFER_LMA bit being set when
> > paging is enabled and MSR_EFER_LME is set, and synthesizing it to ensure
> > the expected bits are all present in subsequent handling on the host
> > side.
> > 
> > Ultimately, this handling will be implemented in the host kernel, but to
> > avoid breaking QEMU's SEV-ES support when using older host kernels, the
> > same handling can be done in QEMU just after fetching the register
> > values via KVM_GET_SREGS*. Implement that here.
> > 
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
> > Cc: kvm@vger.kernel.org
> > Fixes: 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> 

Hi Philippe,

> This 'Fixes:' tag is misleading, since as you mentioned this commit
> only exposes the issue.

That's true, a "Workaround-for: " tag or something like that might be more
appropriate. I just wanted to make it clear that SEV-ES support is no longer
working with that patch applied, so I used Fixes: and elaborated on the
commit message. I can change it if there's a better way to convey this
though.

> 
> Commit d499f196fe ("target/i386: Added consistency checks for EFER")
> or around it seems more appropriate.

Those checks seem to be more for TCG. The actual bug is in the host
kernel, and it seems to have been there basically since the original
SEV-ES host support went in in 2020. I've also sent a patch to address
this in KVM:

  https://lore.kernel.org/lkml/20231205234956.1156210-1-michael.roth@amd.com/T/#u

but in the meantime it means that QEMU 8.2+ SEV-ES support would no
longer work for any current/older host kernels, so I'm hoping a targeted
workaround is warranted to cover that gap.

> 
> Is this feature easily testable on our CI, on a x86 runner with KVM
> access?

SEV-ES support was introduced with EPYC Zen2 architecture (EPYC 7002
series processors, aka "Rome"). If there are any systems in the test pool
that are Zen2 or greater, then a simple boot of a SEV-ES linux guest would
be enough to trigger the QEMU crash. I'm not sure if there are any systems
of that sort in the pool though.

Thanks,

Mike
Paolo Bonzini Dec. 6, 2023, 1:41 p.m. UTC | #3
On Tue, Dec 5, 2023 at 11:28 PM Michael Roth <michael.roth@amd.com> wrote:
> @@ -3637,12 +3638,18 @@ static int kvm_get_sregs(X86CPU *cpu)
>      env->gdt.limit = sregs.gdt.limit;
>      env->gdt.base = sregs.gdt.base;
>
> +    cr0_old = env->cr[0];
>      env->cr[0] = sregs.cr0;
>      env->cr[2] = sregs.cr2;
>      env->cr[3] = sregs.cr3;
>      env->cr[4] = sregs.cr4;
>
>      env->efer = sregs.efer;
> +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> +            env->efer |= MSR_EFER_LMA;
> +        }
> +    }

There is no need to check cr0_old or sev_es_enabled(); EFER.LMA is
simply EFER.LME && CR0.PG.

Alternatively, sev_es_enabled() could be an assertion, that is:

    if ((env->efer & MSR_EFER_LME) && (env->cr[0] & CR0_PG_MASK) &&
       !(env->efer & MSR_EFER_LMA)) {
        /* Workaround for... */
        assert(sev_es_enabled());
        env->efer |= MSR_EFER_LMA;
    }

What do you think?

Thanks,

Paolo

>      /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
>      x86_update_hflags(env);
> @@ -3654,6 +3661,7 @@ static int kvm_get_sregs2(X86CPU *cpu)
>  {
>      CPUX86State *env = &cpu->env;
>      struct kvm_sregs2 sregs;
> +    target_ulong cr0_old;
>      int i, ret;
>
>      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
> @@ -3676,12 +3684,18 @@ static int kvm_get_sregs2(X86CPU *cpu)
>      env->gdt.limit = sregs.gdt.limit;
>      env->gdt.base = sregs.gdt.base;
>
> +    cr0_old = env->cr[0];
>      env->cr[0] = sregs.cr0;
>      env->cr[2] = sregs.cr2;
>      env->cr[3] = sregs.cr3;
>      env->cr[4] = sregs.cr4;
>
>      env->efer = sregs.efer;
> +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> +            env->efer |= MSR_EFER_LMA;
> +        }
> +    }
>
>      env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
>
> --
> 2.25.1
>
Paolo Bonzini Dec. 6, 2023, 1:43 p.m. UTC | #4
On Wed, Dec 6, 2023 at 2:13 PM Michael Roth <michael.roth@amd.com> wrote:
> > This 'Fixes:' tag is misleading, since as you mentioned this commit
> > only exposes the issue.
>
> That's true, a "Workaround-for: " tag or something like that might be more
> appropriate. I just wanted to make it clear that SEV-ES support is no longer
> working with that patch applied, so I used Fixes: and elaborated on the
> commit message. I can change it if there's a better way to convey this
> though.

That's fine, Fixes is also for automated checks, like "if you have
this commit you also want this one".

> >
> > Commit d499f196fe ("target/i386: Added consistency checks for EFER")
> > or around it seems more appropriate.
>
> Those checks seem to be more for TCG.

Yes, that's 100% TCG code.

> The actual bug is in the host
> kernel, and it seems to have been there basically since the original
> SEV-ES host support went in in 2020. I've also sent a patch to address
> this in KVM:
>
>   https://lore.kernel.org/lkml/20231205234956.1156210-1-michael.roth@amd.com/T/#u

Thanks, looking at it.

Paolo
Michael Roth Dec. 6, 2023, 2:46 p.m. UTC | #5
On Wed, Dec 06, 2023 at 02:41:13PM +0100, Paolo Bonzini wrote:
> On Tue, Dec 5, 2023 at 11:28 PM Michael Roth <michael.roth@amd.com> wrote:
> > @@ -3637,12 +3638,18 @@ static int kvm_get_sregs(X86CPU *cpu)
> >      env->gdt.limit = sregs.gdt.limit;
> >      env->gdt.base = sregs.gdt.base;
> >
> > +    cr0_old = env->cr[0];
> >      env->cr[0] = sregs.cr0;
> >      env->cr[2] = sregs.cr2;
> >      env->cr[3] = sregs.cr3;
> >      env->cr[4] = sregs.cr4;
> >
> >      env->efer = sregs.efer;
> > +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> > +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> > +            env->efer |= MSR_EFER_LMA;
> > +        }
> > +    }
> 
> There is no need to check cr0_old or sev_es_enabled(); EFER.LMA is
> simply EFER.LME && CR0.PG.

Yah, I originally had it like that, but svm_set_cr0() in the kernel only
sets it in vcpu->arch.efer it when setting CR0.PG, so I thought it might
be safer to be more selective and mirror that handling on the QEMU side
so we can leave as much of any other sanity checks on kernel/QEMU side
intact as possible. E.g., if some other bug in the kernel ends up
unsetting EFER.LMA while paging is still enabled, we'd still notice that
when passing it back in via KVM_SET_SREGS*.

But agree it's simpler to just always set it based on CR0.PG and EFER.LMA
and can send a v3 if that's preferred.

> 
> Alternatively, sev_es_enabled() could be an assertion, that is:
> 
>     if ((env->efer & MSR_EFER_LME) && (env->cr[0] & CR0_PG_MASK) &&
>        !(env->efer & MSR_EFER_LMA)) {
>         /* Workaround for... */
>         assert(sev_es_enabled());
>         env->efer |= MSR_EFER_LMA;
>     }
> 
> What do you think?

I'm a little apprehensive about this approach for similar reasons as
above. The current patch is guaranteed to only affect SEV-ES, whereas
this approach could trigger assertions for other edge-cases we aren't
aware of that could further impact the release. For instance, "in
theory", QEMU or KVM might have some handling where EFER.LMA is set
somewhere after (or outside of) KVM_GET_SREGS, but now with this
proposed change QEMU would become more restrictive and generate an
assert for those cases.

I don't think that's actually the case, but in the off-chance that such
a case exists there could be more fall-out such as further delays, or
the need for a stable fix. But no strong opinion there either if that
ends up being the preferred approach, just trying to consider the
pros/cons.

Thanks,

Mike

> 
> Thanks,
> 
> Paolo
> 
> >      /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
> >      x86_update_hflags(env);
> > @@ -3654,6 +3661,7 @@ static int kvm_get_sregs2(X86CPU *cpu)
> >  {
> >      CPUX86State *env = &cpu->env;
> >      struct kvm_sregs2 sregs;
> > +    target_ulong cr0_old;
> >      int i, ret;
> >
> >      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
> > @@ -3676,12 +3684,18 @@ static int kvm_get_sregs2(X86CPU *cpu)
> >      env->gdt.limit = sregs.gdt.limit;
> >      env->gdt.base = sregs.gdt.base;
> >
> > +    cr0_old = env->cr[0];
> >      env->cr[0] = sregs.cr0;
> >      env->cr[2] = sregs.cr2;
> >      env->cr[3] = sregs.cr3;
> >      env->cr[4] = sregs.cr4;
> >
> >      env->efer = sregs.efer;
> > +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> > +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> > +            env->efer |= MSR_EFER_LMA;
> > +        }
> > +    }
> >
> >      env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
> >
> > --
> > 2.25.1
> >
>
Paolo Bonzini Dec. 6, 2023, 3:04 p.m. UTC | #6
On Wed, Dec 6, 2023 at 3:46 PM Michael Roth <michael.roth@amd.com> wrote:
> > There is no need to check cr0_old or sev_es_enabled(); EFER.LMA is
> > simply EFER.LME && CR0.PG.
>
> Yah, I originally had it like that, but svm_set_cr0() in the kernel only
> sets it in vcpu->arch.efer it when setting CR0.PG, so I thought it might
> be safer to be more selective and mirror that handling on the QEMU side
> so we can leave as much of any other sanity checks on kernel/QEMU side
> intact as possible. E.g., if some other bug in the kernel ends up
> unsetting EFER.LMA while paging is still enabled, we'd still notice that
> when passing it back in via KVM_SET_SREGS*.
>
> But agree it's simpler to just always set it based on CR0.PG and EFER.LMA
> and can send a v3 if that's preferred.

Yeah, in this case I think the chance of something breaking is really,
really small.

The behavior of svm_set_cr0() is more due to how the surrounding code
looks like, than anything else.

> > Alternatively, sev_es_enabled() could be an assertion, that is:
> >
> >     if ((env->efer & MSR_EFER_LME) && (env->cr[0] & CR0_PG_MASK) &&
> >        !(env->efer & MSR_EFER_LMA)) {
> >         /* Workaround for... */
> >         assert(sev_es_enabled());
> >         env->efer |= MSR_EFER_LMA;
> >     }
> >
> > What do you think?
>
> I'm a little apprehensive about this approach for similar reasons as
> above

I agree on this. I think it's worth in general to have clear
expectations, though. If you think it's worrisome, we can commit it
without assertion now and add it in 9.0.

Paolo
Michael Roth Dec. 6, 2023, 3:15 p.m. UTC | #7
On Wed, Dec 06, 2023 at 04:04:43PM +0100, Paolo Bonzini wrote:
> On Wed, Dec 6, 2023 at 3:46 PM Michael Roth <michael.roth@amd.com> wrote:
> > > There is no need to check cr0_old or sev_es_enabled(); EFER.LMA is
> > > simply EFER.LME && CR0.PG.
> >
> > Yah, I originally had it like that, but svm_set_cr0() in the kernel only
> > sets it in vcpu->arch.efer it when setting CR0.PG, so I thought it might
> > be safer to be more selective and mirror that handling on the QEMU side
> > so we can leave as much of any other sanity checks on kernel/QEMU side
> > intact as possible. E.g., if some other bug in the kernel ends up
> > unsetting EFER.LMA while paging is still enabled, we'd still notice that
> > when passing it back in via KVM_SET_SREGS*.
> >
> > But agree it's simpler to just always set it based on CR0.PG and EFER.LMA
> > and can send a v3 if that's preferred.
> 
> Yeah, in this case I think the chance of something breaking is really,
> really small.
> 
> The behavior of svm_set_cr0() is more due to how the surrounding code
> looks like, than anything else.

Ok, seems reasonable. I'll plan to send a v3 with the old_cr0 stuff
dropped.

> 
> > > Alternatively, sev_es_enabled() could be an assertion, that is:
> > >
> > >     if ((env->efer & MSR_EFER_LME) && (env->cr[0] & CR0_PG_MASK) &&
> > >        !(env->efer & MSR_EFER_LMA)) {
> > >         /* Workaround for... */
> > >         assert(sev_es_enabled());
> > >         env->efer |= MSR_EFER_LMA;
> > >     }
> > >
> > > What do you think?
> >
> > I'm a little apprehensive about this approach for similar reasons as
> > above
> 
> I agree on this. I think it's worth in general to have clear
> expectations, though. If you think it's worrisome, we can commit it
> without assertion now and add it in 9.0.

I think that seems like a good approach. That would give us more time to
discuss the fix/handling on the kernel side, and then as a follow-up we
can tighten down the QEMU handling/expectations based on that.

Thanks,

Mike

> 
> Paolo
> 
>
Maxim Levitsky Dec. 6, 2023, 5:20 p.m. UTC | #8
On Tue, 2023-12-05 at 16:28 -0600, Michael Roth wrote:
> Commit 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> added error checking for KVM_SET_SREGS/KVM_SET_SREGS2. In doing so, it
> exposed a long-running bug in current KVM support for SEV-ES where the
> kernel assumes that MSR_EFER_LMA will be set explicitly by the guest
> kernel, in which case EFER write traps would result in KVM eventually
> seeing MSR_EFER_LMA get set and recording it in such a way that it would
> be subsequently visible when accessing it via KVM_GET_SREGS/etc.
> 
> However, guests kernels currently rely on MSR_EFER_LMA getting set
> automatically when MSR_EFER_LME is set and paging is enabled via
> CR0_PG_MASK. As a result, the EFER write traps don't actually expose the
> MSR_EFER_LMA even though it is set internally, and when QEMU
> subsequently tries to pass this EFER value back to KVM via
> KVM_SET_SREGS* it will fail various sanity checks and return -EINVAL,
> which is now considered fatal due to the aforementioned QEMU commit.
> 
> This can be addressed by inferring the MSR_EFER_LMA bit being set when
> paging is enabled and MSR_EFER_LME is set, and synthesizing it to ensure
> the expected bits are all present in subsequent handling on the host
> side.
> 
> Ultimately, this handling will be implemented in the host kernel, but to
> avoid breaking QEMU's SEV-ES support when using older host kernels, the
> same handling can be done in QEMU just after fetching the register
> values via KVM_GET_SREGS*. Implement that here.
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
> Cc: kvm@vger.kernel.org
> Fixes: 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
> v2:
>   - Add handling for KVM_GET_SREGS, not just KVM_GET_SREGS2
> 
>  target/i386/kvm/kvm.c | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 11b8177eff..8721c1bf8f 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -3610,6 +3610,7 @@ static int kvm_get_sregs(X86CPU *cpu)
>  {
>      CPUX86State *env = &cpu->env;
>      struct kvm_sregs sregs;
> +    target_ulong cr0_old;
>      int ret;
>  
>      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
> @@ -3637,12 +3638,18 @@ static int kvm_get_sregs(X86CPU *cpu)
>      env->gdt.limit = sregs.gdt.limit;
>      env->gdt.base = sregs.gdt.base;
>  
> +    cr0_old = env->cr[0];
>      env->cr[0] = sregs.cr0;
>      env->cr[2] = sregs.cr2;
>      env->cr[3] = sregs.cr3;
>      env->cr[4] = sregs.cr4;
>  
>      env->efer = sregs.efer;
> +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> +            env->efer |= MSR_EFER_LMA;
> +        }
> +    }

I think that we should not check that CR0_PG has changed, and just blindly assume
that if EFER.LME is set and CR0.PG is set, then EFER.LMA must be set as defined in x86 spec.

Otherwise, suppose qemu calls kvm_get_sregs twice: First time it will work,
but second time CR0.PG will match one that is stored in the env, and thus the workaround
will not be executed, and instead we will revert back to wrong EFER value 
reported by the kernel.

How about something like that:


if (sev_es_enabled() && env->efer & MSR_EFER_LME && env->cr[0] & CR0_PG_MASK) {
	/* 
         * Workaround KVM bug, because of which KVM might not be aware of the 
         * fact that EFER.LMA was toggled by the hardware 
         */
	env->efer |= MSR_EFER_LMA;
}


Best regards,
	Maxim Levitsky

>  
>      /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
>      x86_update_hflags(env);
> @@ -3654,6 +3661,7 @@ static int kvm_get_sregs2(X86CPU *cpu)
>  {
>      CPUX86State *env = &cpu->env;
>      struct kvm_sregs2 sregs;
> +    target_ulong cr0_old;
>      int i, ret;
>  
>      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
> @@ -3676,12 +3684,18 @@ static int kvm_get_sregs2(X86CPU *cpu)
>      env->gdt.limit = sregs.gdt.limit;
>      env->gdt.base = sregs.gdt.base;
>  
> +    cr0_old = env->cr[0];
>      env->cr[0] = sregs.cr0;
>      env->cr[2] = sregs.cr2;
>      env->cr[3] = sregs.cr3;
>      env->cr[4] = sregs.cr4;
>  
>      env->efer = sregs.efer;
> +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> +            env->efer |= MSR_EFER_LMA;
> +        }
> +    }
>  
>      env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
>
Michael Roth Dec. 6, 2023, 5:42 p.m. UTC | #9
On Wed, Dec 06, 2023 at 07:20:14PM +0200, Maxim Levitsky wrote:
> On Tue, 2023-12-05 at 16:28 -0600, Michael Roth wrote:
> > Commit 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> > added error checking for KVM_SET_SREGS/KVM_SET_SREGS2. In doing so, it
> > exposed a long-running bug in current KVM support for SEV-ES where the
> > kernel assumes that MSR_EFER_LMA will be set explicitly by the guest
> > kernel, in which case EFER write traps would result in KVM eventually
> > seeing MSR_EFER_LMA get set and recording it in such a way that it would
> > be subsequently visible when accessing it via KVM_GET_SREGS/etc.
> > 
> > However, guests kernels currently rely on MSR_EFER_LMA getting set
> > automatically when MSR_EFER_LME is set and paging is enabled via
> > CR0_PG_MASK. As a result, the EFER write traps don't actually expose the
> > MSR_EFER_LMA even though it is set internally, and when QEMU
> > subsequently tries to pass this EFER value back to KVM via
> > KVM_SET_SREGS* it will fail various sanity checks and return -EINVAL,
> > which is now considered fatal due to the aforementioned QEMU commit.
> > 
> > This can be addressed by inferring the MSR_EFER_LMA bit being set when
> > paging is enabled and MSR_EFER_LME is set, and synthesizing it to ensure
> > the expected bits are all present in subsequent handling on the host
> > side.
> > 
> > Ultimately, this handling will be implemented in the host kernel, but to
> > avoid breaking QEMU's SEV-ES support when using older host kernels, the
> > same handling can be done in QEMU just after fetching the register
> > values via KVM_GET_SREGS*. Implement that here.
> > 
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
> > Cc: kvm@vger.kernel.org
> > Fixes: 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > ---
> > v2:
> >   - Add handling for KVM_GET_SREGS, not just KVM_GET_SREGS2
> > 
> >  target/i386/kvm/kvm.c | 14 ++++++++++++++
> >  1 file changed, 14 insertions(+)
> > 
> > diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> > index 11b8177eff..8721c1bf8f 100644
> > --- a/target/i386/kvm/kvm.c
> > +++ b/target/i386/kvm/kvm.c
> > @@ -3610,6 +3610,7 @@ static int kvm_get_sregs(X86CPU *cpu)
> >  {
> >      CPUX86State *env = &cpu->env;
> >      struct kvm_sregs sregs;
> > +    target_ulong cr0_old;
> >      int ret;
> >  
> >      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
> > @@ -3637,12 +3638,18 @@ static int kvm_get_sregs(X86CPU *cpu)
> >      env->gdt.limit = sregs.gdt.limit;
> >      env->gdt.base = sregs.gdt.base;
> >  
> > +    cr0_old = env->cr[0];
> >      env->cr[0] = sregs.cr0;
> >      env->cr[2] = sregs.cr2;
> >      env->cr[3] = sregs.cr3;
> >      env->cr[4] = sregs.cr4;
> >  
> >      env->efer = sregs.efer;
> > +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> > +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> > +            env->efer |= MSR_EFER_LMA;
> > +        }
> > +    }
> 
> I think that we should not check that CR0_PG has changed, and just blindly assume
> that if EFER.LME is set and CR0.PG is set, then EFER.LMA must be set as defined in x86 spec.
> 
> Otherwise, suppose qemu calls kvm_get_sregs twice: First time it will work,
> but second time CR0.PG will match one that is stored in the env, and thus the workaround
> will not be executed, and instead we will revert back to wrong EFER value 
> reported by the kernel.
> 
> How about something like that:
> 
> 
> if (sev_es_enabled() && env->efer & MSR_EFER_LME && env->cr[0] & CR0_PG_MASK) {
> 	/* 
>          * Workaround KVM bug, because of which KVM might not be aware of the 
>          * fact that EFER.LMA was toggled by the hardware 
>          */
> 	env->efer |= MSR_EFER_LMA;
> }

Hi Maxim,

I'd already sent a v3 based on a similar suggestion from Paolo:

  https://lists.gnu.org/archive/html/qemu-devel/2023-12/msg00751.html

Does that one look okay to you?

Thanks,

Mike

> 
> 
> Best regards,
> 	Maxim Levitsky
> 
> >  
> >      /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
> >      x86_update_hflags(env);
> > @@ -3654,6 +3661,7 @@ static int kvm_get_sregs2(X86CPU *cpu)
> >  {
> >      CPUX86State *env = &cpu->env;
> >      struct kvm_sregs2 sregs;
> > +    target_ulong cr0_old;
> >      int i, ret;
> >  
> >      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
> > @@ -3676,12 +3684,18 @@ static int kvm_get_sregs2(X86CPU *cpu)
> >      env->gdt.limit = sregs.gdt.limit;
> >      env->gdt.base = sregs.gdt.base;
> >  
> > +    cr0_old = env->cr[0];
> >      env->cr[0] = sregs.cr0;
> >      env->cr[2] = sregs.cr2;
> >      env->cr[3] = sregs.cr3;
> >      env->cr[4] = sregs.cr4;
> >  
> >      env->efer = sregs.efer;
> > +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> > +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> > +            env->efer |= MSR_EFER_LMA;
> > +        }
> > +    }
> >  
> >      env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
> >  
> 
>
Maxim Levitsky Dec. 8, 2023, 3:20 p.m. UTC | #10
On Wed, 2023-12-06 at 11:42 -0600, Michael Roth wrote:
> On Wed, Dec 06, 2023 at 07:20:14PM +0200, Maxim Levitsky wrote:
> > On Tue, 2023-12-05 at 16:28 -0600, Michael Roth wrote:
> > > Commit 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> > > added error checking for KVM_SET_SREGS/KVM_SET_SREGS2. In doing so, it
> > > exposed a long-running bug in current KVM support for SEV-ES where the
> > > kernel assumes that MSR_EFER_LMA will be set explicitly by the guest
> > > kernel, in which case EFER write traps would result in KVM eventually
> > > seeing MSR_EFER_LMA get set and recording it in such a way that it would
> > > be subsequently visible when accessing it via KVM_GET_SREGS/etc.
> > > 
> > > However, guests kernels currently rely on MSR_EFER_LMA getting set
> > > automatically when MSR_EFER_LME is set and paging is enabled via
> > > CR0_PG_MASK. As a result, the EFER write traps don't actually expose the
> > > MSR_EFER_LMA even though it is set internally, and when QEMU
> > > subsequently tries to pass this EFER value back to KVM via
> > > KVM_SET_SREGS* it will fail various sanity checks and return -EINVAL,
> > > which is now considered fatal due to the aforementioned QEMU commit.
> > > 
> > > This can be addressed by inferring the MSR_EFER_LMA bit being set when
> > > paging is enabled and MSR_EFER_LME is set, and synthesizing it to ensure
> > > the expected bits are all present in subsequent handling on the host
> > > side.
> > > 
> > > Ultimately, this handling will be implemented in the host kernel, but to
> > > avoid breaking QEMU's SEV-ES support when using older host kernels, the
> > > same handling can be done in QEMU just after fetching the register
> > > values via KVM_GET_SREGS*. Implement that here.
> > > 
> > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > > Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
> > > Cc: kvm@vger.kernel.org
> > > Fixes: 7191f24c7fcf ("accel/kvm/kvm-all: Handle register access errors")
> > > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > > ---
> > > v2:
> > >   - Add handling for KVM_GET_SREGS, not just KVM_GET_SREGS2
> > > 
> > >  target/i386/kvm/kvm.c | 14 ++++++++++++++
> > >  1 file changed, 14 insertions(+)
> > > 
> > > diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> > > index 11b8177eff..8721c1bf8f 100644
> > > --- a/target/i386/kvm/kvm.c
> > > +++ b/target/i386/kvm/kvm.c
> > > @@ -3610,6 +3610,7 @@ static int kvm_get_sregs(X86CPU *cpu)
> > >  {
> > >      CPUX86State *env = &cpu->env;
> > >      struct kvm_sregs sregs;
> > > +    target_ulong cr0_old;
> > >      int ret;
> > >  
> > >      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
> > > @@ -3637,12 +3638,18 @@ static int kvm_get_sregs(X86CPU *cpu)
> > >      env->gdt.limit = sregs.gdt.limit;
> > >      env->gdt.base = sregs.gdt.base;
> > >  
> > > +    cr0_old = env->cr[0];
> > >      env->cr[0] = sregs.cr0;
> > >      env->cr[2] = sregs.cr2;
> > >      env->cr[3] = sregs.cr3;
> > >      env->cr[4] = sregs.cr4;
> > >  
> > >      env->efer = sregs.efer;
> > > +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> > > +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> > > +            env->efer |= MSR_EFER_LMA;
> > > +        }
> > > +    }
> > 
> > I think that we should not check that CR0_PG has changed, and just blindly assume
> > that if EFER.LME is set and CR0.PG is set, then EFER.LMA must be set as defined in x86 spec.
> > 
> > Otherwise, suppose qemu calls kvm_get_sregs twice: First time it will work,
> > but second time CR0.PG will match one that is stored in the env, and thus the workaround
> > will not be executed, and instead we will revert back to wrong EFER value 
> > reported by the kernel.
> > 
> > How about something like that:
> > 
> > 
> > if (sev_es_enabled() && env->efer & MSR_EFER_LME && env->cr[0] & CR0_PG_MASK) {
> > 	/* 
> >          * Workaround KVM bug, because of which KVM might not be aware of the 
> >          * fact that EFER.LMA was toggled by the hardware 
> >          */
> > 	env->efer |= MSR_EFER_LMA;
> > }
> 
> Hi Maxim,
> 
> I'd already sent a v3 based on a similar suggestion from Paolo:
> 
>   https://lists.gnu.org/archive/html/qemu-devel/2023-12/msg00751.html
> 
> Does that one look okay to you?

Yep, thanks!

Best regards,
	Maxim Levitsky
> 
> Thanks,
> 
> Mike
> 
> > 
> > Best regards,
> > 	Maxim Levitsky
> > 
> > >  
> > >      /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
> > >      x86_update_hflags(env);
> > > @@ -3654,6 +3661,7 @@ static int kvm_get_sregs2(X86CPU *cpu)
> > >  {
> > >      CPUX86State *env = &cpu->env;
> > >      struct kvm_sregs2 sregs;
> > > +    target_ulong cr0_old;
> > >      int i, ret;
> > >  
> > >      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
> > > @@ -3676,12 +3684,18 @@ static int kvm_get_sregs2(X86CPU *cpu)
> > >      env->gdt.limit = sregs.gdt.limit;
> > >      env->gdt.base = sregs.gdt.base;
> > >  
> > > +    cr0_old = env->cr[0];
> > >      env->cr[0] = sregs.cr0;
> > >      env->cr[2] = sregs.cr2;
> > >      env->cr[3] = sregs.cr3;
> > >      env->cr[4] = sregs.cr4;
> > >  
> > >      env->efer = sregs.efer;
> > > +    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
> > > +        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
> > > +            env->efer |= MSR_EFER_LMA;
> > > +        }
> > > +    }
> > >  
> > >      env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
> > >
diff mbox series

Patch

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 11b8177eff..8721c1bf8f 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -3610,6 +3610,7 @@  static int kvm_get_sregs(X86CPU *cpu)
 {
     CPUX86State *env = &cpu->env;
     struct kvm_sregs sregs;
+    target_ulong cr0_old;
     int ret;
 
     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
@@ -3637,12 +3638,18 @@  static int kvm_get_sregs(X86CPU *cpu)
     env->gdt.limit = sregs.gdt.limit;
     env->gdt.base = sregs.gdt.base;
 
+    cr0_old = env->cr[0];
     env->cr[0] = sregs.cr0;
     env->cr[2] = sregs.cr2;
     env->cr[3] = sregs.cr3;
     env->cr[4] = sregs.cr4;
 
     env->efer = sregs.efer;
+    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
+        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
+            env->efer |= MSR_EFER_LMA;
+        }
+    }
 
     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
     x86_update_hflags(env);
@@ -3654,6 +3661,7 @@  static int kvm_get_sregs2(X86CPU *cpu)
 {
     CPUX86State *env = &cpu->env;
     struct kvm_sregs2 sregs;
+    target_ulong cr0_old;
     int i, ret;
 
     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
@@ -3676,12 +3684,18 @@  static int kvm_get_sregs2(X86CPU *cpu)
     env->gdt.limit = sregs.gdt.limit;
     env->gdt.base = sregs.gdt.base;
 
+    cr0_old = env->cr[0];
     env->cr[0] = sregs.cr0;
     env->cr[2] = sregs.cr2;
     env->cr[3] = sregs.cr3;
     env->cr[4] = sregs.cr4;
 
     env->efer = sregs.efer;
+    if (sev_es_enabled() && env->efer & MSR_EFER_LME) {
+        if (!(cr0_old & CR0_PG_MASK) && env->cr[0] & CR0_PG_MASK) {
+            env->efer |= MSR_EFER_LMA;
+        }
+    }
 
     env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;