
[v19,059/130] KVM: x86/tdp_mmu: Don't zap private pages for unsupported cases

Message ID 1ed955a44cd81738b498fe52823766622d8ad57f.1708933498.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro

Commit Message

Isaku Yamahata Feb. 26, 2024, 8:26 a.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

TDX architecturally supports only the write-back (WB) memory type for
private memory, so a (virtualized) memory type change doesn't make sense
for private memory.  Also, page migration isn't currently supported for
TDX. (TDX architecturally supports page migration; it's a KVM and kernel
implementation issue.)

Regarding memory type changes (MTRR virtualization and LAPIC page mapping
changes), pages are zapped by kvm_zap_gfn_range().  On the next KVM page
fault, the SPTE entry with the new memory type for the page is populated.
Regarding page migration, pages are zapped by the MMU notifier. On the next
KVM page fault, the newly migrated page is populated.  Don't zap private
pages on unmapping for those two cases.

When deleting/moving a KVM memory slot, zap private pages; this is
typically the VM being torn down.  Don't invalidate private page tables,
i.e. zap only leaf SPTEs for a KVM MMU that has a shared bit mask. The
existing kvm_tdp_mmu_invalidate_all_roots() depends on role.invalid with
the read lock of mmu_lock so that other vCPUs can operate on the KVM MMU
concurrently.  It marks the root page table invalid and zaps SPTEs of the
root page tables. The TDX module doesn't allow unlinking a protected root
page table from the hardware and then allocating a new one for it, i.e.
replacing a protected root page table.  Instead, zap only leaf SPTEs for a
KVM MMU with a shared bit mask set.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/mmu/mmu.c     | 61 ++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu/tdp_mmu.c | 37 +++++++++++++++++++----
 arch/x86/kvm/mmu/tdp_mmu.h |  5 ++--
 3 files changed, 92 insertions(+), 11 deletions(-)

Comments

Edgecombe, Rick P March 18, 2024, 11:46 p.m. UTC | #1
On Mon, 2024-02-26 at 00:26 -0800, isaku.yamahata@intel.com wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> TDX supports only write-back(WB) memory type for private memory
> architecturally so that (virtualized) memory type change doesn't make
> sense
> for private memory.  Also currently, page migration isn't supported
> for TDX
> yet. (TDX architecturally supports page migration. it's KVM and
> kernel
> implementation issue.)
> 
> Regarding memory type change (mtrr virtualization and lapic page
> mapping
> change), pages are zapped by kvm_zap_gfn_range().  On the next KVM
> page
> fault, the SPTE entry with a new memory type for the page is
> populated.
> Regarding page migration, pages are zapped by the mmu notifier. On
> the next
> KVM page fault, the new migrated page is populated.  Don't zap
> private
> pages on unmapping for those two cases.

Is the migration case relevant to TDX?

> 
> When deleting/moving a KVM memory slot, zap private pages. Typically
> tearing down VM.  Don't invalidate private page tables. i.e. zap only
> leaf
> SPTEs for KVM mmu that has a shared bit mask. The existing
> kvm_tdp_mmu_invalidate_all_roots() depends on role.invalid with read-
> lock
> of mmu_lock so that other vcpu can operate on KVM mmu concurrently. 
> It
> marks the root page table invalid and zaps SPTEs of the root page
> tables. The TDX module doesn't allow to unlink a protected root page
> table
> from the hardware and then allocate a new one for it. i.e. replacing
> a
> protected root page table.  Instead, zap only leaf SPTEs for KVM mmu
> with a
> shared bit mask set.

I get the part about only zapping leafs and not the root and mid-level
PTEs. But why the MTRR, lapic page and migration part? Why should those
not be zapped? Why is migration a consideration when it is not
supported?

> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/kvm/mmu/mmu.c     | 61
> ++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/mmu/tdp_mmu.c | 37 +++++++++++++++++++----
>  arch/x86/kvm/mmu/tdp_mmu.h |  5 ++--
>  3 files changed, 92 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0d6d4506ec97..30c86e858ae4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6339,7 +6339,7 @@ static void kvm_mmu_zap_all_fast(struct kvm
> *kvm)
>          * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock
> and yield.
>          */
>         if (tdp_mmu_enabled)
> -               kvm_tdp_mmu_invalidate_all_roots(kvm);
> +               kvm_tdp_mmu_invalidate_all_roots(kvm, true);
>  
>         /*
>          * Notify all vcpus to reload its shadow page table and flush
> TLB.
> @@ -6459,7 +6459,16 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t
> gfn_start, gfn_t gfn_end)
>         flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
>  
>         if (tdp_mmu_enabled)
> -               flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start,
> gfn_end, flush);
> +               /*
> +                * zap_private = false. Zap only shared pages.
> +                *
> +                * kvm_zap_gfn_range() is used when MTRR or PAT
> memory
> +                * type was changed.  Later on the next kvm page
> fault,
> +                * populate it with updated spte entry.
> +                * Because only WB is supported for private pages,
> don't
> +                * care of private pages.
> +                */
> +               flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start,
> gfn_end, flush, false);
>  
>         if (flush)
>                 kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end -
> gfn_start);
> @@ -6905,10 +6914,56 @@ void kvm_arch_flush_shadow_all(struct kvm
> *kvm)
>         kvm_mmu_zap_all(kvm);
>  }
>  
> +static void kvm_mmu_zap_memslot(struct kvm *kvm, struct
> kvm_memory_slot *slot)

What about kvm_mmu_zap_memslot_leafs() instead?

> +{
> +       bool flush = false;

It doesn't need to be initialized if false is passed directly into
kvm_tdp_mmu_unmap_gfn_range(). It would make the code easier to
understand.
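
For illustration, a minimal sketch of that simplification (identifiers
from the quoted patch; the range initializer is elided here):

	struct kvm_gfn_range range = { /* filled in as in the quoted hunk */ };
	bool flush;

	/* Pass false as the initial flush state instead of pre-initializing. */
	flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, false);
	if (flush)
		kvm_flush_remote_tlbs(kvm);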

> +
> +       write_lock(&kvm->mmu_lock);
> +
> +       /*
> +        * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't
> required, worst
> +        * case scenario we'll have unused shadow pages lying around
> until they
> +        * are recycled due to age or when the VM is destroyed.
> +        */
> +       if (tdp_mmu_enabled) {
> +               struct kvm_gfn_range range = {
> +                     .slot = slot,
> +                     .start = slot->base_gfn,
> +                     .end = slot->base_gfn + slot->npages,
> +                     .may_block = true,
> +
> +                     /*
> +                      * This handles both private gfn and shared
> gfn.
> +                      * All private page should be zapped on memslot
> deletion.
> +                      */
> +                     .only_private = true,
> +                     .only_shared = true,

only_private and only_shared are both true? Shouldn't they both be
false? (or just unset)

> +               };
> +
> +               flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range,
> flush);
> +       } else {
> +               /* TDX supports only TDP-MMU case. */
> +               WARN_ON_ONCE(1);

How about a KVM_BUG_ON() instead? If somehow this is reached, we don't
want the caller thinking the pages are zapped, then enter the guest
with pages mapped that have gone elsewhere.
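
For illustration, a sketch of that alternative (KVM_BUG_ON() is an
existing KVM macro that marks the VM as bugged, so vCPUs can no longer
enter the guest):

	} else {
		/* TDX supports only the TDP MMU; this should be unreachable. */
		KVM_BUG_ON(1, kvm);
	}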

> +               flush = true;

Why flush?

> +       }
> +       if (flush)
> +               kvm_flush_remote_tlbs(kvm);
> +
> +       write_unlock(&kvm->mmu_lock);
> +}
> +
>  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>                                    struct kvm_memory_slot *slot)
>  {
> -       kvm_mmu_zap_all_fast(kvm);
> +       if (kvm_gfn_shared_mask(kvm))

There seems to be an attempt to abstract away the existence of Secure-
EPT in mmu.c that is not fully successful. In this case the code
checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
in the specific way needed by S-EPT. It ends up being a little confusing
because the actual check is about whether there is a shared bit. It
only works because S-EPT is the only thing that has a
kvm_gfn_shared_mask().

Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
but is more honest about what we are getting up to here. I'm not sure
though, what do you think?

> +               /*
> +                * Secure-EPT requires to release PTs from the leaf. 
> The
> +                * optimization to zap root PT first with child PT
> doesn't
> +                * work.
> +                */
> +               kvm_mmu_zap_memslot(kvm, slot);
> +       else
> +               kvm_mmu_zap_all_fast(kvm);
>  }
>  
>  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index d47f0daf1b03..e7514a807134 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -37,7 +37,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>          * for zapping and thus puts the TDP MMU's reference to each
> root, i.e.
>          * ultimately frees all roots.
>          */
> -       kvm_tdp_mmu_invalidate_all_roots(kvm);
> +       kvm_tdp_mmu_invalidate_all_roots(kvm, false);
>         kvm_tdp_mmu_zap_invalidated_roots(kvm);
>  
>         WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
> @@ -771,7 +771,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct
> kvm_mmu_page *sp)
>   * operation can cause a soft lockup.
>   */
>  static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page
> *root,
> -                             gfn_t start, gfn_t end, bool can_yield,
> bool flush)
> +                             gfn_t start, gfn_t end, bool can_yield,
> bool flush,
> +                             bool zap_private)
>  {
>         struct tdp_iter iter;
>  
> @@ -779,6 +780,10 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm,
> struct kvm_mmu_page *root,
>  
>         lockdep_assert_held_write(&kvm->mmu_lock);
>  
> +       WARN_ON_ONCE(zap_private && !is_private_sp(root));

All the callers have zap_private as zap_private && is_private_sp(root).
What badness is it trying to uncover?

> +       if (!zap_private && is_private_sp(root))
> +               return false;
> +
Isaku Yamahata March 19, 2024, 11:56 p.m. UTC | #2
On Mon, Mar 18, 2024 at 11:46:11PM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Mon, 2024-02-26 at 00:26 -0800, isaku.yamahata@intel.com wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > 
> > TDX supports only write-back(WB) memory type for private memory
> > architecturally so that (virtualized) memory type change doesn't make
> > sense
> > for private memory.  Also currently, page migration isn't supported
> > for TDX
> > yet. (TDX architecturally supports page migration. it's KVM and
> > kernel
> > implementation issue.)
> > 
> > Regarding memory type change (mtrr virtualization and lapic page
> > mapping
> > change), pages are zapped by kvm_zap_gfn_range().  On the next KVM
> > page
> > fault, the SPTE entry with a new memory type for the page is
> > populated.
> > Regarding page migration, pages are zapped by the mmu notifier. On
> > the next
> > KVM page fault, the new migrated page is populated.  Don't zap
> > private
> > pages on unmapping for those two cases.
> 
> Is the migration case relevant to TDX?

We can forget about it because the page migration isn't supported yet.


> > When deleting/moving a KVM memory slot, zap private pages. Typically
> > tearing down VM.  Don't invalidate private page tables. i.e. zap only
> > leaf
> > SPTEs for KVM mmu that has a shared bit mask. The existing
> > kvm_tdp_mmu_invalidate_all_roots() depends on role.invalid with read-
> > lock
> > of mmu_lock so that other vcpu can operate on KVM mmu concurrently. 
> > It
> > marks the root page table invalid and zaps SPTEs of the root page
> > tables. The TDX module doesn't allow to unlink a protected root page
> > table
> > from the hardware and then allocate a new one for it. i.e. replacing
> > a
> > protected root page table.  Instead, zap only leaf SPTEs for KVM mmu
> > with a
> > shared bit mask set.
> 
> I get the part about only zapping leafs and not the root and mid-level
> PTEs. But why the MTRR, lapic page and migration part? Why should those
> not be zapped? Why is migration a consideration when it is not
> supported?


When we zap a page from the guest and add it again on TDX, even with the same
GPA, the page is zeroed.  We'd like to keep the memory contents for those cases.

Ok, let me add those whys and drop the migration part. Here is the updated one.

TDX supports only write-back(WB) memory type for private memory
architecturally so that (virtualized) memory type change doesn't make
sense for private memory.  When we remove the private page from the guest
and re-add it with the same GPA, the page is zeroed.

Regarding memory type change (mtrr virtualization and lapic page
mapping change), the current implementation zaps pages, and populate
the page with new memory type on the next KVM page fault.  It doesn't
work for TDX to have zeroed pages. Because TDX supports only WB, we
ignore the request for MTRR and lapic page change to not zap private
pages on unmapping for those two cases

TDX Secure-EPT requires removing the guest pages first and then the leaf
Secure-EPT pages, in order. It doesn't allow zapping a Secure-EPT entry
that has child pages.  That doesn't work with the current TDP MMU
zapping logic, which zaps the root page table without touching child
pages.  Instead, zap only leaf SPTEs for a KVM MMU that has a shared bit
mask.

> 
> > 
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c     | 61
> > ++++++++++++++++++++++++++++++++++++--
> >  arch/x86/kvm/mmu/tdp_mmu.c | 37 +++++++++++++++++++----
> >  arch/x86/kvm/mmu/tdp_mmu.h |  5 ++--
> >  3 files changed, 92 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 0d6d4506ec97..30c86e858ae4 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -6339,7 +6339,7 @@ static void kvm_mmu_zap_all_fast(struct kvm
> > *kvm)
> >          * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock
> > and yield.
> >          */
> >         if (tdp_mmu_enabled)
> > -               kvm_tdp_mmu_invalidate_all_roots(kvm);
> > +               kvm_tdp_mmu_invalidate_all_roots(kvm, true);
> >  
> >         /*
> >          * Notify all vcpus to reload its shadow page table and flush
> > TLB.
> > @@ -6459,7 +6459,16 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t
> > gfn_start, gfn_t gfn_end)
> >         flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
> >  
> >         if (tdp_mmu_enabled)
> > -               flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start,
> > gfn_end, flush);
> > +               /*
> > +                * zap_private = false. Zap only shared pages.
> > +                *
> > +                * kvm_zap_gfn_range() is used when MTRR or PAT
> > memory
> > +                * type was changed.  Later on the next kvm page
> > fault,
> > +                * populate it with updated spte entry.
> > +                * Because only WB is supported for private pages,
> > don't
> > +                * care of private pages.
> > +                */
> > +               flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start,
> > gfn_end, flush, false);
> >  
> >         if (flush)
> >                 kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end -
> > gfn_start);
> > @@ -6905,10 +6914,56 @@ void kvm_arch_flush_shadow_all(struct kvm
> > *kvm)
> >         kvm_mmu_zap_all(kvm);
> >  }
> >  
> > +static void kvm_mmu_zap_memslot(struct kvm *kvm, struct
> > kvm_memory_slot *slot)
> 
> What about kvm_mmu_zap_memslot_leafs() instead?

Ok.


> > +{
> > +       bool flush = false;
> 
> It doesn't need to be initialized if it passes false directly into
> kvm_tdp_mmu_unmap_gfn_range(). It would make the code easier to
> understand.
> 
> > +
> > +       write_lock(&kvm->mmu_lock);
> > +
> > +       /*
> > +        * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't
> > required, worst
> > +        * case scenario we'll have unused shadow pages lying around
> > until they
> > +        * are recycled due to age or when the VM is destroyed.
> > +        */
> > +       if (tdp_mmu_enabled) {
> > +               struct kvm_gfn_range range = {
> > +                     .slot = slot,
> > +                     .start = slot->base_gfn,
> > +                     .end = slot->base_gfn + slot->npages,
> > +                     .may_block = true,
> > +
> > +                     /*
> > +                      * This handles both private gfn and shared
> > gfn.
> > +                      * All private page should be zapped on memslot
> > deletion.
> > +                      */
> > +                     .only_private = true,
> > +                     .only_shared = true,
> 
> only_private and only_shared are both true? Shouldn't they both be
> false? (or just unset)

I replied at.
https://lore.kernel.org/kvm/ZUO1Giju0GkUdF0o@google.com/

> 
> > +               };
> > +
> > +               flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range,
> > flush);
> > +       } else {
> > +               /* TDX supports only TDP-MMU case. */
> > +               WARN_ON_ONCE(1);
> 
> How about a KVM_BUG_ON() instead? If somehow this is reached, we don't
> want the caller thinking the pages are zapped, then enter the guest
> with pages mapped that have gone elsewhere.
> 
> > +               flush = true;
> 
> Why flush?

Those are only safeguards. TDX supports only the TDP MMU. Let me drop them.


> > +       }
> > +       if (flush)
> > +               kvm_flush_remote_tlbs(kvm);
> > +
> > +       write_unlock(&kvm->mmu_lock);
> > +}
> > +
> >  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> >                                    struct kvm_memory_slot *slot)
> >  {
> > -       kvm_mmu_zap_all_fast(kvm);
> > +       if (kvm_gfn_shared_mask(kvm))
> 
> There seems to be an attempt to abstract away the existence of Secure-
> EPT in mmu.c, that is not fully successful. In this case the code
> checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
> in a way specific needed by S-EPT. It ends up being a little confusing
> because the actual check is about whether there is a shared bit. It
> only works because only S-EPT is the only thing that has a
> kvm_gfn_shared_mask().
> 
> Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
> but is more honest about what we are getting up to here. I'm not sure
> though, what do you think?

Right, I attempted that and failed in the zapping case.  This is due to the restriction
that the Secure-EPT pages must be removed from the leaves.  The VMX case (also
NPT, even SNP) heavily depends on zapping the root entry as an optimization.

I can think of:
- Add a TDX check. Looks wrong.
- Use kvm_gfn_shared_mask(kvm). Confusing.
- Give this check another name like zap_from_leafs (or a better name?).
  The implementation is the same as kvm_gfn_shared_mask(), with a comment.
  - Or we can add a boolean variable to struct kvm.
  

> >  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index d47f0daf1b03..e7514a807134 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -37,7 +37,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
> >          * for zapping and thus puts the TDP MMU's reference to each
> > root, i.e.
> >          * ultimately frees all roots.
> >          */
> > -       kvm_tdp_mmu_invalidate_all_roots(kvm);
> > +       kvm_tdp_mmu_invalidate_all_roots(kvm, false);
> >         kvm_tdp_mmu_zap_invalidated_roots(kvm);
> >  
> >         WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
> > @@ -771,7 +771,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct
> > kvm_mmu_page *sp)
> >   * operation can cause a soft lockup.
> >   */
> >  static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page
> > *root,
> > -                             gfn_t start, gfn_t end, bool can_yield,
> > bool flush)
> > +                             gfn_t start, gfn_t end, bool can_yield,
> > bool flush,
> > +                             bool zap_private)
> >  {
> >         struct tdp_iter iter;
> >  
> > @@ -779,6 +780,10 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm,
> > struct kvm_mmu_page *root,
> >  
> >         lockdep_assert_held_write(&kvm->mmu_lock);
> >  
> > +       WARN_ON_ONCE(zap_private && !is_private_sp(root));
> 
> All the callers have zap_private as zap_private && is_private_sp(root).
> What badness is it trying to uncover?

I added this during debugging. Let me drop it.
Edgecombe, Rick P March 20, 2024, 12:56 a.m. UTC | #3
On Tue, 2024-03-19 at 16:56 -0700, Isaku Yamahata wrote:
> When we zap a page from the guest, and add it again on TDX even with
> the same
> GPA, the page is zeroed.  We'd like to keep memory contents for those
> cases.
> 
> Ok, let me add those whys and drop migration part. Here is the
> updated one.
> 
> TDX supports only write-back(WB) memory type for private memory
> architecturally so that (virtualized) memory type change doesn't make
> sense for private memory.  When we remove the private page from the
> guest
> and re-add it with the same GPA, the page is zeroed.
> 
> Regarding memory type change (mtrr virtualization and lapic page
> mapping change), the current implementation zaps pages, and populate
                                                                     s^
> the page with new memory type on the next KVM page fault.  
                               ^s

> It doesn't work for TDX to have zeroed pages.
What does this mean? Above you mention how all the pages are zeroed. Do
you mean it doesn't work for TDX to zero a running guest's pages? That
would happen for the operations that would expect the pages could get
faulted in again just fine.


> Because TDX supports only WB, we
> ignore the request for MTRR and lapic page change to not zap private
> pages on unmapping for those two cases

Hmm. I need to go back and look at this again. It's not clear from the
description why it is safe for the host to not zap pages if requested
to. I see why the guest wouldn't want them to be zapped.

> 
> TDX Secure-EPT requires removing the guest pages first and leaf
> Secure-EPT pages in order. It doesn't allow zap a Secure-EPT entry
> that has child pages.  It doesn't work with the current TDP MMU
> zapping logic that zaps the root page table without touching child
> pages.  Instead, zap only leaf SPTEs for KVM mmu that has a shared
> bit
> mask.

Could this be better as two patches that each address a separate thing?
1. Leaf only zapping
2. Don't zap for MTRR, etc.

> > 
> > There seems to be an attempt to abstract away the existence of
> > Secure-
> > EPT in mmu.c, that is not fully successful. In this case the code
> > checks kvm_gfn_shared_mask() to see if it needs to handle the
> > zapping
> > in a way specific needed by S-EPT. It ends up being a little
> > confusing
> > because the actual check is about whether there is a shared bit. It
> > only works because only S-EPT is the only thing that has a
> > kvm_gfn_shared_mask().
> > 
> > Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks
> > wrong,
> > but is more honest about what we are getting up to here. I'm not
> > sure
> > though, what do you think?
> 
> Right, I attempted and failed in zapping case.  This is due to the
> restriction
> that the Secure-EPT pages must be removed from the leaves.  the VMX
> case (also
> NPT, even SNP) heavily depends on zapping root entry as optimization.
> 
> I can think of
> - add TDX check. Looks wrong
> - Use kvm_gfn_shared_mask(kvm). confusing
> - Give other name for this check like zap_from_leafs (or better
> name?)
>   The implementation is same to kvm_gfn_shared_mask() with comment.
>   - Or we can add a boolean variable to struct kvm

Hmm, maybe wrap it in a function like:
static inline bool kvm_can_only_zap_leafs(const struct kvm *kvm)
{
	/* A comment explaining what is going on */
	return kvm->arch.vm_type == KVM_X86_TDX_VM;
}
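
For illustration, the call site in kvm_arch_flush_shadow_memslot() could
then read (a sketch, assuming the kvm_mmu_zap_memslot_leafs() rename
suggested earlier in the thread):

	if (kvm_can_only_zap_leafs(kvm))
		kvm_mmu_zap_memslot_leafs(kvm, slot);
	else
		kvm_mmu_zap_all_fast(kvm);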

But KVM seems to be a bit more on the open coded side when it comes to
things like this, so not sure what maintainers would prefer. My opinion
is the kvm_gfn_shared_mask() check is too strange and it's worth a new
helper. If that is bad, then just open-coding kvm->arch.vm_type ==
KVM_X86_TDX_VM is the second best, I think.

I feel both strongly that it should be changed, and unsure what
maintainers would prefer. Hopefully one will chime in.
Edgecombe, Rick P March 21, 2024, 1:17 a.m. UTC | #4
On Tue, 2024-03-19 at 17:56 -0700, Rick Edgecombe wrote:
> > Because TDX supports only WB, we
> > ignore the request for MTRR and lapic page change to not zap
> > private
> > pages on unmapping for those two cases
> 
> Hmm. I need to go back and look at this again. It's not clear from
> the
> description why it is safe for the host to not zap pages if requested
> to. I see why the guest wouldn't want them to be zapped.

Ok, I see now how this works. MTRRs and APIC zapping happen to use the
same function: kvm_zap_gfn_range(). So restricting that function from
zapping private pages has the desired effect. I think it's not ideal
that kvm_zap_gfn_range() silently skips zapping some ranges. I wonder
if we could pass something in, so it's more clear to the caller.

But can these code paths even get reached in TDX? It sounded like MTRRs
basically weren't supported.
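
One hypothetical way to make the skipping explicit at the call sites (a
sketch only; neither the flag names nor the changed signature are part of
this series):

	enum kvm_zap_flags {
		KVM_ZAP_SHARED  = BIT(0),
		KVM_ZAP_PRIVATE = BIT(1),
	};

	/* Callers state which mappings they expect to be zapped. */
	void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end,
			       unsigned int zap_flags);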
Isaku Yamahata March 21, 2024, 10:39 p.m. UTC | #5
On Wed, Mar 20, 2024 at 12:56:38AM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Tue, 2024-03-19 at 16:56 -0700, Isaku Yamahata wrote:
> > When we zap a page from the guest, and add it again on TDX even with
> > the same
> > GPA, the page is zeroed.  We'd like to keep memory contents for those
> > cases.
> > 
> > Ok, let me add those whys and drop migration part. Here is the
> > updated one.
> > 
> > TDX supports only write-back(WB) memory type for private memory
> > architecturally so that (virtualized) memory type change doesn't make
> > sense for private memory.  When we remove the private page from the
> > guest
> > and re-add it with the same GPA, the page is zeroed.
> > 
> > Regarding memory type change (mtrr virtualization and lapic page
> > mapping change), the current implementation zaps pages, and populate
>                                                                      s^
> > the page with new memory type on the next KVM page fault.  
>                                ^s
> 
> > It doesn't work for TDX to have zeroed pages.
> What does this mean? Above you mention how all the pages are zeroed. Do
> you mean it doesn't work for TDX to zero a running guest's pages. Which
> would happen for the operations that would expect the pages could get
> faulted in again just fine.

(The non-TDX part of) KVM assumes that page contents are preserved across zapping and
re-populating.  This isn't true for TDX.  The guest would suddenly see zero pages
instead of the old memory contents and would be upset.


> > Because TDX supports only WB, we
> > ignore the request for MTRR and lapic page change to not zap private
> > pages on unmapping for those two cases
> 
> Hmm. I need to go back and look at this again. It's not clear from the
> description why it is safe for the host to not zap pages if requested
> to. I see why the guest wouldn't want them to be zapped.

KVM silently ignores the request to change memory types.


> > TDX Secure-EPT requires removing the guest pages first and leaf
> > Secure-EPT pages in order. It doesn't allow zap a Secure-EPT entry
> > that has child pages.  It doesn't work with the current TDP MMU
> > zapping logic that zaps the root page table without touching child
> > pages.  Instead, zap only leaf SPTEs for KVM mmu that has a shared
> > bit
> > mask.
> 
> Could this be better as two patches that each address a separate thing?
> 1. Leaf only zapping
> 2. Don't zap for MTRR, etc.

Makes sense. Let's split it.


> > > There seems to be an attempt to abstract away the existence of
> > > Secure-
> > > EPT in mmu.c, that is not fully successful. In this case the code
> > > checks kvm_gfn_shared_mask() to see if it needs to handle the
> > > zapping
> > > in a way specific needed by S-EPT. It ends up being a little
> > > confusing
> > > because the actual check is about whether there is a shared bit. It
> > > only works because only S-EPT is the only thing that has a
> > > kvm_gfn_shared_mask().
> > > 
> > > Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks
> > > wrong,
> > > but is more honest about what we are getting up to here. I'm not
> > > sure
> > > though, what do you think?
> > 
> > Right, I attempted and failed in zapping case.  This is due to the
> > restriction
> > that the Secure-EPT pages must be removed from the leaves.  the VMX
> > case (also
> > NPT, even SNP) heavily depends on zapping root entry as optimization.
> > 
> > I can think of
> > - add TDX check. Looks wrong
> > - Use kvm_gfn_shared_mask(kvm). confusing
> > - Give other name for this check like zap_from_leafs (or better
> > name?)
> >   The implementation is same to kvm_gfn_shared_mask() with comment.
> >   - Or we can add a boolean variable to struct kvm
> 
> Hmm, maybe wrap it in a function like:
> static inline bool kvm_can_only_zap_leafs(const struct kvm *kvm)
> {
> 	/* A comment explaining what is going on */
> 	return kvm->arch.vm_type == KVM_X86_TDX_VM;
> }
> 
> But KVM seems to be a bit more on the open coded side when it comes to
> things like this, so not sure what maintainers would prefer. My opinion
> is the kvm_gfn_shared_mask() check is too strange and it's worth a new
> helper. If that is bad, then just open coded kvm->arch.vm_type ==
> KVM_X86_TDX_VM is the second best I think.
> 
> I feel both strongly that it should be changed, and unsure what
> maintainers would prefer. Hopefully one will chime in.

Now that the compile-time config is dropped, open coding is an option.
Isaku Yamahata March 21, 2024, 10:59 p.m. UTC | #6
On Thu, Mar 21, 2024 at 01:17:35AM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Tue, 2024-03-19 at 17:56 -0700, Rick Edgecombe wrote:
> > > Because TDX supports only WB, we
> > > ignore the request for MTRR and lapic page change to not zap
> > > private
> > > pages on unmapping for those two cases
> > 
> > Hmm. I need to go back and look at this again. It's not clear from
> > the
> > description why it is safe for the host to not zap pages if requested
> > to. I see why the guest wouldn't want them to be zapped.
> 
> Ok, I see now how this works. MTRRs and APIC zapping happen to use the
> same function: kvm_zap_gfn_range(). So restricting that function from
> zapping private pages has the desired affect. I think it's not ideal
> that kvm_zap_gfn_range() silently skips zapping some ranges. I wonder
> if we could pass something in, so it's more clear to the caller.
> 
> But can these code paths even get reaches in TDX? It sounded like MTRRs
> basically weren't supported.

We can make those code paths unreachable, with the (new) assumption that guest
MTRR can be disabled cleanly.
Edgecombe, Rick P March 22, 2024, 12:40 a.m. UTC | #7
On Thu, 2024-03-21 at 15:59 -0700, Isaku Yamahata wrote:
> > 
> > Ok, I see now how this works. MTRRs and APIC zapping happen to use
> > the
> > same function: kvm_zap_gfn_range(). So restricting that function
> > from
> > zapping private pages has the desired affect. I think it's not
> > ideal
> > that kvm_zap_gfn_range() silently skips zapping some ranges. I
> > wonder
> > if we could pass something in, so it's more clear to the caller.
> > 
> > But can these code paths even get reaches in TDX? It sounded like
> > MTRRs
> > basically weren't supported.
> 
> We can make the code paths so with the (new) assumption that guest
> MTRR can
> be disabled cleanly.

So the situation is (please correct):
KVM has a no "making up architectural behavior" rule, which is an
important one. But the TDX module doesn't support MTRRs. So TD guests can't
have architectural behavior for MTRRs. So this patch is trying as best
as possible to match what MTRR behavior it can (not crash the guest if
someone tries).

First of all, if the guest unmaps the private memory, doesn't it have
to accept it again when it gets re-added? So will the guest not crash
anyway?

But, I guess we should punt to userspace if the guest tries to use
MTRRs, not that userspace can handle it happening in a TD...  But it
seems cleaner and safer than skipping zapping some pages inside the
zapping code.

I'm still not sure if I understand the intention and constraints fully,
so please correct me. This (skipping the zapping for some operations)
is a theoretical correctness issue, right? It doesn't resolve a TD
crash?
Isaku Yamahata March 25, 2024, 7:05 p.m. UTC | #8
On Fri, Mar 22, 2024 at 12:40:12AM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Thu, 2024-03-21 at 15:59 -0700, Isaku Yamahata wrote:
> > > 
> > > Ok, I see now how this works. MTRRs and APIC zapping happen to use
> > > the
> > > same function: kvm_zap_gfn_range(). So restricting that function
> > > from
> > > zapping private pages has the desired affect. I think it's not
> > > ideal
> > > that kvm_zap_gfn_range() silently skips zapping some ranges. I
> > > wonder
> > > if we could pass something in, so it's more clear to the caller.
> > > 
> > > But can these code paths even get reaches in TDX? It sounded like
> > > MTRRs
> > > basically weren't supported.
> > 
> > We can make the code paths so with the (new) assumption that guest
> > MTRR can
> > be disabled cleanly.
> 
> So the situation is (please correct):
> KVM has a no "making up architectural behavior" rule, which is an
> important one. But TDX module doesn't support MTRRs. So TD guests can't
> have architectural behavior for MTRRs. So this patch is trying as best
> as possible to match what MTRR behavior it can (not crash the guest if
> someone tries).
>
> First of all, if the guest unmaps the private memory, doesn't it have
> to accept it again when gets re-added? So will the guest not crash
> anyway?

Right, the guest has to accept it on VE.  If the unmap was intentional by the
guest, that's fine.  If the unmap is unintentional (with vMTRR), the guest
doesn't expect a VE with the GPA.


> But, I guess we should punt to userspace is the guest tries to use
> MTRRs, not that userspace can handle it happening in a TD...  But it
> seems cleaner and safer then skipping zapping some pages inside the
> zapping code.
> 
> I'm still not sure if I understand the intention and constraints fully.
> So please correct. This (the skipping the zapping for some operations)
> is a theoretical correctness issue right? It doesn't resolve a TD
> crash?

For the lapic, it's a safeguard. Because TDX KVM disables APICv with
APICV_INHIBIT_REASON_TDX, apicv won't call kvm_zap_gfn_range().

For MTRR, the purpose is to make the guest boot (without a guest kernel
command line like clearcpuid=mtrr).
If we can assume the guest won't touch MTRR registers somehow, KVM can return an
error to TDG.VP.VMCALL<RDMSR, WRMSR>(MTRR registers), so it doesn't call
kvm_zap_gfn_range(). Or we can use KVM_EXIT_X86_{RDMSR, WRMSR} as you suggested.
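
For illustration, a sketch of the first option in tdx_emulate_wrmsr()
(msr_is_mtrr() is a hypothetical helper; the tdvmcall_*() helpers are from
this series):

	u32 index = tdvmcall_a0_read(vcpu);

	/* Refuse MTRR writes up front instead of later zapping private pages. */
	if (msr_is_mtrr(index)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
		return 1;
	}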
Edgecombe, Rick P March 25, 2024, 7:55 p.m. UTC | #9
On Mon, 2024-03-25 at 12:05 -0700, Isaku Yamahata wrote:
> Right, the guest has to accept it on VE.  If the unmap was intentional by guest,
> that's fine.  The unmap is unintentional (with vMTRR), the guest doesn't expect
> VE with the GPA.
> 
> 
> > But, I guess we should punt to userspace is the guest tries to use
> > MTRRs, not that userspace can handle it happening in a TD...  But it
> > seems cleaner and safer then skipping zapping some pages inside the
> > zapping code.
> > 
> > I'm still not sure if I understand the intention and constraints fully.
> > So please correct. This (the skipping the zapping for some operations)
> > is a theoretical correctness issue right? It doesn't resolve a TD
> > crash?
> 
> For lapic, it's safe guard. Because TDX KVM disables APICv with
> APICV_INHIBIT_REASON_TDX, apicv won't call kvm_zap_gfn_range().
Ah, I see it:
https://lore.kernel.org/lkml/38e2f8a77e89301534d82325946eb74db3e47815.1708933498.git.isaku.yamahata@intel.com/

Then it seems a warning would be more appropriate if we are worried there might be a way to still
call it. If we are confident it can't, then we can just ignore this case.

> 
> For MTRR, the purpose is to make the guest boot (without the guest kernel
> command line like clearcpuid=mtrr) .
> If we can assume the guest won't touch MTRR registers somehow, KVM can return an
> error to TDG.VP.VMCALL<RDMSR, WRMSR>(MTRR registers). So it doesn't call
> kvm_zap_gfn_range(). Or we can use KVM_EXIT_X86_{RDMSR, WRMSR} as you suggested.

My understanding is that Sean prefers to exit to userspace when KVM can't handle something, versus
making up behavior that keeps known guests alive. So I would think we should change this patch to
only be about not using the zapping roots optimization. Then a separate patch should exit to
userspace on attempt to use MTRRs. And we ignore the APIC one.

This is trying to guess what maintainers would want here. I'm less sure what Paolo prefers.
Isaku Yamahata March 25, 2024, 10:18 p.m. UTC | #10
On Mon, Mar 25, 2024 at 07:55:04PM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Mon, 2024-03-25 at 12:05 -0700, Isaku Yamahata wrote:
> > Right, the guest has to accept it on VE.  If the unmap was intentional by guest,
> > that's fine.  The unmap is unintentional (with vMTRR), the guest doesn't expect
> > VE with the GPA.
> > 
> > 
> > > But, I guess we should punt to userspace is the guest tries to use
> > > MTRRs, not that userspace can handle it happening in a TD...  But it
> > > seems cleaner and safer then skipping zapping some pages inside the
> > > zapping code.
> > > 
> > > I'm still not sure if I understand the intention and constraints fully.
> > > So please correct. This (the skipping the zapping for some operations)
> > > is a theoretical correctness issue right? It doesn't resolve a TD
> > > crash?
> > 
> > For lapic, it's safe guard. Because TDX KVM disables APICv with
> > APICV_INHIBIT_REASON_TDX, apicv won't call kvm_zap_gfn_range().
> Ah, I see it:
> https://lore.kernel.org/lkml/38e2f8a77e89301534d82325946eb74db3e47815.1708933498.git.isaku.yamahata@intel.com/
> 
> Then it seems a warning would be more appropriate if we are worried there might be a way to still
> call it. If we are confident it can't, then we can just ignore this case.
> 
> > 
> > For MTRR, the purpose is to make the guest boot (without the guest kernel
> > command line like clearcpuid=mtrr) .
> > If we can assume the guest won't touch MTRR registers somehow, KVM can return an
> > error to TDG.VP.VMCALL<RDMSR, WRMSR>(MTRR registers). So it doesn't call
> > kvm_zap_gfn_range(). Or we can use KVM_EXIT_X86_{RDMSR, WRMSR} as you suggested.
> 
> My understanding is that Sean prefers to exit to userspace when KVM can't handle something, versus
> making up behavior that keeps known guests alive. So I would think we should change this patch to
> only be about not using the zapping roots optimization. Then a separate patch should exit to
> userspace on attempt to use MTRRs. And we ignore the APIC one.
> 
> This is trying to guess what maintainers would want here. I'm less sure what Paolo prefers.

When we hit the KVM MSR filter, the current implementation ignores it and returns an
error to the guest.  Surely we should make it KVM_EXIT_X86_{RDMSR, WRMSR} instead.
That aligns with the existing implementation (default VM and SW-protected) and is
more flexible.
Isaku Yamahata March 25, 2024, 11:10 p.m. UTC | #11
On Mon, Mar 25, 2024 at 03:18:36PM -0700,
Isaku Yamahata <isaku.yamahata@intel.com> wrote:

> On Mon, Mar 25, 2024 at 07:55:04PM +0000,
> "Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:
> 
> > On Mon, 2024-03-25 at 12:05 -0700, Isaku Yamahata wrote:
> > > Right, the guest has to accept it on VE.  If the unmap was intentional by guest,
> > > that's fine.  The unmap is unintentional (with vMTRR), the guest doesn't expect
> > > VE with the GPA.
> > > 
> > > 
> > > > But, I guess we should punt to userspace is the guest tries to use
> > > > MTRRs, not that userspace can handle it happening in a TD...  But it
> > > > seems cleaner and safer then skipping zapping some pages inside the
> > > > zapping code.
> > > > 
> > > > I'm still not sure if I understand the intention and constraints fully.
> > > > So please correct. This (the skipping the zapping for some operations)
> > > > is a theoretical correctness issue right? It doesn't resolve a TD
> > > > crash?
> > > 
> > > For lapic, it's safe guard. Because TDX KVM disables APICv with
> > > APICV_INHIBIT_REASON_TDX, apicv won't call kvm_zap_gfn_range().
> > Ah, I see it:
> > https://lore.kernel.org/lkml/38e2f8a77e89301534d82325946eb74db3e47815.1708933498.git.isaku.yamahata@intel.com/
> > 
> > Then it seems a warning would be more appropriate if we are worried there might be a way to still
> > call it. If we are confident it can't, then we can just ignore this case.
> > 
> > > 
> > > For MTRR, the purpose is to make the guest boot (without the guest kernel
> > > command line like clearcpuid=mtrr) .
> > > If we can assume the guest won't touch MTRR registers somehow, KVM can return an
> > > error to TDG.VP.VMCALL<RDMSR, WRMSR>(MTRR registers). So it doesn't call
> > > kvm_zap_gfn_range(). Or we can use KVM_EXIT_X86_{RDMSR, WRMSR} as you suggested.
> > 
> > My understanding is that Sean prefers to exit to userspace when KVM can't handle something, versus
> > making up behavior that keeps known guests alive. So I would think we should change this patch to
> > only be about not using the zapping roots optimization. Then a separate patch should exit to
> > userspace on attempt to use MTRRs. And we ignore the APIC one.
> > 
> > This is trying to guess what maintainers would want here. I'm less sure what Paolo prefers.
> 
> When we hit KVM_MSR_FILTER, the current implementation ignores it and makes it
> error to guest.  Surely we should make it KVM_EXIT_X86_{RDMSR, WRMSR}, instead.
> It's aligns with the existing implementation(default VM and SW-protected) and
> more flexible.

Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall".
Compile-tested only at this point.

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index f891de30a2dd..4d9ae5743e24 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1388,31 +1388,67 @@ static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int tdx_complete_rdmsr(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->run->msr.error)
+		tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
+	else {
+		tdvmcall_set_return_code(vcpu, TDVMCALL_SUCCESS);
+		tdvmcall_set_return_val(vcpu, vcpu->run->msr.data);
+	}
+	return 1;
+}
+
 static int tdx_emulate_rdmsr(struct kvm_vcpu *vcpu)
 {
 	u32 index = tdvmcall_a0_read(vcpu);
 	u64 data;
 
-	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ) ||
-	    kvm_get_msr(vcpu, index, &data)) {
+	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) {
+		trace_kvm_msr_read_ex(index);
+		tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
+		return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
+					  tdx_complete_rdmsr,
+					  KVM_MSR_RET_FILTERED);
+	}
+
+	if (kvm_get_msr(vcpu, index, &data)) {
 		trace_kvm_msr_read_ex(index);
 		tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
 		return 1;
 	}
-	trace_kvm_msr_read(index, data);
 
+	trace_kvm_msr_read(index, data);
 	tdvmcall_set_return_code(vcpu, TDVMCALL_SUCCESS);
 	tdvmcall_set_return_val(vcpu, data);
 	return 1;
 }
 
+static int tdx_complete_wrmsr(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->run->msr.error)
+		tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
+	else
+		tdvmcall_set_return_code(vcpu, TDVMCALL_SUCCESS);
+	return 1;
+}
+
 static int tdx_emulate_wrmsr(struct kvm_vcpu *vcpu)
 {
 	u32 index = tdvmcall_a0_read(vcpu);
 	u64 data = tdvmcall_a1_read(vcpu);
 
-	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE) ||
-	    kvm_set_msr(vcpu, index, data)) {
+	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) {
+		trace_kvm_msr_write_ex(index, data);
+		tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
+		if (kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
+				       tdx_complete_wrmsr,
+				       KVM_MSR_RET_FILTERED))
+			return 1;
+		return 0;
+	}
+
+	if (kvm_set_msr(vcpu, index, data)) {
 		trace_kvm_msr_write_ex(index, data);
 		tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
 		return 1;
Edgecombe, Rick P March 25, 2024, 11:21 p.m. UTC | #12
On Mon, 2024-03-25 at 16:10 -0700, Isaku Yamahata wrote:
> > > My understanding is that Sean prefers to exit to userspace when KVM can't handle something,
> > > versus
> > > making up behavior that keeps known guests alive. So I would think we should change this patch
> > > to
> > > only be about not using the zapping roots optimization. Then a separate patch should exit to
> > > userspace on attempt to use MTRRs. And we ignore the APIC one.
> > > 
> > > This is trying to guess what maintainers would want here. I'm less sure what Paolo prefers.
> > 
> > When we hit KVM_MSR_FILTER, the current implementation ignores it and makes it
> > error to guest.  Surely we should make it KVM_EXIT_X86_{RDMSR, WRMSR}, instead.
> > It's aligns with the existing implementation(default VM and SW-protected) and
> > more flexible.
> 
> Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall"
> Compile only tested at this point.

Seems reasonable to me. Does QEMU configure a special set of MSRs to filter for TDX currently?
Isaku Yamahata March 25, 2024, 11:35 p.m. UTC | #13
On Mon, Mar 25, 2024 at 11:21:17PM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Mon, 2024-03-25 at 16:10 -0700, Isaku Yamahata wrote:
> > > > My understanding is that Sean prefers to exit to userspace when KVM can't handle something,
> > > > versus
> > > > making up behavior that keeps known guests alive. So I would think we should change this patch
> > > > to
> > > > only be about not using the zapping roots optimization. Then a separate patch should exit to
> > > > userspace on attempt to use MTRRs. And we ignore the APIC one.
> > > > 
> > > > This is trying to guess what maintainers would want here. I'm less sure what Paolo prefers.
> > > 
> > > When we hit KVM_MSR_FILTER, the current implementation ignores it and makes it
> > > error to guest.  Surely we should make it KVM_EXIT_X86_{RDMSR, WRMSR}, instead.
> > > It's aligns with the existing implementation(default VM and SW-protected) and
> > > more flexible.
> > 
> > Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall"
> > Compile only tested at this point.
> 
> Seems reasonable to me. Does QEMU configure a special set of MSRs to filter for TDX currently?

No for TDX at the moment.  We need to add such logic.
Chao Gao March 26, 2024, 2:32 a.m. UTC | #14
On Mon, Mar 25, 2024 at 04:35:28PM -0700, Isaku Yamahata wrote:
>On Mon, Mar 25, 2024 at 11:21:17PM +0000,
>"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:
>
>> On Mon, 2024-03-25 at 16:10 -0700, Isaku Yamahata wrote:
>> > > > My understanding is that Sean prefers to exit to userspace when KVM can't handle something,
>> > > > versus
>> > > > making up behavior that keeps known guests alive. So I would think we should change this patch
>> > > > to
>> > > > only be about not using the zapping roots optimization. Then a separate patch should exit to
>> > > > userspace on attempt to use MTRRs. And we ignore the APIC one.
>> > > > 
>> > > > This is trying to guess what maintainers would want here. I'm less sure what Paolo prefers.
>> > > 
>> > > When we hit KVM_MSR_FILTER, the current implementation ignores it and makes it
>> > > error to guest.  Surely we should make it KVM_EXIT_X86_{RDMSR, WRMSR}, instead.
>> > > It's aligns with the existing implementation(default VM and SW-protected) and
>> > > more flexible.
>> > 
>> > Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall"
>> > Compile only tested at this point.
>> 
>> Seems reasonable to me. Does QEMU configure a special set of MSRs to filter for TDX currently?
>
>No for TDX at the moment.  We need to add such logic.

What if QEMU doesn't configure the set of MSRs to filter? In this case, KVM
still needs to handle the MSR accesses.
Edgecombe, Rick P March 26, 2024, 2:42 a.m. UTC | #15
On Tue, 2024-03-26 at 10:32 +0800, Chao Gao wrote:
> > > > Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall"
> > > > Compile only tested at this point.
> > > 
> > > Seems reasonable to me. Does QEMU configure a special set of MSRs to filter for TDX currently?
> > 
> > No for TDX at the moment.  We need to add such logic.
> 
> What if QEMU doesn't configure the set of MSRs to filter? In this case, KVM
> still needs to handle the MSR accesses.

Do you see a problem for the kernel? I think if any issues are limited to only the guest, then we
should count on userspace to configure the msr list.

Today if the MSR access is not allowed by the filter, or the MSR access otherwise fails, an error is
returned to the guest. I think Isaku's proposal is to return to userspace if the filter list fails,
and return an error to the guest if the access otherwise fails. So the accessible MSRs are the same.
It's just a change in how the error is reported.
Chao Gao March 26, 2024, 11:13 a.m. UTC | #16
On Tue, Mar 26, 2024 at 10:42:36AM +0800, Edgecombe, Rick P wrote:
>On Tue, 2024-03-26 at 10:32 +0800, Chao Gao wrote:
>> > > > Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall"
>> > > > Compile only tested at this point.
>> > > 
>> > > Seems reasonable to me. Does QEMU configure a special set of MSRs to filter for TDX currently?
>> > 
>> > No for TDX at the moment.  We need to add such logic.
>> 
>> What if QEMU doesn't configure the set of MSRs to filter? In this case, KVM
>> still needs to handle the MSR accesses.
>
>Do you see a problem for the kernel? I think if any issues are limited to only the guest, then we
>should count on userspace to configure the msr list.

How can QEMU handle MTRR MSR accesses if KVM exits to QEMU? I am not sure if
QEMU needs to do a lot of work to virtualize MTRR.

If QEMU doesn't configure the MSR filter list correctly, KVM has to handle the
guest's MTRR MSR accesses. In my understanding, the suggestion is that KVM zaps
private memory mappings. But guests won't accept memory again, because no one
currently requests guests to do this after writes to MTRR MSRs. In this case,
guests may access unaccepted memory, causing an infinite EPT violation loop
(assuming SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
the host. But I think it would be better if we can avoid wasting CPU resources
on the useless EPT violation loop.

>
>Today if the MSR access is not allowed by the filter, or the MSR access otherwise fails, an error is
>returned to the guest. I think Isaku's proposal is to return to userspace if the filter list fails,
>and return an error to the guest if the access otherwise fails. So the accessible MSRs are the same.
>It's just change in how error is reported.
Isaku Yamahata March 26, 2024, 5:48 p.m. UTC | #17
On Tue, Mar 26, 2024 at 07:13:46PM +0800,
Chao Gao <chao.gao@intel.com> wrote:

> On Tue, Mar 26, 2024 at 10:42:36AM +0800, Edgecombe, Rick P wrote:
> >On Tue, 2024-03-26 at 10:32 +0800, Chao Gao wrote:
> >> > > > Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall"
> >> > > > Compile only tested at this point.
> >> > > 
> >> > > Seems reasonable to me. Does QEMU configure a special set of MSRs to filter for TDX currently?
> >> > 
> >> > No for TDX at the moment.  We need to add such logic.
> >> 
> >> What if QEMU doesn't configure the set of MSRs to filter? In this case, KVM
> >> still needs to handle the MSR accesses.
> >
> >Do you see a problem for the kernel? I think if any issues are limited to only the guest, then we
> >should count on userspace to configure the msr list.
> 
> How can QEMU handle MTRR MSR accesses if KVM exits to QEMU? I am not sure if
> QEMU needs to do a lot of work to virtualize MTRR.

The default kernel logic will be to return an error for
TDG.VP.VMCALL<RDMSR or WRMSR MTRR registers>.
QEMU can have mostly the same logic as the current kernel:

rdmsr:
MTRRCAP: 0
MTRRDEFTYPE: MTRR_TYPE_WRBACK

wrmsr:
MTRRDEFTYPE: If write back, nop. Otherwise error.
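
For illustration, a sketch of how userspace could implement those defaults
when it receives the KVM_EXIT_X86_{RDMSR, WRMSR} exits (constant names as
in the kernel's msr-index.h and mtrr.h; checking only the low type byte of
MTRRDEFTYPE is a simplification):

	switch (run->exit_reason) {
	case KVM_EXIT_X86_RDMSR:
		run->msr.error = 0;
		if (run->msr.index == MSR_MTRRcap)
			run->msr.data = 0;
		else if (run->msr.index == MSR_MTRRdefType)
			run->msr.data = MTRR_TYPE_WRBACK;
		else
			run->msr.error = 1;
		break;
	case KVM_EXIT_X86_WRMSR:
		/* Writing the WB default back is a nop; anything else fails. */
		run->msr.error = !(run->msr.index == MSR_MTRRdefType &&
				   (run->msr.data & 0xff) == MTRR_TYPE_WRBACK);
		break;
	}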


> If QEMU doesn't configure the msr filter list correctly, KVM has to handle
> guest's MTRR MSR accesses. In my understanding, the suggestion is KVM zap
> private memory mappings. But guests won't accept memory again because no one
> currently requests guests to do this after writes to MTRR MSRs. In this case,
> guests may access unaccepted memory, causing infinite EPT violation loop
> (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
> the host. But I think it would be better if we can avoid wasting CPU resource
> on the useless EPT violation loop.

QEMU is expected to do it correctly.  There are many ways for userspace to go
wrong.  This isn't specific to the MTRR MSRs.
Xiaoyao Li March 27, 2024, 2:54 a.m. UTC | #18
On 3/27/2024 1:48 AM, Isaku Yamahata wrote:
> On Tue, Mar 26, 2024 at 07:13:46PM +0800,
> Chao Gao <chao.gao@intel.com> wrote:
> 
>> On Tue, Mar 26, 2024 at 10:42:36AM +0800, Edgecombe, Rick P wrote:
>>> On Tue, 2024-03-26 at 10:32 +0800, Chao Gao wrote:
>>>>>>> Something like this for "112/130 KVM: TDX: Handle TDX PV rdmsr/wrmsr hypercall"
>>>>>>> Compile only tested at this point.
>>>>>>
>>>>>> Seems reasonable to me. Does QEMU configure a special set of MSRs to filter for TDX currently?
>>>>>
>>>>> No for TDX at the moment.  We need to add such logic.
>>>>
>>>> What if QEMU doesn't configure the set of MSRs to filter? In this case, KVM
>>>> still needs to handle the MSR accesses.
>>>
>>> Do you see a problem for the kernel? I think if any issues are limited to only the guest, then we
>>> should count on userspace to configure the msr list.
>>
>> How can QEMU handle MTRR MSR accesses if KVM exits to QEMU? I am not sure if
>> QEMU needs to do a lot of work to virtualize MTRR.
> 
> The default kernel logic will to return error for
> TDG.VP.VMCALL<RDMSR or WRMSR MTRR registers>.
> Qemu can have mostly same in the current kernel logic.
> 
> rdmsr:
> MTRRCAP: 0
> MTRRDEFTYPE: MTRR_TYPE_WRBACK
> 
> wrmsr:
> MTRRDEFTYPE: If write back, nop. Otherwise error.
> 
> 
>> If QEMU doesn't configure the msr filter list correctly, KVM has to handle
>> guest's MTRR MSR accesses. In my understanding, the suggestion is KVM zap
>> private memory mappings. But guests won't accept memory again because no one
>> currently requests guests to do this after writes to MTRR MSRs. In this case,
>> guests may access unaccepted memory, causing infinite EPT violation loop
>> (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
>> the host. But I think it would be better if we can avoid wasting CPU resource
>> on the useless EPT violation loop.
> 
> Qemu is expected to do it correctly.  There are manyways for userspace to go
> wrong.  This isn't specific to MTRR MSR.

This seems incorrect. KVM shouldn't force userspace to filter some
specific MSRs. The semantics of the MSR filter are that userspace
configures it of its own will, not that KVM requires it to do so.
Edgecombe, Rick P March 27, 2024, 5:36 p.m. UTC | #19
On Wed, 2024-03-27 at 10:54 +0800, Xiaoyao Li wrote:
> > > If QEMU doesn't configure the msr filter list correctly, KVM has to handle
> > > guest's MTRR MSR accesses. In my understanding, the suggestion is KVM zap
> > > private memory mappings. But guests won't accept memory again because no one
> > > currently requests guests to do this after writes to MTRR MSRs. In this case,
> > > guests may access unaccepted memory, causing infinite EPT violation loop
> > > (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
> > > the host. But I think it would be better if we can avoid wasting CPU resource
> > > on the useless EPT violation loop.
> > 
> > Qemu is expected to do it correctly.  There are many ways for userspace to go
> > wrong.  This isn't specific to MTRR MSR.
> 
> This seems incorrect. KVM shouldn't force userspace to filter some 
> specific MSRs. The semantic of MSR filter is userspace configures it on 
> its own will, not KVM requires to do so.

I'm ok just always doing the exit to userspace on attempt to use MTRRs in a TD, and not rely on the
MSR list. At least I don't see the problem.
Isaku Yamahata March 28, 2024, 12:06 a.m. UTC | #20
On Wed, Mar 27, 2024 at 05:36:07PM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Wed, 2024-03-27 at 10:54 +0800, Xiaoyao Li wrote:
> > > > If QEMU doesn't configure the msr filter list correctly, KVM has to handle
> > > > guest's MTRR MSR accesses. In my understanding, the suggestion is KVM zap
> > > > private memory mappings. But guests won't accept memory again because no one
> > > > currently requests guests to do this after writes to MTRR MSRs. In this case,
> > > > guests may access unaccepted memory, causing infinite EPT violation loop
> > > > (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
> > > > the host. But I think it would be better if we can avoid wasting CPU resource
> > > > on the useless EPT violation loop.
> > > 
> > > Qemu is expected to do it correctly.  There are many ways for userspace to go
> > > wrong.  This isn't specific to MTRR MSR.
> > 
> > This seems incorrect. KVM shouldn't force userspace to filter some 
> > specific MSRs. The semantic of MSR filter is userspace configures it on 
> > its own will, not KVM requires to do so.
> 
> I'm ok just always doing the exit to userspace on attempt to use MTRRs in a TD, and not rely on the
> MSR list. At least I don't see the problem.

KVM doesn't force it.  KVM allows QEMU to use the MSR filter for TDX.
(v19 doesn't allow it.) If QEMU chooses to use the MSR filter, QEMU has to
handle the MSR access correctly.
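
As an untested sketch, the opt-in could use the existing
KVM_X86_SET_MSR_FILTER uAPI (the caller must also enable
KVM_CAP_X86_USER_SPACE_MSR with KVM_MSR_EXIT_REASON_FILTER; only two
ranges are shown here, the fixed- and variable-range MTRR MSRs would
need additional entries):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* A cleared bit means "deny", which triggers an exit to userspace. */
static __u8 deny_one;	/* one MSR per range, bit 0 is zero */

static int filter_mtrr_msrs(int vm_fd)
{
	struct kvm_msr_filter filter = {
		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
		.ranges = {
			{	/* MSR_MTRRcap */
				.flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
				.nmsrs = 1,
				.base = 0xfe,
				.bitmap = &deny_one,
			},
			{	/* MSR_MTRRdefType */
				.flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
				.nmsrs = 1,
				.base = 0x2ff,
				.bitmap = &deny_one,
			},
		},
	};

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}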
Xiaoyao Li March 28, 2024, 12:06 a.m. UTC | #21
On 3/28/2024 1:36 AM, Edgecombe, Rick P wrote:
> On Wed, 2024-03-27 at 10:54 +0800, Xiaoyao Li wrote:
>>>> If QEMU doesn't configure the msr filter list correctly, KVM has to handle
>>>> guest's MTRR MSR accesses. 
>>>> In my understanding, the suggestion is KVM zap private memory mappings. 

TDX spec states that

   18.2.1.4.1 Memory Type for Private and Opaque Access

   The memory type for private and opaque access semantics, which use a
   private HKID, is WB.

   18.2.1.4.2 Memory Type for Shared Accesses

   Intel SDM, Vol. 3, 28.2.7.2 Memory Type Used for Translated Guest-
   Physical Addresses

   The memory type for shared access semantics, which use a shared HKID,
   is determined as described below. Note that this is different from the
   way memory type is determined by the hardware during non-root mode
   operation. Rather, it is a best-effort approximation that is designed
   to still allow the host VMM some control over memory type.
     • For shared access during host-side (SEAMCALL) flows, the memory
       type is determined by MTRRs.
     • For shared access during guest-side flows (VM exit from the guest
       TD), the memory type is determined by a combination of the Shared
       EPT and MTRRs.
       o If the memory type determined during Shared EPT walk is WB, then
         the effective memory type for the access is determined by MTRRs.
       o Else, the effective memory type for the access is UC.

My understanding is that guest MTRR doesn't affect the memory type for 
private memory. So we don't need to zap private memory mappings.

>>>> But guests won't accept memory again because no one
>>>> currently requests guests to do this after writes to MTRR MSRs. In this case,
>>>> guests may access unaccepted memory, causing infinite EPT violation loop
>>>> (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
>>>> the host. But I think it would be better if we can avoid wasting CPU resource
>>>> on the useless EPT violation loop.
>>>
>>> Qemu is expected to do it correctly.  There are many ways for userspace to go
>>> wrong.  This isn't specific to MTRR MSR.
>>
>> This seems incorrect. KVM shouldn't force userspace to filter some
>> specific MSRs. The semantic of MSR filter is userspace configures it on
>> its own will, not KVM requires to do so.
> 
> I'm ok just always doing the exit to userspace on attempt to use MTRRs in a TD, and not rely on the
> MSR list. At least I don't see the problem.

What is the exit reason in vcpu->run->exit_reason? 
KVM_EXIT_X86_RDMSR/WRMSR? If so, it breaks the ABI on 
KVM_EXIT_X86_RDMSR/WRMSR.
Xiaoyao Li March 28, 2024, 12:23 a.m. UTC | #22
On 3/26/2024 3:55 AM, Edgecombe, Rick P wrote:
> On Mon, 2024-03-25 at 12:05 -0700, Isaku Yamahata wrote:
>> Right, the guest has to accept it on VE.  If the unmap was intentional by guest,
>> that's fine.  If the unmap is unintentional (with vMTRR), the guest doesn't expect
>> VE with the GPA.
>>
>>
>>> But, I guess we should punt to userspace is the guest tries to use
>>> MTRRs, not that userspace can handle it happening in a TD...  But it
>>> seems cleaner and safer then skipping zapping some pages inside the
>>> zapping code.
>>>
>>> I'm still not sure if I understand the intention and constraints fully.
>>> So please correct. This (the skipping the zapping for some operations)
>>> is a theoretical correctness issue right? It doesn't resolve a TD
>>> crash?
>>
>> For lapic, it's a safeguard. Because TDX KVM disables APICv with
>> APICV_INHIBIT_REASON_TDX, apicv won't call kvm_zap_gfn_range().
> Ah, I see it:
> https://lore.kernel.org/lkml/38e2f8a77e89301534d82325946eb74db3e47815.1708933498.git.isaku.yamahata@intel.com/
> 
> Then it seems a warning would be more appropriate if we are worried there might be a way to still
> call it. If we are confident it can't, then we can just ignore this case.
> 
>>
>> For MTRR, the purpose is to make the guest boot (without the guest kernel
>> command line like clearcpuid=mtrr).
>> If we can assume the guest won't touch MTRR registers somehow, KVM can return an
>> error to TDG.VP.VMCALL<RDMSR, WRMSR>(MTRR registers). So it doesn't call
>> kvm_zap_gfn_range(). Or we can use KVM_EXIT_X86_{RDMSR, WRMSR} as you suggested.
> 
> My understanding is that Sean prefers to exit to userspace when KVM can't handle something, versus
> making up behavior that keeps known guests alive. So I would think we should change this patch to
> only be about not using the zapping roots optimization. Then a separate patch should exit to
> userspace on attempt to use MTRRs. And we ignore the APIC one.

Certainly not. If we exit to userspace, what is the exit reason and what is 
userspace expected to do? Userspace can do nothing, except either 
kill the TD or eat the RDMSR/WRMSR.

This has nothing to do with userspace. MTRR is virtualized as fixed1 for 
a TD (by the current TDX architecture). Userspace can do nothing about it, and 
it's not userspace's fault that the TD guest manipulates the MTRR MSRs.

This is a bad design in the current TDX; what KVM should do is return an 
error to the TD on TDVMCALL<RDMSR/WRMSR> for the MTRR MSRs. It should be a 
known flaw of TDX that MTRR is not supported even though the TD guest reads 
the MTRR CPUID bit as 1.

This flaw should be fixed by the TDX architecture making MTRR 
configurable. At that point, userspace is responsible for setting an MSR filter 
on the MTRR MSRs if it wants to configure the MTRR CPUID bit to 1.

> This is trying to guess what maintainers would want here. I'm less sure what Paolo prefers.
Isaku Yamahata March 28, 2024, 12:36 a.m. UTC | #23
On Thu, Mar 28, 2024 at 08:06:53AM +0800,
Xiaoyao Li <xiaoyao.li@intel.com> wrote:

> On 3/28/2024 1:36 AM, Edgecombe, Rick P wrote:
> > On Wed, 2024-03-27 at 10:54 +0800, Xiaoyao Li wrote:
> > > > > If QEMU doesn't configure the msr filter list correctly, KVM has to handle
> > > > > guest's MTRR MSR accesses. In my understanding, the
> > > > > suggestion is KVM zap private memory mappings.
> 
> TDX spec states that
> 
>   18.2.1.4.1 Memory Type for Private and Opaque Access
> 
>   The memory type for private and opaque access semantics, which use a
>   private HKID, is WB.
> 
>   18.2.1.4.2 Memory Type for Shared Accesses
> 
>   Intel SDM, Vol. 3, 28.2.7.2 Memory Type Used for Translated Guest-
>   Physical Addresses
> 
>   The memory type for shared access semantics, which use a shared HKID,
>   is determined as described below. Note that this is different from the
>   way memory type is determined by the hardware during non-root mode
>   operation. Rather, it is a best-effort approximation that is designed
>   to still allow the host VMM some control over memory type.
>     • For shared access during host-side (SEAMCALL) flows, the memory
>       type is determined by MTRRs.
>     • For shared access during guest-side flows (VM exit from the guest
>       TD), the memory type is determined by a combination of the Shared
>       EPT and MTRRs.
>       o If the memory type determined during Shared EPT walk is WB, then
>         the effective memory type for the access is determined by MTRRs.
>       o Else, the effective memory type for the access is UC.
> 
> My understanding is that guest MTRR doesn't affect the memory type for
> private memory. So we don't need to zap private memory mappings.

So, there is no point in (trying to) emulate MTRR.  The direction is: don't
advertise MTRR to the guest (a new TDX module is needed) or force
the guest not to use MTRR (guest command line clearcpuid=mtrr).  KVM will
simply return an error on guest access to the MTRR-related registers.

QEMU or user space VMM can use the MSR filter if they want.


> > > > > But guests won't accept memory again because no one
> > > > > currently requests guests to do this after writes to MTRR MSRs. In this case,
> > > > > guests may access unaccepted memory, causing infinite EPT violation loop
> > > > > (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
> > > > > the host. But I think it would be better if we can avoid wasting CPU resource
> > > > > on the useless EPT violation loop.
> > > > 
> > > > Qemu is expected to do it correctly.  There are many ways for userspace to go
> > > > wrong.  This isn't specific to MTRR MSR.
> > > 
> > > This seems incorrect. KVM shouldn't force userspace to filter some
> > > specific MSRs. The semantic of MSR filter is userspace configures it on
> > > its own will, not KVM requires to do so.
> > 
> > I'm ok just always doing the exit to userspace on attempt to use MTRRs in a TD, and not rely on the
> > MSR list. At least I don't see the problem.
> 
> What is the exit reason in vcpu->run->exit_reason? KVM_EXIT_X86_RDMSR/WRMSR?
> If so, it breaks the ABI on KVM_EXIT_X86_RDMSR/WRMSR.

It's only when the user space requested it with the MSR filter.
Edgecombe, Rick P March 28, 2024, 12:45 a.m. UTC | #24
On Thu, 2024-03-28 at 08:06 +0800, Xiaoyao Li wrote:
> 
> TDX spec states that
> 
>    18.2.1.4.1 Memory Type for Private and Opaque Access
> 
>    The memory type for private and opaque access semantics, which use a
>    private HKID, is WB.
> 
>    18.2.1.4.2 Memory Type for Shared Accesses
> 
>    Intel SDM, Vol. 3, 28.2.7.2 Memory Type Used for Translated Guest-
>    Physical Addresses
> 
>    The memory type for shared access semantics, which use a shared HKID,
>    is determined as described below. Note that this is different from the
>    way memory type is determined by the hardware during non-root mode
>    operation. Rather, it is a best-effort approximation that is designed
>    to still allow the host VMM some control over memory type.
>      • For shared access during host-side (SEAMCALL) flows, the memory
>        type is determined by MTRRs.
>      • For shared access during guest-side flows (VM exit from the guest
>        TD), the memory type is determined by a combination of the Shared
>        EPT and MTRRs.
>        o If the memory type determined during Shared EPT walk is WB, then
>          the effective memory type for the access is determined by MTRRs.
>        o Else, the effective memory type for the access is UC.
> 
> My understanding is that guest MTRR doesn't affect the memory type for 
> private memory. So we don't need to zap private memory mappings.

Right, KVM can't zap the private side.

But why does KVM have to support a "best effort" MTRR virtualization for TDs? Kai pointed me to this
today and I haven't looked through it in depth yet:
https://lore.kernel.org/kvm/20240309010929.1403984-1-seanjc@google.com/

An alternative could be to mirror that behavior, but normal VMs have to work with existing userspace
setup. KVM doesn't support any TDs yet, so we can take the opportunity to not introduce weird
things.

> 
> > > > > But guests won't accept memory again because no one
> > > > > currently requests guests to do this after writes to MTRR MSRs. In this case,
> > > > > guests may access unaccepted memory, causing infinite EPT violation loop
> > > > > (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
> > > > > the host. But I think it would be better if we can avoid wasting CPU resource
> > > > > on the useless EPT violation loop.
> > > > 
> > > > Qemu is expected to do it correctly.  There are many ways for userspace to go
> > > > wrong.  This isn't specific to MTRR MSR.
> > > 
> > > This seems incorrect. KVM shouldn't force userspace to filter some
> > > specific MSRs. The semantic of MSR filter is userspace configures it on
> > > its own will, not KVM requires to do so.
> > 
> > I'm ok just always doing the exit to userspace on attempt to use MTRRs in a TD, and not rely on
> > the
> > MSR list. At least I don't see the problem.
> 
> What is the exit reason in vcpu->run->exit_reason? 
> KVM_EXIT_X86_RDMSR/WRMSR? If so, it breaks the ABI on 
> KVM_EXIT_X86_RDMSR/WRMSR.

How so? Userspace needs to learn to create a TD first.
Xiaoyao Li March 28, 2024, 12:58 a.m. UTC | #25
On 3/28/2024 8:45 AM, Edgecombe, Rick P wrote:
> On Thu, 2024-03-28 at 08:06 +0800, Xiaoyao Li wrote:
>>
>> TDX spec states that
>>
>>     18.2.1.4.1 Memory Type for Private and Opaque Access
>>
>>     The memory type for private and opaque access semantics, which use a
>>     private HKID, is WB.
>>
>>     18.2.1.4.2 Memory Type for Shared Accesses
>>
>>     Intel SDM, Vol. 3, 28.2.7.2 Memory Type Used for Translated Guest-
>>     Physical Addresses
>>
>>     The memory type for shared access semantics, which use a shared HKID,
>>     is determined as described below. Note that this is different from the
>>     way memory type is determined by the hardware during non-root mode
>>     operation. Rather, it is a best-effort approximation that is designed
>>     to still allow the host VMM some control over memory type.
>>       • For shared access during host-side (SEAMCALL) flows, the memory
>>         type is determined by MTRRs.
>>       • For shared access during guest-side flows (VM exit from the guest
>>         TD), the memory type is determined by a combination of the Shared
>>         EPT and MTRRs.
>>         o If the memory type determined during Shared EPT walk is WB, then
>>           the effective memory type for the access is determined by MTRRs.
>>         o Else, the effective memory type for the access is UC.
>>
>> My understanding is that guest MTRR doesn't affect the memory type for
>> private memory. So we don't need to zap private memory mappings.
> 
> Right, KVM can't zap the private side.
> 
> But why does KVM have to support a "best effort" MTRR virtualization for TDs? Kai pointed me to this
> today and I haven't looked through it in depth yet:
> https://lore.kernel.org/kvm/20240309010929.1403984-1-seanjc@google.com/
> 
> An alternative could be to mirror that behavior, but normal VMs have to work with existing userspace
> setup. KVM doesn't support any TDs yet, so we can take the opportunity to not introduce weird
> things.

Not providing any MTRR support for TDs is what I prefer.

>>
>>>>>> But guests won't accept memory again because no one
>>>>>> currently requests guests to do this after writes to MTRR MSRs. In this case,
>>>>>> guests may access unaccepted memory, causing infinite EPT violation loop
>>>>>> (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
>>>>>> the host. But I think it would be better if we can avoid wasting CPU resource
>>>>>> on the useless EPT violation loop.
>>>>>
>>>>> Qemu is expected to do it correctly.  There are many ways for userspace to go
>>>>> wrong.  This isn't specific to MTRR MSR.
>>>>
>>>> This seems incorrect. KVM shouldn't force userspace to filter some
>>>> specific MSRs. The semantic of MSR filter is userspace configures it on
>>>> its own will, not KVM requires to do so.
>>>
>>> I'm ok just always doing the exit to userspace on attempt to use MTRRs in a TD, and not rely on
>>> the
>>> MSR list. At least I don't see the problem.
>>
>> What is the exit reason in vcpu->run->exit_reason?
>> KVM_EXIT_X86_RDMSR/WRMSR? If so, it breaks the ABI on
>> KVM_EXIT_X86_RDMSR/WRMSR.
> 
> How so? Userspace needs to learn to create a TD first.

The current ABI of KVM_EXIT_X86_RDMSR/WRMSR is that userspace itself 
sets up an MSR filter first; then it gets that EXIT_REASON when the 
guest accesses the MSRs being filtered.

If you want to use this EXIT reason, then you need to enforce userspace 
setting up the MSR filter. How do you enforce that? If it is not enforced, and 
KVM exits with KVM_EXIT_X86_RDMSR/WRMSR whether or not userspace set up an MSR 
filter, then you are introducing divergent behavior in KVM.
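
For reference, the existing contract looks roughly like this on the
userspace side (a sketch only; the MSR index and the reply values are
illustrative):

#include <linux/kvm.h>

/* Called from the vcpu loop after ioctl(vcpu_fd, KVM_RUN, 0) returns. */
static void handle_user_msr_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_X86_RDMSR:
		if (run->msr.index == 0x2ff) {	/* MSR_MTRRdefType */
			run->msr.data = 6;	/* MTRR_TYPE_WRBACK */
			run->msr.error = 0;	/* complete the read */
		} else {
			run->msr.error = 1;	/* fail the access */
		}
		break;
	case KVM_EXIT_X86_WRMSR:
		/* Either eat the write (error = 0) or fail it (error = 1). */
		run->msr.error = 1;
		break;
	}
}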
Xiaoyao Li March 28, 2024, 1:04 a.m. UTC | #26
On 3/28/2024 8:36 AM, Isaku Yamahata wrote:
> On Thu, Mar 28, 2024 at 08:06:53AM +0800,
> Xiaoyao Li <xiaoyao.li@intel.com> wrote:
> 
>> On 3/28/2024 1:36 AM, Edgecombe, Rick P wrote:
>>> On Wed, 2024-03-27 at 10:54 +0800, Xiaoyao Li wrote:
>>>>>> If QEMU doesn't configure the msr filter list correctly, KVM has to handle
>>>>>> guest's MTRR MSR accesses. In my understanding, the
>>>>>> suggestion is KVM zap private memory mappings.
>>
>> TDX spec states that
>>
>>    18.2.1.4.1 Memory Type for Private and Opaque Access
>>
>>    The memory type for private and opaque access semantics, which use a
>>    private HKID, is WB.
>>
>>    18.2.1.4.2 Memory Type for Shared Accesses
>>
>>    Intel SDM, Vol. 3, 28.2.7.2 Memory Type Used for Translated Guest-
>>    Physical Addresses
>>
>>    The memory type for shared access semantics, which use a shared HKID,
>>    is determined as described below. Note that this is different from the
>>    way memory type is determined by the hardware during non-root mode
>>    operation. Rather, it is a best-effort approximation that is designed
>>    to still allow the host VMM some control over memory type.
>>      • For shared access during host-side (SEAMCALL) flows, the memory
>>        type is determined by MTRRs.
>>      • For shared access during guest-side flows (VM exit from the guest
>>        TD), the memory type is determined by a combination of the Shared
>>        EPT and MTRRs.
>>        o If the memory type determined during Shared EPT walk is WB, then
>>          the effective memory type for the access is determined by MTRRs.
>>        o Else, the effective memory type for the access is UC.
>>
>> My understanding is that guest MTRR doesn't affect the memory type for
>> private memory. So we don't need to zap private memory mappings.
> 
> So, there is no point in (trying to) emulate MTRR.  The direction is: don't
> advertise MTRR to the guest (a new TDX module is needed) or force
> the guest not to use MTRR (guest command line clearcpuid=mtrr).

Ideally, it would be better if the TD guest learned to disable/not use MTRR 
itself.

> KVM will
> simply return an error on guest access to the MTRR-related registers.
> 
> QEMU or user space VMM can use the MSR filter if they want.
> 
> 
>>>>>> But guests won't accept memory again because no one
>>>>>> currently requests guests to do this after writes to MTRR MSRs. In this case,
>>>>>> guests may access unaccepted memory, causing infinite EPT violation loop
>>>>>> (assume SEPT_VE_DISABLE is set). This won't impact other guests/workloads on
>>>>>> the host. But I think it would be better if we can avoid wasting CPU resource
>>>>>> on the useless EPT violation loop.
>>>>>
>>>>> Qemu is expected to do it correctly.  There are many ways for userspace to go
>>>>> wrong.  This isn't specific to MTRR MSR.
>>>>
>>>> This seems incorrect. KVM shouldn't force userspace to filter some
>>>> specific MSRs. The semantic of MSR filter is userspace configures it on
>>>> its own will, not KVM requires to do so.
>>>
>>> I'm ok just always doing the exit to userspace on attempt to use MTRRs in a TD, and not rely on the
>>> MSR list. At least I don't see the problem.
>>
>> What is the exit reason in vcpu->run->exit_reason? KVM_EXIT_X86_RDMSR/WRMSR?
>> If so, it breaks the ABI on KVM_EXIT_X86_RDMSR/WRMSR.
> 
> It's only when the user space requested it with the MSR filter.

Right. But userspace has no reason to filter them, because userspace can 
do nothing except 1) kill the TD, or 2) eat the instruction.
Edgecombe, Rick P March 28, 2024, 1:06 a.m. UTC | #27
On Thu, 2024-03-28 at 08:58 +0800, Xiaoyao Li wrote:
> > How so? Userspace needs to learn to create a TD first.
> 
> The current ABI of KVM_EXIT_X86_RDMSR/WRMSR is that userspace itself 
> sets up MSR fitler at first, then it will get such EXIT_REASON when 
> guest accesses the MSRs being filtered.
> 
> If you want to use this EXIT reason, then you need to enforce userspace 
> setting up the MSR filter. How to enforce?

I think Isaku's proposal was to let userspace configure it.

For the sake of conversation, what if we don't enforce it? The downside of not enforcing it is that
we then need to worry about the code paths in KVM that MTRR handling would hit. But what goes wrong
functionally? If userspace doesn't fully set up a TD, things can go wrong for the TD.

A plus side of using the MSR filter stuff is it reuses existing functionality.

>  If not enforce, but exit with 
> KVM_EXIT_X86_RDMSR/WRMSR no matter usersapce sets up MSR filter or not. 
> Then you are trying to introduce divergent behavior in KVM.

The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this is
any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different exit
reason or behavior in mind?
Xiaoyao Li March 28, 2024, 1:30 a.m. UTC | #28
On 3/28/2024 9:06 AM, Edgecombe, Rick P wrote:
> On Thu, 2024-03-28 at 08:58 +0800, Xiaoyao Li wrote:
>>> How so? Userspace needs to learn to create a TD first.
>>
>> The current ABI of KVM_EXIT_X86_RDMSR/WRMSR is that userspace itself
>> sets up an MSR filter first; then it gets that EXIT_REASON when the
>> guest accesses the MSRs being filtered.
>>
>> If you want to use this EXIT reason, then you need to enforce userspace
>> setting up the MSR filter. How do you enforce that?
> 
> I think Isaku's proposal was to let userspace configure it.
> 
> For the sake of conversation, what if we don't enforce it? The downside of not enforcing it is that
> we then need to worry about the code paths in KVM that MTRR handling would hit. But what goes wrong
> functionally? If userspace doesn't fully set up a TD, things can go wrong for the TD.
> 
> A plus side of using the MSR filter stuff is it reuses existing functionality.
> 
>>   If it is not enforced, and KVM exits with
>> KVM_EXIT_X86_RDMSR/WRMSR whether or not userspace set up an MSR filter,
>> then you are introducing divergent behavior in KVM.
> 
> The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this is
> any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different exit
> reason or behavior in mind?

Just return error on TDVMCALL of RDMSR/WRMSR on TD's access of MTRR MSRs.
Edgecombe, Rick P March 28, 2024, 3:04 a.m. UTC | #29
On Thu, 2024-03-28 at 09:30 +0800, Xiaoyao Li wrote:
> > The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this
> > is
> > any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different
> > exit
> > reason or behavior in mind?
> 
> Just return error on TDVMCALL of RDMSR/WRMSR on TD's access of MTRR MSRs.

MTRR appears to be configured to be type "Fixed" in the TDX module. So the guest could expect to be
able to use it and be surprised by a #GP.

        {
          "MSB": "12",
          "LSB": "12",
          "Field Size": "1",
          "Field Name": "MTRR",
          "Configuration Details": null,
          "Bit or Field Virtualization Type": "Fixed",
          "Virtualization Details": "0x1"
        },

If KVM does not support MTRRs in TDX, then it has to return the error somewhere or pretend to
support it (do nothing but not return an error). Returning an error to the guest would be making up
arch behavior, and to a lesser degree so would ignoring the WRMSR. So that is why I lean towards
returning to userspace and giving the VMM the option to ignore it, return an error to the guest or
show an error to the user. If KVM can't support the behavior, better to get an actual error in
userspace than a mysterious guest hang, right?

Outside of what kind of exit it is, do you object to the general plan to punt to userspace?

Since this is a TDX specific limitation, I guess there is KVM_EXIT_TDX_VMCALL as a general category
of TDVMCALLs that cannot be handled by KVM.
Xiaoyao Li March 28, 2024, 3:40 a.m. UTC | #30
On 3/28/2024 11:04 AM, Edgecombe, Rick P wrote:
> On Thu, 2024-03-28 at 09:30 +0800, Xiaoyao Li wrote:
>>> The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this
>>> is
>>> any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different
>>> exit
>>> reason or behavior in mind?
>>
>> Just return error on TDVMCALL of RDMSR/WRMSR on TD's access of MTRR MSRs.
> 
> MTRR appears to be configured to be type "Fixed" in the TDX module. So the guest could expect to be
> able to use it and be surprised by a #GP.
> 
>          {
>            "MSB": "12",
>            "LSB": "12",
>            "Field Size": "1",
>            "Field Name": "MTRR",
>            "Configuration Details": null,
>            "Bit or Field Virtualization Type": "Fixed",
>            "Virtualization Details": "0x1"
>          },
> 
> If KVM does not support MTRRs in TDX, then it has to return the error somewhere or pretend to
> support it (do nothing but not return an error). Returning an error to the guest would be making up
> arch behavior, and to a lesser degree so would ignoring the WRMSR. 

The root cause is that it's a bad design of TDX to make MTRR fixed1. 
When the guest reads the MTRR CPUID as 1 while getting a #VE on MTRR MSRs, it 
already breaks the architectural behavior. (MCA faces a similar issue: 
MCA is fixed1 as well, while accessing the MCA-related MSRs gets a #VE. This 
is why TDX is going to fix them by introducing a new feature that makes them 
configurable.)

> So that is why I lean towards
> returning to userspace and giving the VMM the option to ignore it, return an error to the guest or
> show an error to the user. 

"show an error to the user" doesn't help at all. Because user cannot fix 
it, nor does QEMU.

> If KVM can't support the behavior, better to get an actual error in
> userspace than a mysterious guest hang, right?
What behavior do you mean?

> Outside of what kind of exit it is, do you object to the general plan to punt to userspace?
> 
> Since this is a TDX specific limitation, I guess there is KVM_EXIT_TDX_VMCALL as a general category
> of TDVMCALLs that cannot be handled by KVM.

I just don't see any difference between handling it in KVM and handling 
it in userspace: either a) return error to guest or b) ignore the WRMSR.
Chao Gao March 28, 2024, 9:53 a.m. UTC | #31
On Thu, Mar 28, 2024 at 08:06:53AM +0800, Xiaoyao Li wrote:
>On 3/28/2024 1:36 AM, Edgecombe, Rick P wrote:
>> On Wed, 2024-03-27 at 10:54 +0800, Xiaoyao Li wrote:
>> > > > If QEMU doesn't configure the msr filter list correctly, KVM has to handle
>> > > > guest's MTRR MSR accesses. In my understanding, the
>> > > > suggestion is KVM zap private memory mappings.
>
>TDX spec states that
>
>  18.2.1.4.1 Memory Type for Private and Opaque Access
>
>  The memory type for private and opaque access semantics, which use a
>  private HKID, is WB.
>
>  18.2.1.4.2 Memory Type for Shared Accesses
>
>  Intel SDM, Vol. 3, 28.2.7.2 Memory Type Used for Translated Guest-
>  Physical Addresses
>
>  The memory type for shared access semantics, which use a shared HKID,
>  is determined as described below. Note that this is different from the
>  way memory type is determined by the hardware during non-root mode
>  operation. Rather, it is a best-effort approximation that is designed
>  to still allow the host VMM some control over memory type.
>    • For shared access during host-side (SEAMCALL) flows, the memory
>      type is determined by MTRRs.
>    • For shared access during guest-side flows (VM exit from the guest
>      TD), the memory type is determined by a combination of the Shared
>      EPT and MTRRs.
>      o If the memory type determined during Shared EPT walk is WB, then
>        the effective memory type for the access is determined by MTRRs.
>      o Else, the effective memory type for the access is UC.
>
>My understanding is that guest MTRR doesn't affect the memory type for
>private memory. So we don't need to zap private memory mappings.

This isn't related to the discussion. IIUC, this is the memory type used
by TDX module code to access shared/private memory.

I didn't suggest zapping private memory. It is my understanding of what
we will end up with if KVM relies on QEMU to filter the MTRR MSRs but somehow
QEMU fails to do that.
Chao Gao March 28, 2024, 10:17 a.m. UTC | #32
On Thu, Mar 28, 2024 at 11:40:27AM +0800, Xiaoyao Li wrote:
>On 3/28/2024 11:04 AM, Edgecombe, Rick P wrote:
>> On Thu, 2024-03-28 at 09:30 +0800, Xiaoyao Li wrote:
>> > > The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this
>> > > is
>> > > any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different
>> > > exit
>> > > reason or behavior in mind?
>> > 
>> > Just return error on TDVMCALL of RDMSR/WRMSR on TD's access of MTRR MSRs.
>> 
>> MTRR appears to be configured to be type "Fixed" in the TDX module. So the guest could expect to be
>> able to use it and be surprised by a #GP.
>> 
>>          {
>>            "MSB": "12",
>>            "LSB": "12",
>>            "Field Size": "1",
>>            "Field Name": "MTRR",
>>            "Configuration Details": null,
>>            "Bit or Field Virtualization Type": "Fixed",
>>            "Virtualization Details": "0x1"
>>          },
>> 
>> If KVM does not support MTRRs in TDX, then it has to return the error somewhere or pretend to
>> support it (do nothing but not return an error). Returning an error to the guest would be making up
>> arch behavior, and to a lesser degree so would ignoring the WRMSR.
>
>The root cause is that it's a bad design of TDX to make MTRR fixed1. When
>guest reads MTRR CPUID as 1 while getting #VE on MTRR MSRs, it already breaks
>the architectural behavior. (MCA faces a similar issue, MCA is fixed1 as

I won't say #VE on MTRR MSRs breaks anything. Writes to other MSRs (e.g.
TSC_DEADLINE MSR) also lead to #VE. If KVM can emulate the MSR accesses, #VE
should be fine.

The problem is: MTRR CPUID feature is fixed 1 while KVM/QEMU doesn't know how
to virtualize MTRR especially given that KVM cannot control the memory type in
secure-EPT entries.

>well while accessing MCA related MSRs gets #VE. This is why TDX is going to
>fix them by introducing new feature and make them configurable)
>
>> So that is why I lean towards
>> returning to userspace and giving the VMM the option to ignore it, return an error to the guest or
>> show an error to the user.
>
>"show an error to the user" doesn't help at all. Because user cannot fix it,
>nor does QEMU.

The key point isn't who can fix/emulate the MTRR MSRs. It is just that KVM doesn't know
how to handle this situation and asks userspace for help.

Whether or how userspace can handle the MSR writes isn't KVM's problem. It may be
better if KVM can tell userspace exactly in which cases KVM will exit to
userspace. But there is no such infrastructure.

An example: in the KVM CET series, we found it is complex for the KVM instruction
emulator to emulate control-flow instructions when CET is enabled. The
suggestion there was also to punt to userspace (w/o any indication to userspace that
KVM would do this).

>
>> If KVM can't support the behavior, better to get an actual error in
>> userspace than a mysterious guest hang, right?
>What behavior do you mean?
>
>> Outside of what kind of exit it is, do you object to the general plan to punt to userspace?
>> 
>> Since this is a TDX specific limitation, I guess there is KVM_EXIT_TDX_VMCALL as a general category
>> of TDVMCALLs that cannot be handled by KVM.

Using KVM_EXIT_TDX_VMCALL looks fine.

We need to explain why MTRR MSRs are handled in this way unlike other MSRs.

It is better if KVM can tell userspace that MTRR virtualization isn't supported
by KVM for TDs. Then userspace should resolve the conflict between KVM and TDX
module on MTRR. But to report MTRR as unsupported, we need to make
GET_SUPPORTED_CPUID a vm-scope ioctl. I am not sure if it is worth the effort.


>
>I just don't see any difference between handling it in KVM and handling it in
>userspace: either a) return error to guest or b) ignore the WRMSR.
Xiaoyao Li March 28, 2024, 1:21 p.m. UTC | #33
On 3/28/2024 6:17 PM, Chao Gao wrote:
> On Thu, Mar 28, 2024 at 11:40:27AM +0800, Xiaoyao Li wrote:
>> On 3/28/2024 11:04 AM, Edgecombe, Rick P wrote:
>>> On Thu, 2024-03-28 at 09:30 +0800, Xiaoyao Li wrote:
>>>>> The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this
>>>>> is
>>>>> any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different
>>>>> exit
>>>>> reason or behavior in mind?
>>>>
>>>> Just return error on TDVMCALL of RDMSR/WRMSR on TD's access of MTRR MSRs.
>>>
>>> MTRR appears to be configured to be type "Fixed" in the TDX module. So the guest could expect to be
>>> able to use it and be surprised by a #GP.
>>>
>>>           {
>>>             "MSB": "12",
>>>             "LSB": "12",
>>>             "Field Size": "1",
>>>             "Field Name": "MTRR",
>>>             "Configuration Details": null,
>>>             "Bit or Field Virtualization Type": "Fixed",
>>>             "Virtualization Details": "0x1"
>>>           },
>>>
>>> If KVM does not support MTRRs in TDX, then it has to return the error somewhere or pretend to
>>> support it (do nothing but not return an error). Returning an error to the guest would be making up
>>> arch behavior, and to a lesser degree so would ignoring the WRMSR.
>>
>> The root cause is that it's a bad design of TDX to make MTRR fixed1. When
>> guest reads MTRR CPUID as 1 while getting #VE on MTRR MSRs, it already breaks
>> > the architectural behavior. (MCA faces a similar issue, MCA is fixed1 as
> 
> I won't say #VE on MTRR MSRs breaks anything. Writes to other MSRs (e.g.
> TSC_DEADLINE MSR) also lead to #VE. If KVM can emulate the MSR accesses, #VE
> should be fine.
> 
> The problem is: MTRR CPUID feature is fixed 1 while KVM/QEMU doesn't know how
> to virtualize MTRR especially given that KVM cannot control the memory type in
> secure-EPT entries.

yes, I partly agree on that "#VE on MTRR MSRs breaks anything". #VE is 
not a problem, the problem is if the #VE is opt-in or unconditional.

For the TSC_DEADLINE MSR, #VE is actually opt-in. 
CPUID(1).ECX[24].TSC_DEADLINE is configurable by the VMM. Only when the VMM 
configures the bit to 1 will the TD guest get a #VE. If the VMM configures it 
to 0, the TD guest just gets a #GP. This is the reasonable design.

>> well while accessing MCA related MSRs gets #VE. This is why TDX is going to
>> fix them by introducing new feature and make them configurable)
>>
>>> So that is why I lean towards
>>> returning to userspace and giving the VMM the option to ignore it, return an error to the guest or
>>> show an error to the user.
>>
>> "show an error to the user" doesn't help at all. Because user cannot fix it,
>> nor does QEMU.
> 
> The key point isn't who can fix/emulate MTRR MSRs. It is just KVM doesn't know
> how to handle this situation and ask userspace for help.
> 
> Whether or how userspace can handle the MSR writes isn't KVM's problem. It may be
> better if KVM can tell userspace exactly in which cases KVM will exit to
> userspace. But there is no such an infrastructure.
> 
> An example is: in KVM CET series, we find it is complex for KVM instruction
> emulator to emulate control flow instructions when CET is enabled. The
> suggestion is also to punt to userspace (w/o any indication to userspace that
> KVM would do this).

Please point me to the decision on CET? I'm interested in how userspace can 
help with that.

>>
>>> If KVM can't support the behavior, better to get an actual error in
>>> userspace than a mysterious guest hang, right?
>> What behavior do you mean?
>>
>>> Outside of what kind of exit it is, do you object to the general plan to punt to userspace?
>>>
>>> Since this is a TDX specific limitation, I guess there is KVM_EXIT_TDX_VMCALL as a general category
>>> of TDVMCALLs that cannot be handled by KVM.
> 
> Using KVM_EXIT_TDX_VMCALL looks fine.
> 
> We need to explain why MTRR MSRs are handled in this way unlike other MSRs.
> 
> It is better if KVM can tell userspace that MTRR virtualization isn't supported
> by KVM for TDs. Then userspace should resolve the conflict between KVM and TDX
> module on MTRR. But to report MTRR as unsupported, we need to make
> GET_SUPPORTED_CPUID a vm-scope ioctl. I am not sure if it is worth the effort.

My memory is that Sean disliked the vm-scope GET_SUPPORTED_CPUID for TDX 
when he was at Intel.

Anyway, we can provide a TDX-specific interface to report SUPPORTED_CPUID 
in KVM_TDX_CAPABILITIES, if we really need it.

> 
>>
>> I just don't see any difference between handling it in KVM and handling it in
>> userspace: either a) return error to guest or b) ignore the WRMSR.
Chao Gao March 28, 2024, 1:38 p.m. UTC | #34
On Thu, Mar 28, 2024 at 09:21:37PM +0800, Xiaoyao Li wrote:
>On 3/28/2024 6:17 PM, Chao Gao wrote:
>> On Thu, Mar 28, 2024 at 11:40:27AM +0800, Xiaoyao Li wrote:
>> > On 3/28/2024 11:04 AM, Edgecombe, Rick P wrote:
>> > > On Thu, 2024-03-28 at 09:30 +0800, Xiaoyao Li wrote:
>> > > > > The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this
>> > > > > is
>> > > > > any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different
>> > > > > exit
>> > > > > reason or behavior in mind?
>> > > > 
>> > > > Just return error on TDVMCALL of RDMSR/WRMSR on TD's access of MTRR MSRs.
>> > > 
>> > > MTRR appears to be configured to be type "Fixed" in the TDX module. So the guest could expect to be
>> > > able to use it and be surprised by a #GP.
>> > > 
>> > >           {
>> > >             "MSB": "12",
>> > >             "LSB": "12",
>> > >             "Field Size": "1",
>> > >             "Field Name": "MTRR",
>> > >             "Configuration Details": null,
>> > >             "Bit or Field Virtualization Type": "Fixed",
>> > >             "Virtualization Details": "0x1"
>> > >           },
>> > > 
>> > > If KVM does not support MTRRs in TDX, then it has to return the error somewhere or pretend to
>> > > support it (do nothing but not return an error). Returning an error to the guest would be making up
>> > > arch behavior, and to a lesser degree so would ignoring the WRMSR.
>> > 
>> > The root cause is that it's a bad design of TDX to make MTRR fixed1. When
>> > guest reads MTRR CPUID as 1 while getting #VE on MTRR MSRs, it already breaks
>> > the architectural behavior. (MCA faces a similar issue, MCA is fixed1 as
>> 
>> I won't say #VE on MTRR MSRs breaks anything. Writes to other MSRs (e.g.
>> TSC_DEADLINE MSR) also lead to #VE. If KVM can emulate the MSR accesses, #VE
>> should be fine.
>> 
>> The problem is: MTRR CPUID feature is fixed 1 while KVM/QEMU doesn't know how
>> to virtualize MTRR especially given that KVM cannot control the memory type in
>> secure-EPT entries.
>
>yes, I partly agree on that "#VE on MTRR MSRs breaks anything". #VE is not a
>problem, the problem is if the #VE is opt-in or unconditional.

 From the guest's p.o.v., there is no difference: the guest doesn't know whether a feature
is opted in or not.

>
>For the TSC_DEADLINE_MSR, #VE is opt-in actually.
>CPUID(1).ECX[24].TSC_DEADLINE is configurable by VMM. Only when VMM
>configures the bit to 1, will the TD guest get #VE. If VMM configures it to
>0, TD guest just gets #GP. This is the reasonable design.
>
>> > well while accessing MCA related MSRs gets #VE. This is why TDX is going to
>> > fix them by introducing new feature and make them configurable)
>> > 
>> > > So that is why I lean towards
>> > > returning to userspace and giving the VMM the option to ignore it, return an error to the guest or
>> > > show an error to the user.
>> > 
>> > "show an error to the user" doesn't help at all. Because user cannot fix it,
>> > nor does QEMU.
>> 
>> The key point isn't who can fix/emulate MTRR MSRs. It is just KVM doesn't know
>> how to handle this situation and ask userspace for help.
>> 
>> Whether or how userspace can handle the MSR writes isn't KVM's problem. It may be
>> better if KVM can tell userspace exactly in which cases KVM will exit to
>> userspace. But there is no such an infrastructure.
>> 
>> An example is: in KVM CET series, we find it is complex for KVM instruction
>> emulator to emulate control flow instructions when CET is enabled. The
>> suggestion is also to punt to userspace (w/o any indication to userspace that
>> KVM would do this).
>
>Please point me to decision of CET? I'm interested in how userspace can help
>on that.

https://lore.kernel.org/kvm/ZZgsipXoXTKyvCZT@google.com/

>
>> > 
>> > > If KVM can't support the behavior, better to get an actual error in
>> > > userspace than a mysterious guest hang, right?
>> > What behavior do you mean?
>> > 
>> > > Outside of what kind of exit it is, do you object to the general plan to punt to userspace?
>> > > 
>> > > Since this is a TDX specific limitation, I guess there is KVM_EXIT_TDX_VMCALL as a general category
>> > > of TDVMCALLs that cannot be handled by KVM.
>> 
>> Using KVM_EXIT_TDX_VMCALL looks fine.
>> 
>> We need to explain why MTRR MSRs are handled in this way unlike other MSRs.
>> 
>> It is better if KVM can tell userspace that MTRR virtualization isn't supported
>> by KVM for TDs. Then userspace should resolve the conflict between KVM and TDX
>> module on MTRR. But to report MTRR as unsupported, we need to make
>> GET_SUPPORTED_CPUID a vm-scope ioctl. I am not sure if it is worth the effort.
>
>My memory is that Sean disliked the vm-scope GET_SUPPORTED_CPUID for TDX when
>he was at Intel.

Ok. No strong opinion on this.
Xiaoyao Li March 28, 2024, 2:45 p.m. UTC | #35
On 3/28/2024 9:38 PM, Chao Gao wrote:
> On Thu, Mar 28, 2024 at 09:21:37PM +0800, Xiaoyao Li wrote:
>> On 3/28/2024 6:17 PM, Chao Gao wrote:
>>> On Thu, Mar 28, 2024 at 11:40:27AM +0800, Xiaoyao Li wrote:
>>>> On 3/28/2024 11:04 AM, Edgecombe, Rick P wrote:
>>>>> On Thu, 2024-03-28 at 09:30 +0800, Xiaoyao Li wrote:
>>>>>>> The current ABI of KVM_EXIT_X86_RDMSR when TDs are created is nothing. So I don't see how this
>>>>>>> is
>>>>>>> any kind of ABI break. If you agree we shouldn't try to support MTRRs, do you have a different
>>>>>>> exit
>>>>>>> reason or behavior in mind?
>>>>>>
>>>>>> Just return error on TDVMCALL of RDMSR/WRMSR on TD's access of MTRR MSRs.
>>>>>
>>>>> MTRR appears to be configured to be type "Fixed" in the TDX module. So the guest could expect to be
>>>>> able to use it and be surprised by a #GP.
>>>>>
>>>>>            {
>>>>>              "MSB": "12",
>>>>>              "LSB": "12",
>>>>>              "Field Size": "1",
>>>>>              "Field Name": "MTRR",
>>>>>              "Configuration Details": null,
>>>>>              "Bit or Field Virtualization Type": "Fixed",
>>>>>              "Virtualization Details": "0x1"
>>>>>            },
>>>>>
>>>>> If KVM does not support MTRRs in TDX, then it has to return the error somewhere or pretend to
>>>>> support it (do nothing but not return an error). Returning an error to the guest would be making up
>>>>> arch behavior, and to a lesser degree so would ignoring the WRMSR.
>>>>
>>>> The root cause is that it's a bad design of TDX to make MTRR fixed1. When
>>>> guest reads MTRR CPUID as 1 while getting #VE on MTRR MSRs, it already breaks
>>>> the architectural behavior. (MCA faces a similar issue, MCA is fixed1 as
>>>
>>> I won't say #VE on MTRR MSRs breaks anything. Writes to other MSRs (e.g.
>>> TSC_DEADLINE MSR) also lead to #VE. If KVM can emulate the MSR accesses, #VE
>>> should be fine.
>>>
>>> The problem is: MTRR CPUID feature is fixed 1 while KVM/QEMU doesn't know how
>>> to virtualize MTRR especially given that KVM cannot control the memory type in
>>> secure-EPT entries.
>>
>> yes, I partly agree on that "#VE on MTRR MSRs breaks anything". #VE is not a
>> problem, the problem is if the #VE is opt-in or unconditional.
> 
>  From guest's p.o.v, there is no difference: the guest doesn't know whether a feature
> is opted in or not.

I don't argue that it makes any difference to the guest. I argue that it is a bad 
design of TDX to make MTRR fixed1, which leaves a tough problem to the 
VMM. The TDX architecture is the one to blame.

Though TDX is going to change it, we have to come up with something to handle 
the current, existing TDX if we want to support it.

I have no objection to leaving it to userspace via KVM_EXIT_TDX_VMCALL. 
If we go this path, I would suggest returning an error to the TD guest on the 
QEMU side (when I prepare the QEMU patch for it), because QEMU cannot emulate 
it either.
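
A rough sketch of that QEMU-side handling, assuming the
KVM_EXIT_TDX_VMCALL layout from this series (run->tdx.u.vmcall with the
TDVMCALL leaf in "subfunction" and the MSR index in "in_r12") and a
hypothetical is_mtrr_msr() helper; field names and the status value may
change before merge:

#define TDVMCALL_INVALID_OPERAND	0x8000000000000000ULL
#define TDVMCALL_RDMSR			31	/* GHCI leaf numbers */
#define TDVMCALL_WRMSR			32

static void tdx_handle_vmcall(struct kvm_run *run)
{
	uint64_t subfn = run->tdx.u.vmcall.subfunction;
	uint64_t msr = run->tdx.u.vmcall.in_r12;

	if ((subfn == TDVMCALL_RDMSR || subfn == TDVMCALL_WRMSR) &&
	    is_mtrr_msr(msr)) {		/* hypothetical helper */
		/* Tell the user why, then fail the TDVMCALL. */
		warn_report("TDX: MTRR MSR 0x%" PRIx64 " is unsupported", msr);
		run->tdx.u.vmcall.status_code = TDVMCALL_INVALID_OPERAND;
	}
}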
Edgecombe, Rick P March 28, 2024, 4:57 p.m. UTC | #36
On Thu, 2024-03-28 at 22:45 +0800, Xiaoyao Li wrote:
> I don't argue it makes any difference to guest. I argue that it is a bad 
> design of TDX to make MTRR fixed1, which leaves the tough problem to 
> VMM. TDX architecture is one should be blamed.

I didn't see anyone arguing against this point. It does seem strange to force something to be
exposed that can't be fully handled. I wonder what the history is.

> 
> Though TDX is going to change it, we have to come up something to handle 
> with current existing TDX if we want to support them.

Right. The things being discussed are untenable as long-term solutions. The question is whether there is
anything acceptable in the meantime. That is the goal of this thread.

We do have the other option of waiting for a new TDX module that could fix it better, but I thought
exiting to userspace for the time being would be a way to move forward.

Would be great to have a maintainer chime in on this point.

> 
> I have no objection of leaving it to userspace, via KVM_EXIT_TDX_VMCALL. 
> If we go this path, I would suggest return error to TD guest on QEMU 
> side (when I prepare the QEMU patch for it) because QEMU cannot emulate 
> it neither.

It would be nice to give the user (the user of qemu) some notice of what is going on. For Linux
the workaround is clearcpuid=mtrr. qemu could print something like "MTRRs not supported", or whatever
message fits. Then the user can see what the problem is and add that option to the kernel
command line. If they just see a guest crash because the guest can't handle the error, they will have to
debug it.
Chao Gao April 17, 2024, 2:21 a.m. UTC | #37
On Mon, Feb 26, 2024 at 12:26:01AM -0800, isaku.yamahata@intel.com wrote:
>@@ -779,6 +780,10 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> 
> 	lockdep_assert_held_write(&kvm->mmu_lock);
> 
>+	WARN_ON_ONCE(zap_private && !is_private_sp(root));
>+	if (!zap_private && is_private_sp(root))
>+		return false;

Should be "return flush;".

Fengwei and I spent one week chasing a bug where virtio-net in the TD guest may
stop working at some point after bootup if the host enables numad. We finally
found that the bug was introduced by the 'return false' statement, which left
some stale EPT entries unflushed.

I am wondering if we can refactor the related functions slightly to make it harder
to make such mistakes and easier to identify them. E.g., we could make
"@flush" an in/out parameter of tdp_mmu_zap_leafs(), kvm_tdp_mmu_zap_leafs()
and kvm_tdp_mmu_unmap_gfn_range(). It would be more apparent that "*flush = false"
below could be problematic if the changes were something like:

	if (!zap_private && is_private_sp(root)) {
		*flush = false;
		return;
	}
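
Sketched out (untested), the in/out variant could look like this, with
the loop body elided:

static void tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield,
			      bool zap_private, bool *flush)
{
	struct tdp_iter iter;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/* Skipping a root now leaves the caller's accumulated value intact. */
	if (!zap_private && is_private_sp(root))
		return;

	rcu_read_lock();
	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		/* ... zap the leaf SPTE as before ... */
		*flush = true;	/* set only when an SPTE was actually zapped */
	}
	rcu_read_unlock();
}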


>+
> 	rcu_read_lock();
> 
> 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
>@@ -810,13 +815,15 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
>  * more SPTEs were zapped since the MMU lock was last acquired.
>  */
>-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
>+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush,
>+			   bool zap_private)
> {
> 	struct kvm_mmu_page *root;
> 
> 	lockdep_assert_held_write(&kvm->mmu_lock);
> 	for_each_tdp_mmu_root_yield_safe(kvm, root)
>-		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
>+		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush,
>+					  zap_private && is_private_sp(root));
> 
> 	return flush;
> }
>@@ -891,7 +898,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
>  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
>  * See kvm_tdp_mmu_get_vcpu_root_hpa().
>  */
>-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
>+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private)
> {
> 	struct kvm_mmu_page *root;
> 
>@@ -916,6 +923,12 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
> 	 * or get/put references to roots.
> 	 */
> 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
>+		/*
>+		 * Skip private root since private page table
>+		 * is only torn down when VM is destroyed.
>+		 */
>+		if (skip_private && is_private_sp(root))
>+			continue;
> 		/*
> 		 * Note, invalid roots can outlive a memslot update!  Invalid
> 		 * roots must be *zapped* before the memslot update completes,
>@@ -1104,14 +1117,26 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 	return ret;
> }
> 
>+/* Used by mmu notifier via kvm_unmap_gfn_range() */
> bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
> 				 bool flush)
> {
> 	struct kvm_mmu_page *root;
>+	bool zap_private = false;
>+
>+	if (kvm_gfn_shared_mask(kvm)) {
>+		if (!range->only_private && !range->only_shared)
>+			/* attributes change */
>+			zap_private = !(range->arg.attributes &
>+					KVM_MEMORY_ATTRIBUTE_PRIVATE);
>+		else
>+			zap_private = range->only_private;
>+	}
> 
> 	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
> 		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
>-					  range->may_block, flush);
>+					  range->may_block, flush,
>+					  zap_private && is_private_sp(root));
> 
> 	return flush;
> }
>diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
>index 20d97aa46c49..b3cf58a50357 100644
>--- a/arch/x86/kvm/mmu/tdp_mmu.h
>+++ b/arch/x86/kvm/mmu/tdp_mmu.h
>@@ -19,10 +19,11 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
> 
> void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
> 
>-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
>+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush,
>+			   bool zap_private);
> bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
> void kvm_tdp_mmu_zap_all(struct kvm *kvm);
>-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
>+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private);
> void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
> 
> int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>-- 
>2.25.1
>
>
Isaku Yamahata April 17, 2024, 6:47 a.m. UTC | #38
On Wed, Apr 17, 2024 at 10:21:16AM +0800,
Chao Gao <chao.gao@intel.com> wrote:

> On Mon, Feb 26, 2024 at 12:26:01AM -0800, isaku.yamahata@intel.com wrote:
> >@@ -779,6 +780,10 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> > 
> > 	lockdep_assert_held_write(&kvm->mmu_lock);
> > 
> >+	WARN_ON_ONCE(zap_private && !is_private_sp(root));
> >+	if (!zap_private && is_private_sp(root))
> >+		return false;
> 
> Should be "return flush;".
> 
> Fengwei and I spent one week chasing a bug where virtio-net in the TD guest may
> stop working at some point after bootup if the host enables numad. We finally
> found that the bug was introduced by the 'return false' statement, which left
> some stale EPT entries unflushed.

Thank you for chasing it down.


> I am wondering if we can refactor related functions slightly to make it harder
> to make such mistakes and make it easier to identify them. e.g., we could make
> "@flush" an in/out parameter of tdp_mmu_zap_leafs(), kvm_tdp_mmu_zap_leafs()
> and kvm_tdp_mmu_unmap_gfn_range(). It looks more apparent that "*flush = false"
> below could be problematic if the changes were something like:
> 
> 	if (!zap_private && is_private_sp(root)) {
> 		*flush = false;
> 		return;
> 	}

Yes, let me look into it.
Edgecombe, Rick P April 21, 2024, 1:58 a.m. UTC | #39
On Mon, 2024-02-26 at 00:26 -0800, isaku.yamahata@intel.com wrote:
> +/* Used by mmu notifier via kvm_unmap_gfn_range() */
>  bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
>                                  bool flush)
>  {
>         struct kvm_mmu_page *root;
> +       bool zap_private = false;
> +
> +       if (kvm_gfn_shared_mask(kvm)) {
> +               if (!range->only_private && !range->only_shared)
> +                       /* attributes change */
> +                       zap_private = !(range->arg.attributes &
> +                                       KVM_MEMORY_ATTRIBUTE_PRIVATE);
> +               else
> +                       zap_private = range->only_private;
> +       }
>  
>         __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
>                 flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
> -                                         range->may_block, flush);
> +                                         range->may_block, flush,
> +                                         zap_private && is_private_sp(root));
>  



I'm trying to apply the feedback:
 - Drop MTRR support
 - Change only_private/shared to exclude_private/shared
...and update the log accordingly. These changes all intersect in this
function, and I'm having a hard time trying to justify the resulting logic.

It seems the point of passing the attributes is that:
"However, looking at kvm_mem_attrs_changed() again, I think invoking
kvm_unmap_gfn_range() from generic KVM code is a mistake and shortsighted. 
Zapping in response to *any* attribute change is very private/shared centric. 
E.g. if/when we extend attributes to provide per-page RWX protections, zapping
existing SPTEs in response to granting *more* permissions may not be necessary
or even desirable."
https://lore.kernel.org/all/ZJX0hk+KpQP0KUyB@google.com/

But I think shoving the logic for how to handle the attribute changes deep into
the zapping code is the opposite extreme. It results in confusing logic,
with the decision on what to zap spread all around.

Instead we should have kvm_arch_pre_set_memory_attributes() adjust the range so
it can tell kvm_unmap_gfn_range() which ranges to zap (private/shared).

So:
kvm_vm_set_mem_attributes() - passes attributes
kvm_arch_pre_set_memory_attributes() - chooses which private/shared alias to 
                                       zap based on attribute.
kvm_unmap_gfn_range/kvm_tdp_mmu_unmap_gfn_range - zaps the private/shared alias
tdp_mmu_zap_leafs() - doesn't care about the root type, just zaps leafs


This zapping function can then just do the simple thing it's told to do. It ends
up looking like:

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;
	bool exclude_private = false;
	bool exclude_shared = false;

	if (kvm_gfn_shared_mask(kvm)) {
		exclude_private = range->exclude_private;
		exclude_shared = range->exclude_shared;
	}

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) {
		if (exclude_private && is_private_sp(root))
			continue;
		if (exclude_shared && !is_private_sp(root))
			continue;

		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
					  range->may_block, flush);
	}

	return flush;
}

The resulting logic should be the same. Separately, we might be able to simplify
it further if we change the behavior a bit (lose the kvm_gfn_shared_mask() check
or the exclude_shared member), but in the meantime this seems a lot easier to
explain and review for what I think is equivalent behavior.
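
For illustration, losing the kvm_gfn_shared_mask() check (this assumes the
exclude_* members are never set for VMs without a shared gfn mask, which
would be a behavior change to verify) shrinks it to:

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) {
		/* Honor the exclusions directly; no VM-type check needed. */
		if (range->exclude_private && is_private_sp(root))
			continue;
		if (range->exclude_shared && !is_private_sp(root))
			continue;

		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
					  range->may_block, flush);
	}

	return flush;
}
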
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0d6d4506ec97..30c86e858ae4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6339,7 +6339,7 @@  static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
 	 */
 	if (tdp_mmu_enabled)
-		kvm_tdp_mmu_invalidate_all_roots(kvm);
+		kvm_tdp_mmu_invalidate_all_roots(kvm, true);
 
 	/*
 	 * Notify all vcpus to reload its shadow page table and flush TLB.
@@ -6459,7 +6459,16 @@  void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
 
 	if (tdp_mmu_enabled)
-		flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
+		/*
+		 * zap_private = false.  Zap only shared pages.
+		 *
+		 * kvm_zap_gfn_range() is used when the MTRR or PAT memory
+		 * type is changed.  On the next KVM page fault, the zapped
+		 * range is repopulated with SPTEs of the new memory type.
+		 * Because only WB is supported for private pages, private
+		 * pages don't need to be zapped.
+		 */
+		flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush, false);
 
 	if (flush)
 		kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
@@ -6905,10 +6914,56 @@  void kvm_arch_flush_shadow_all(struct kvm *kvm)
 	kvm_mmu_zap_all(kvm);
 }
 
+static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	bool flush = false;
+
+	write_lock(&kvm->mmu_lock);
+
+	/*
+	 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required; worst
+	 * case, we'll have unused shadow pages lying around until they are
+	 * recycled due to age or when the VM is destroyed.
+	 */
+	if (tdp_mmu_enabled) {
+		struct kvm_gfn_range range = {
+		      .slot = slot,
+		      .start = slot->base_gfn,
+		      .end = slot->base_gfn + slot->npages,
+		      .may_block = true,
+
+		      /*
+		       * This handles both private and shared gfns.
+		       * All private pages must be zapped on memslot deletion.
+		       */
+		      .only_private = true,
+		      .only_shared = true,
+		};
+
+		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush);
+	} else {
+		/* TDX supports only the TDP MMU case. */
+		WARN_ON_ONCE(1);
+		flush = true;
+	}
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	write_unlock(&kvm->mmu_lock);
+}
+
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
-	kvm_mmu_zap_all_fast(kvm);
+	if (kvm_gfn_shared_mask(kvm))
+		/*
+		 * Secure-EPT requires releasing PTs from the leaf level up.
+		 * The optimization of zapping the root PT first, along with
+		 * its child PTs, doesn't work.
+		 */
+		kvm_mmu_zap_memslot(kvm, slot);
+	else
+		kvm_mmu_zap_all_fast(kvm);
 }
 
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d47f0daf1b03..e7514a807134 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -37,7 +37,7 @@  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
 	 * ultimately frees all roots.
 	 */
-	kvm_tdp_mmu_invalidate_all_roots(kvm);
+	kvm_tdp_mmu_invalidate_all_roots(kvm, false);
 	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 
 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
@@ -771,7 +771,8 @@  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
  * operation can cause a soft lockup.
  */
 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
-			      gfn_t start, gfn_t end, bool can_yield, bool flush)
+			      gfn_t start, gfn_t end, bool can_yield, bool flush,
+			      bool zap_private)
 {
 	struct tdp_iter iter;
 
@@ -779,6 +780,10 @@  static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
+	WARN_ON_ONCE(zap_private && !is_private_sp(root));
+	if (!zap_private && is_private_sp(root))
+		return false;
+
 	rcu_read_lock();
 
 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
@@ -810,13 +815,15 @@  static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
  * more SPTEs were zapped since the MMU lock was last acquired.
  */
-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush,
+			   bool zap_private)
 {
 	struct kvm_mmu_page *root;
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 	for_each_tdp_mmu_root_yield_safe(kvm, root)
-		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
+		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush,
+					  zap_private && is_private_sp(root));
 
 	return flush;
 }
@@ -891,7 +898,7 @@  void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
  * See kvm_tdp_mmu_get_vcpu_root_hpa().
  */
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private)
 {
 	struct kvm_mmu_page *root;
 
@@ -916,6 +923,12 @@  void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 	 * or get/put references to roots.
 	 */
 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+		/*
+		 * Skip private roots, since private page tables are
+		 * only torn down when the VM is destroyed.
+		 */
+		if (skip_private && is_private_sp(root))
+			continue;
 		/*
 		 * Note, invalid roots can outlive a memslot update!  Invalid
 		 * roots must be *zapped* before the memslot update completes,
@@ -1104,14 +1117,26 @@  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	return ret;
 }
 
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 				 bool flush)
 {
 	struct kvm_mmu_page *root;
+	bool zap_private = false;
+
+	if (kvm_gfn_shared_mask(kvm)) {
+		if (!range->only_private && !range->only_shared)
+			/* attributes change */
+			zap_private = !(range->arg.attributes &
+					KVM_MEMORY_ATTRIBUTE_PRIVATE);
+		else
+			zap_private = range->only_private;
+	}
 
 	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
 		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
-					  range->may_block, flush);
+					  range->may_block, flush,
+					  zap_private && is_private_sp(root));
 
 	return flush;
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 20d97aa46c49..b3cf58a50357 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -19,10 +19,11 @@  __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
 
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush,
+			   bool zap_private);
 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private);
 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
 
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);