diff mbox series

[v2,11/15] KVM: x86/tdp_mmu: Reflect tearing down mirror page tables

Message ID 20240530210714.364118-12-rick.p.edgecombe@intel.com (mailing list archive)
State New, archived
Headers show
Series TDX MMU prep series part 1 | expand

Commit Message

Rick Edgecombe May 30, 2024, 9:07 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

Integrate hooks for mirroring page table operations for cases where TDX
will zap PTEs or free page tables.

Like other Coco technologies, TDX has the concept of private and shared
memory. For TDX the private and shared mappings are managed on separate
EPT roots. The private half is managed indirectly though calls into a
protected runtime environment called the TDX module, where the shared half
is managed within KVM in normal page tables.

Since calls into the TDX module are relatively slow, walking private page
tables by making calls into the TDX module would not be efficient. Because
of this, previous changes have taught the TDP MMU to keep a mirror root,
which is a separate, unmapped TDP root that private operations can be
directed to. Currently this root is disconnected from the guest. Now add
plumbing to "reflect" changes to the mirror to the page tables being
mirrored. Just create the x86_ops for now, leave plumbing the operations
into the TDX module for future patches.

Add two operations for tearing down page tables, one for freeing page
tables (reflect_free_spt) and one for zapping PTEs (reflect_remove_spte).
Define them such that reflect_remove_spte will perform a TLB flush as well
(in TDX terms, "ensure there are no active translations").

TDX MMU support will exclude certain MMU operations, so only plug in the
mirroring x86 ops where they will be needed. For zapping/freeing, only
hook tdp_mmu_iter_set_spte() which is used for mapping and linking
PTs. Don't bother hooking tdp_mmu_set_spte_atomic() as it is only used for
zapping PTEs in operations unsupported by TDX: zapping collapsible PTEs and
kvm_mmu_zap_all_fast().

In previous changes to address races around concurrent populating using
tdp_mmu_set_spte_atomic(), a solution was introduced to temporarily set
REMOVED_SPTE in the mirrored page tables while performing the "reflect"
operations. Such a solution is not needed for the tear down paths in TDX
as these will always be performed with the mmu_lock held for write.
Sprinkle some KVM_BUG_ON()s to reflect this.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Co-developed-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
TDX MMU Prep v2:
 - Split from "KVM: x86/tdp_mmu: Support TDX private mapping for TDP MMU"
 - Rename x86_ops from "private" to "reflect"
 - In response to "sp->mirrored_spt" rename helpers to "mirrored"
 - Remove unused present mirroring support in tdp_mmu_set_spte()
 - Merge reflect_zap_spte() into reflect_remove_spte()
 - Move mirror zapping logic out of handle_changed_spte()
 - Add some KVM_BUG_ONs
---
 arch/x86/include/asm/kvm-x86-ops.h |  2 ++
 arch/x86/include/asm/kvm_host.h    |  8 ++++++
 arch/x86/kvm/mmu/tdp_mmu.c         | 45 ++++++++++++++++++++++++++++--
 3 files changed, 53 insertions(+), 2 deletions(-)

Comments

Paolo Bonzini June 7, 2024, 11:37 a.m. UTC | #1
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  2 ++
>  arch/x86/include/asm/kvm_host.h    |  8 ++++++
>  arch/x86/kvm/mmu/tdp_mmu.c         | 45 ++++++++++++++++++++++++++++--
>  3 files changed, 53 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 1877d6a77525..dae06afc6038 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -97,6 +97,8 @@ KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
>  KVM_X86_OP(load_mmu_pgd)
>  KVM_X86_OP_OPTIONAL(reflect_link_spt)
>  KVM_X86_OP_OPTIONAL(reflect_set_spte)
> +KVM_X86_OP_OPTIONAL(reflect_free_spt)
> +KVM_X86_OP_OPTIONAL(reflect_remove_spte)
>  KVM_X86_OP(has_wbinvd_exit)
>  KVM_X86_OP(get_l2_tsc_offset)
>  KVM_X86_OP(get_l2_tsc_multiplier)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 20bb10f22ca6..0df4a31a0df9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1755,6 +1755,14 @@ struct kvm_x86_ops {
>         int (*reflect_set_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>                                 kvm_pfn_t pfn);
>
> +       /* Update mirrored page tables for page table about to be freed */
> +       int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +                               void *mirrored_spt);
> +
> +       /* Update mirrored page table from spte getting removed, and flush TLB */
> +       int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +                                  kvm_pfn_t pfn);

Again, maybe free_external_spt and zap_external_spte?

Also, please rename the last argument to pfn_for_gfn. I'm not proud of
it, but it took me over 10 minutes to understand if the pfn referred
to the gfn itself, or to the external SP that holds the spte...
There's a possibility that it isn't just me. :)

(In general, this patch took me a _lot_ to review... there were a
couple of places that left me incomprehensibly puzzled, more on this
below).

>         bool (*has_wbinvd_exit)(void);
>
>         u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 41b1d3f26597..1245f6a48dbe 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -346,6 +346,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>  }
>
> +static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
> +                                       u64 old_spte, u64 new_spte,
> +                                       int level)

new_spte is not used and can be dropped. Also, tdp_mmu_zap_external_spte?

> +{
> +       bool was_present = is_shadow_present_pte(old_spte);
> +       bool was_leaf = was_present && is_last_spte(old_spte, level);

Just put it below:

if (!is_shadow_present_pte(old_spte))
  return;

/* Here we only care about zapping the external leaf PTEs. */
if (!is_last_spte(old_spte, level))

> +       kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> +       int ret;
> +
> +       /*
> +        * Allow only leaf page to be zapped. Reclaim non-leaf page tables page

This comment left me confused, so I'll try to rephrase and see if I
can explain what happens. Correct me if I'm wrong.

The only paths to handle_removed_pt() are:
- kvm_tdp_mmu_zap_leafs()
- kvm_tdp_mmu_zap_invalidated_roots()

but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
the latter can only happen at VM destruction time.

But it's not clear why it's worth mentioning it here, or even why it
is special at all. Isn't that just what handle_removed_pt() does at
the end? Why does it matter that it's only done at VM destruction
time?

In other words, it seems to me that this comment is TMI. And if I am
wrong (which may well be), the extra information should explain the
"why" in more detail, and it should be around the call to
reflect_free_spt, not here.

> +               return;
> +       /* Zapping leaf spte is allowed only when write lock is held. */
> +       lockdep_assert_held_write(&kvm->mmu_lock);
> +       /* Because write lock is held, operation should success. */
> +       ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level, old_pfn);
> +       KVM_BUG_ON(ret, kvm);
> +}
> +
>  /**
>   * handle_removed_pt() - handle a page table removed from the TDP structure
>   *
> @@ -441,6 +464,22 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
>                 }
>                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
>                                     old_spte, REMOVED_SPTE, sp->role, shared);
> +               if (is_mirror_sp(sp)) {
> +                       KVM_BUG_ON(shared, kvm);
> +                       reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
> +               }
> +       }
> +
> +       if (is_mirror_sp(sp) &&
> +           WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp->role.level,
> +                                                         kvm_mmu_mirrored_spt(sp)))) {

Please use base_gfn and level here, instead of fishing them from sp.

> +               /*
> +                * Failed to free page table page in mirror page table and
> +                * there is nothing to do further.
> +                * Intentionally leak the page to prevent the kernel from
> +                * accessing the encrypted page.
> +                */
> +               sp->mirrored_spt = NULL;
>         }
>
>         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> @@ -778,9 +817,11 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
>         role.level = level;
>         handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);
>
> -       /* Don't support setting for the non-atomic case */
> -       if (is_mirror_sptep(sptep))
> +       if (is_mirror_sptep(sptep)) {
> +               /* Only support zapping for the non-atomic case */

Like for patch 10, this comment should point out why we never get here
for mirror SPs.

Paolo

>                 KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
> +               reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
> +       }
>
>         return old_spte;
>  }
> --
> 2.34.1
>
Rick Edgecombe June 7, 2024, 9:46 p.m. UTC | #2
On Fri, 2024-06-07 at 13:37 +0200, Paolo Bonzini wrote:
> > 
> > +       /* Update mirrored page tables for page table about to be freed */
> > +       int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level
> > level,
> > +                               void *mirrored_spt);
> > +
> > +       /* Update mirrored page table from spte getting removed, and flush
> > TLB */
> > +       int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level
> > level,
> > +                                  kvm_pfn_t pfn);
> 
> Again, maybe free_external_spt and zap_external_spte?

Yep, I'm on board.

> 
> Also, please rename the last argument to pfn_for_gfn. I'm not proud of
> it, but it took me over 10 minutes to understand if the pfn referred
> to the gfn itself, or to the external SP that holds the spte...
> There's a possibility that it isn't just me. :)

Ah, I see how that could be confusing. 

> 
> (In general, this patch took me a _lot_ to review... there were a
> couple of places that left me incomprehensibly puzzled, more on this
> below).

Sorry for that. Thanks for taking the time to weed through it anyway.

> 
> >          bool (*has_wbinvd_exit)(void);
> > 
> >          u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 41b1d3f26597..1245f6a48dbe 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -346,6 +346,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct
> > kvm_mmu_page *sp)
> >          spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> >   }
> > 
> > +static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
> > +                                       u64 old_spte, u64 new_spte,
> > +                                       int level)
> 
> new_spte is not used and can be dropped. Also, tdp_mmu_zap_external_spte?

Oh, yep. In v19 there used to be a KVM_BUG_ON(), but we can get rid of it.

> 
> > +{
> > +       bool was_present = is_shadow_present_pte(old_spte);
> > +       bool was_leaf = was_present && is_last_spte(old_spte, level);
> 
> Just put it below:
> 
> if (!is_shadow_present_pte(old_spte))
>   return;

Right, this is another miss from removing the KVM_BUG_ON()s.

> 
> /* Here we only care about zapping the external leaf PTEs. */
> if (!is_last_spte(old_spte, level))
> 
> > +       kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> > +       int ret;
> > +
> > +       /*
> > +        * Allow only leaf page to be zapped. Reclaim non-leaf page tables
> > page
> 
> This comment left me confused, so I'll try to rephrase and see if I
> can explain what happens. Correct me if I'm wrong.
> 
> The only paths to handle_removed_pt() are:
> - kvm_tdp_mmu_zap_leafs()
> - kvm_tdp_mmu_zap_invalidated_roots()
> 
> but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
> the latter can only happen at VM destruction time.
> 
> But it's not clear why it's worth mentioning it here, or even why it
> is special at all. Isn't that just what handle_removed_pt() does at
> the end? Why does it matter that it's only done at VM destruction
> time?
> 
> In other words, it seems to me that this comment is TMI. And if I am
> wrong (which may well be), the extra information should explain the
> "why" in more detail, and it should be around the call to
> reflect_free_spt, not here.

TDX of course has the limitation around the ordering of the zapping S-EPT. So I
read the comment to be referring to how the implementation avoids zapping any
non-leaf PTEs during TD runtime.

But I'm going to have to circle back here after investigating a bit more. Isaku,
any comments on this comment and conditional?

> 
> > +               return;
> > +       /* Zapping leaf spte is allowed only when write lock is held. */
> > +       lockdep_assert_held_write(&kvm->mmu_lock);
> > +       /* Because write lock is held, operation should success. */
> > +       ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level,
> > old_pfn);
> > +       KVM_BUG_ON(ret, kvm);
> > +}
> > +
> >   /**
> >    * handle_removed_pt() - handle a page table removed from the TDP
> > structure
> >    *
> > @@ -441,6 +464,22 @@ static void handle_removed_pt(struct kvm *kvm,
> > tdp_ptep_t pt, bool shared)
> >                  }
> >                  handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
> >                                      old_spte, REMOVED_SPTE, sp->role,
> > shared);
> > +               if (is_mirror_sp(sp)) {
> > +                       KVM_BUG_ON(shared, kvm);
> > +                       reflect_removed_spte(kvm, gfn, old_spte,
> > REMOVED_SPTE, level);
> > +               }
> > +       }
> > +
> > +       if (is_mirror_sp(sp) &&
> > +           WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp-
> > >role.level,
> > +                                                        
> > kvm_mmu_mirrored_spt(sp)))) {
> 
> Please use base_gfn and level here, instead of fishing them from sp.

Oh, yep, thanks.

> 
> > +               /*
> > +                * Failed to free page table page in mirror page table and
> > +                * there is nothing to do further.
> > +                * Intentionally leak the page to prevent the kernel from
> > +                * accessing the encrypted page.
> > +                */
> > +               sp->mirrored_spt = NULL;
> >          }
> > 
> >          call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> > @@ -778,9 +817,11 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id,
> > tdp_ptep_t sptep,
> >          role.level = level;
> >          handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role,
> > false);
> > 
> > -       /* Don't support setting for the non-atomic case */
> > -       if (is_mirror_sptep(sptep))
> > +       if (is_mirror_sptep(sptep)) {
> > +               /* Only support zapping for the non-atomic case */
> 
> Like for patch 10, this comment should point out why we never get here
> for mirror SPs.

Ok.
Paolo Bonzini June 8, 2024, 9:25 a.m. UTC | #3
On Fri, Jun 7, 2024 at 11:46 PM Edgecombe, Rick P
<rick.p.edgecombe@intel.com> wrote:
> > Also, please rename the last argument to pfn_for_gfn. I'm not proud of
> > it, but it took me over 10 minutes to understand if the pfn referred
> > to the gfn itself, or to the external SP that holds the spte...
> > There's a possibility that it isn't just me. :)
>
> Ah, I see how that could be confusing.
>
> > (In general, this patch took me a _lot_ to review... there were a
> > couple of places that left me incomprehensibly puzzled, more on this
> > below).
>
> Sorry for that. Thanks for taking the time to weed through it anyway.

Oh, don't worry, I have no idea what made this patch stick out as the
hardest one... Maybe it's really that there is such a thing as too
many comments in some cases, and also the mutual recursion between
handle_removed_pt() and handle_changed_spte(). Nothing that can't be
fixed.

> > > +       kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> > > +       int ret;
> > > +
> > > +       /*
> > > +        * Allow only leaf page to be zapped. Reclaim non-leaf page tables
> > > page
> >
> > This comment left me confused, so I'll try to rephrase and see if I
> > can explain what happens. Correct me if I'm wrong.
> >
> > The only paths to handle_removed_pt() are:
> > - kvm_tdp_mmu_zap_leafs()
> > - kvm_tdp_mmu_zap_invalidated_roots()
> >
> > but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
> > the latter can only happen at VM destruction time.
> >
> > But it's not clear why it's worth mentioning it here, or even why it
> > is special at all. Isn't that just what handle_removed_pt() does at
> > the end? Why does it matter that it's only done at VM destruction
> > time?
> >
> > In other words, it seems to me that this comment is TMI. And if I am
> > wrong (which may well be), the extra information should explain the
> > "why" in more detail, and it should be around the call to
> > reflect_free_spt, not here.
>
> TDX of course has the limitation around the ordering of the zapping S-EPT. So I
> read the comment to be referring to how the implementation avoids zapping any
> non-leaf PTEs during TD runtime.

Ok, I think then you can just replace it with a comment that explains
why TDX needs it, as it's one of those places where we let TDX
assumptions creep into common code - which is not a big deal per se,
it's just worth mentioning it in a comment. Unlike the
tdp_mmu_get_root_for_fault() remark I have just sent for patch 9/15,
here the assumption is more algorithmic and less about a specific line
of code, and I think that makes it okay.

Paolo

> > > -       /* Don't support setting for the non-atomic case */
> > > -       if (is_mirror_sptep(sptep))
> > > +       if (is_mirror_sptep(sptep)) {
> > > +               /* Only support zapping for the non-atomic case */
> >
> > Like for patch 10, this comment should point out why we never get here
> > for mirror SPs.
>
> Ok.
Isaku Yamahata June 12, 2024, 6:39 p.m. UTC | #4
On Fri, Jun 07, 2024 at 09:46:27PM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> > /* Here we only care about zapping the external leaf PTEs. */
> > if (!is_last_spte(old_spte, level))
> > 
> > > +       kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> > > +       int ret;
> > > +
> > > +       /*
> > > +        * Allow only leaf page to be zapped. Reclaim non-leaf page tables
> > > page
> > 
> > This comment left me confused, so I'll try to rephrase and see if I
> > can explain what happens. Correct me if I'm wrong.
> > 
> > The only paths to handle_removed_pt() are:
> > - kvm_tdp_mmu_zap_leafs()
> > - kvm_tdp_mmu_zap_invalidated_roots()
> > 
> > but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
> > the latter can only happen at VM destruction time.
> > 
> > But it's not clear why it's worth mentioning it here, or even why it
> > is special at all. Isn't that just what handle_removed_pt() does at
> > the end? Why does it matter that it's only done at VM destruction
> > time?
> > 
> > In other words, it seems to me that this comment is TMI. And if I am
> > wrong (which may well be), the extra information should explain the
> > "why" in more detail, and it should be around the call to
> > reflect_free_spt, not here.
> 
> TDX of course has the limitation around the ordering of the zapping S-EPT. So I
> read the comment to be referring to how the implementation avoids zapping any
> non-leaf PTEs during TD runtime.
> 
> But I'm going to have to circle back here after investigating a bit more. Isaku,
> any comments on this comment and conditional?

It's for large page merge/split.  At this point, it seems only confusing.
We need only leaf zapping.  Maybe reflect_removed_leaf_spte() or something.
Later we can come back when large page support.
Rick Edgecombe June 14, 2024, 11:11 p.m. UTC | #5
On Wed, 2024-06-12 at 11:39 -0700, Isaku Yamahata wrote:
> > TDX of course has the limitation around the ordering of the zapping S-EPT.
> > So I
> > read the comment to be referring to how the implementation avoids zapping
> > any
> > non-leaf PTEs during TD runtime.
> > 
> > But I'm going to have to circle back here after investigating a bit more.
> > Isaku,
> > any comments on this comment and conditional?
> 
> It's for large page page merge/split.  At this point, it seems only confusing.
> We need only leaf zapping.  Maybe reflect_removed_leaf_spte() or something.
> Later we can come back when large page support.

Yes, I don't see the need for the is_shadow_present_pte() in this function. The
leaf check is needed, but actually it could be done well enough in TDX code for
now (for as long as we have no huge pages).

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0155b8330e15..e54dd2355005 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1721,6 +1721,10 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm,
gfn_t gfn,
 {
        int ret;
 
+       /* Only 4k pages currently, nothing to do for other levels */
+       if (level != PG_LEVEL_4K)
+               return 0;
+
        ret = tdx_sept_zap_private_spte(kvm, gfn, level);
        if (ret)
                return ret;


The benefits would be that we have less mirror logic in core MMU code that
actually is about TDX specifics unrelated to the mirror concept. The downsides
are returning success there feels kind of wrong. I'm going to move forward with
dropping the is_shadow_present_pte() check and adding a comment (see below), but
I thought the alternative was worth mentioning because I was tempted.

/*
 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
 * PTs are removed in a special order, involving free_external_spt().
 * But remove_external_spte() will be called on non-leaf PTEs via
 * __tdp_mmu_zap_root(), so avoid the error the former would return
 * in this case.
 */
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 1877d6a77525..dae06afc6038 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -97,6 +97,8 @@  KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
 KVM_X86_OP(load_mmu_pgd)
 KVM_X86_OP_OPTIONAL(reflect_link_spt)
 KVM_X86_OP_OPTIONAL(reflect_set_spte)
+KVM_X86_OP_OPTIONAL(reflect_free_spt)
+KVM_X86_OP_OPTIONAL(reflect_remove_spte)
 KVM_X86_OP(has_wbinvd_exit)
 KVM_X86_OP(get_l2_tsc_offset)
 KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 20bb10f22ca6..0df4a31a0df9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1755,6 +1755,14 @@  struct kvm_x86_ops {
 	int (*reflect_set_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 				kvm_pfn_t pfn);
 
+	/* Update mirrored page tables for page table about to be freed */
+	int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+				void *mirrored_spt);
+
+	/* Update mirrored page table from spte getting removed, and flush TLB */
+	int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+				   kvm_pfn_t pfn);
+
 	bool (*has_wbinvd_exit)(void);
 
 	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 41b1d3f26597..1245f6a48dbe 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -346,6 +346,29 @@  static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
+static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
+					u64 old_spte, u64 new_spte,
+					int level)
+{
+	bool was_present = is_shadow_present_pte(old_spte);
+	bool was_leaf = was_present && is_last_spte(old_spte, level);
+	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
+	int ret;
+
+	/*
+	 * Allow only leaf pages to be zapped. Non-leaf page table pages are
+	 * reclaimed when the VM is destroyed.
+	 */
+	if (!was_leaf)
+		return;
+
+	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
+	lockdep_assert_held_write(&kvm->mmu_lock);
+	/* Because the write lock is held, the operation should succeed. */
+	ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level, old_pfn);
+	KVM_BUG_ON(ret, kvm);
+}
+
 /**
  * handle_removed_pt() - handle a page table removed from the TDP structure
  *
@@ -441,6 +464,22 @@  static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
 				    old_spte, REMOVED_SPTE, sp->role, shared);
+		if (is_mirror_sp(sp)) {
+			KVM_BUG_ON(shared, kvm);
+			reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
+		}
+	}
+
+	if (is_mirror_sp(sp) &&
+	    WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp->role.level,
+							  kvm_mmu_mirrored_spt(sp)))) {
+		/*
+		 * Failed to free the page table page in the mirror page table,
+		 * and there is nothing further to do.
+		 * Intentionally leak the page to prevent the kernel from
+		 * accessing the encrypted page.
+		 */
+		sp->mirrored_spt = NULL;
 	}
 
 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -778,9 +817,11 @@  static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	role.level = level;
 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);
 
-	/* Don't support setting for the non-atomic case */
-	if (is_mirror_sptep(sptep))
+	if (is_mirror_sptep(sptep)) {
+		/* Only support zapping for the non-atomic case */
 		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+		reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
+	}
 
 	return old_spte;
 }