Message ID: 20240530210714.364118-12-rick.p.edgecombe@intel.com (mailing list archive)
State:      New, archived
Series:     TDX MMU prep series part 1
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  2 ++
>  arch/x86/include/asm/kvm_host.h    |  8 ++++++
>  arch/x86/kvm/mmu/tdp_mmu.c         | 45 ++++++++++++++++++++++++++++--
>  3 files changed, 53 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 1877d6a77525..dae06afc6038 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -97,6 +97,8 @@ KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
>  KVM_X86_OP(load_mmu_pgd)
>  KVM_X86_OP_OPTIONAL(reflect_link_spt)
>  KVM_X86_OP_OPTIONAL(reflect_set_spte)
> +KVM_X86_OP_OPTIONAL(reflect_free_spt)
> +KVM_X86_OP_OPTIONAL(reflect_remove_spte)
>  KVM_X86_OP(has_wbinvd_exit)
>  KVM_X86_OP(get_l2_tsc_offset)
>  KVM_X86_OP(get_l2_tsc_multiplier)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 20bb10f22ca6..0df4a31a0df9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1755,6 +1755,14 @@ struct kvm_x86_ops {
>  	int (*reflect_set_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>  				kvm_pfn_t pfn);
>
> +	/* Update mirrored page tables for page table about to be freed */
> +	int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +				void *mirrored_spt);
> +
> +	/* Update mirrored page table from spte getting removed, and flush TLB */
> +	int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +				   kvm_pfn_t pfn);

Again, maybe free_external_spt and zap_external_spte?

Also, please rename the last argument to pfn_for_gfn. I'm not proud of
it, but it took me over 10 minutes to understand if the pfn referred
to the gfn itself, or to the external SP that holds the spte...
There's a possibility that it isn't just me. :)

(In general, this patch took me a _lot_ to review... there were a
couple of places that left me incomprehensibly puzzled, more on this
below).

>  	bool (*has_wbinvd_exit)(void);
>
>  	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 41b1d3f26597..1245f6a48dbe 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -346,6 +346,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>  }
>
> +static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
> +				 u64 old_spte, u64 new_spte,
> +				 int level)

new_spte is not used and can be dropped. Also, tdp_mmu_zap_external_spte?

> +{
> +	bool was_present = is_shadow_present_pte(old_spte);
> +	bool was_leaf = was_present && is_last_spte(old_spte, level);

Just put it below:

	if (!is_shadow_present_pte(old_spte))
		return;

	/* Here we only care about zapping the external leaf PTEs. */
	if (!is_last_spte(old_spte, level))

> +	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> +	int ret;
> +
> +	/*
> +	 * Allow only leaf page to be zapped. Reclaim non-leaf page tables page

This comment left me confused, so I'll try to rephrase and see if I
can explain what happens. Correct me if I'm wrong.

The only paths to handle_removed_pt() are:
- kvm_tdp_mmu_zap_leafs()
- kvm_tdp_mmu_zap_invalidated_roots()

but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
the latter can only happen at VM destruction time.

But it's not clear why it's worth mentioning it here, or even why it
is special at all. Isn't that just what handle_removed_pt() does at
the end? Why does it matter that it's only done at VM destruction
time?

In other words, it seems to me that this comment is TMI. And if I am
wrong (which may well be), the extra information should explain the
"why" in more detail, and it should be around the call to
reflect_free_spt, not here.

> +		return;
> +	/* Zapping leaf spte is allowed only when write lock is held. */
> +	lockdep_assert_held_write(&kvm->mmu_lock);
> +	/* Because write lock is held, operation should success. */
> +	ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level, old_pfn);
> +	KVM_BUG_ON(ret, kvm);
> +}
> +
>  /**
>   * handle_removed_pt() - handle a page table removed from the TDP structure
>   *
> @@ -441,6 +464,22 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
>  		}
>  		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
>  				    old_spte, REMOVED_SPTE, sp->role, shared);
> +		if (is_mirror_sp(sp)) {
> +			KVM_BUG_ON(shared, kvm);
> +			reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
> +		}
> +	}
> +
> +	if (is_mirror_sp(sp) &&
> +	    WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp->role.level,
> +							  kvm_mmu_mirrored_spt(sp)))) {

Please use base_gfn and level here, instead of fishing them from sp.

> +		/*
> +		 * Failed to free page table page in mirror page table and
> +		 * there is nothing to do further.
> +		 * Intentionally leak the page to prevent the kernel from
> +		 * accessing the encrypted page.
> +		 */
> +		sp->mirrored_spt = NULL;
>  	}
>
>  	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> @@ -778,9 +817,11 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
>  	role.level = level;
>  	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);
>
> -	/* Don't support setting for the non-atomic case */
> -	if (is_mirror_sptep(sptep))
> +	if (is_mirror_sptep(sptep)) {
> +		/* Only support zapping for the non-atomic case */

Like for patch 10, this comment should point out why we never get here
for mirror SPs.

Paolo

>  		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
> +		reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
> +	}
>
>  	return old_spte;
> }
> --
> 2.34.1
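For reference, if the renames suggested above are adopted, the two hook
declarations would presumably end up along the lines below. This is only a
sketch of the review feedback, not code from a posted patch; the comment
wording and the pfn_for_gfn note are assumptions:

	/* Free a page table page in the external (e.g. TDX S-EPT) page tables */
	int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
				 void *external_spt);

	/*
	 * Zap a leaf SPTE in the external page tables and flush the TLB.
	 * pfn_for_gfn is the PFN that gfn maps to, not the PFN of the
	 * external page table page that holds the SPTE.
	 */
	int (*zap_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
				 kvm_pfn_t pfn_for_gfn);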
On Fri, 2024-06-07 at 13:37 +0200, Paolo Bonzini wrote:
> > +	/* Update mirrored page tables for page table about to be freed */
> > +	int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> > +				void *mirrored_spt);
> > +
> > +	/* Update mirrored page table from spte getting removed, and flush TLB */
> > +	int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> > +				   kvm_pfn_t pfn);
>
> Again, maybe free_external_spt and zap_external_spte?

Yep, I'm on board.

> Also, please rename the last argument to pfn_for_gfn. I'm not proud of
> it, but it took me over 10 minutes to understand if the pfn referred
> to the gfn itself, or to the external SP that holds the spte...
> There's a possibility that it isn't just me. :)

Ah, I see how that could be confusing.

> (In general, this patch took me a _lot_ to review... there were a
> couple of places that left me incomprehensibly puzzled, more on this
> below).

Sorry for that. Thanks for taking the time to weed through it anyway.

> >  	bool (*has_wbinvd_exit)(void);
> >
> >  	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 41b1d3f26597..1245f6a48dbe 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -346,6 +346,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> >  	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> >  }
> >
> > +static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
> > +				 u64 old_spte, u64 new_spte,
> > +				 int level)
>
> new_spte is not used and can be dropped. Also, tdp_mmu_zap_external_spte?

Oh, yep. In v19 there used to be a KVM_BUG_ON(), but we can get rid of it.

> > +{
> > +	bool was_present = is_shadow_present_pte(old_spte);
> > +	bool was_leaf = was_present && is_last_spte(old_spte, level);
>
> Just put it below:
>
> 	if (!is_shadow_present_pte(old_spte))
> 		return;

Right, this is another miss from removing the KVM_BUG_ON()s.

> 	/* Here we only care about zapping the external leaf PTEs. */
> 	if (!is_last_spte(old_spte, level))
>
> > +	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> > +	int ret;
> > +
> > +	/*
> > +	 * Allow only leaf page to be zapped. Reclaim non-leaf page tables page
>
> This comment left me confused, so I'll try to rephrase and see if I
> can explain what happens. Correct me if I'm wrong.
>
> The only paths to handle_removed_pt() are:
> - kvm_tdp_mmu_zap_leafs()
> - kvm_tdp_mmu_zap_invalidated_roots()
>
> but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
> the latter can only happen at VM destruction time.
>
> But it's not clear why it's worth mentioning it here, or even why it
> is special at all. Isn't that just what handle_removed_pt() does at
> the end? Why does it matter that it's only done at VM destruction
> time?
>
> In other words, it seems to me that this comment is TMI. And if I am
> wrong (which may well be), the extra information should explain the
> "why" in more detail, and it should be around the call to
> reflect_free_spt, not here.

TDX of course has the limitation around the ordering of the zapping S-EPT. So I
read the comment to be referring to how the implementation avoids zapping any
non-leaf PTEs during TD runtime.

But I'm going to have to circle back here after investigating a bit more. Isaku,
any comments on this comment and conditional?

> > +		return;
> > +	/* Zapping leaf spte is allowed only when write lock is held. */
> > +	lockdep_assert_held_write(&kvm->mmu_lock);
> > +	/* Because write lock is held, operation should success. */
> > +	ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level, old_pfn);
> > +	KVM_BUG_ON(ret, kvm);
> > +}
> > +
> >  /**
> >   * handle_removed_pt() - handle a page table removed from the TDP structure
> >   *
> > @@ -441,6 +464,22 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
> >  		}
> >  		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
> >  				    old_spte, REMOVED_SPTE, sp->role, shared);
> > +		if (is_mirror_sp(sp)) {
> > +			KVM_BUG_ON(shared, kvm);
> > +			reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
> > +		}
> > +	}
> > +
> > +	if (is_mirror_sp(sp) &&
> > +	    WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp->role.level,
> > +							  kvm_mmu_mirrored_spt(sp)))) {
>
> Please use base_gfn and level here, instead of fishing them from sp.

Oh, yep, thanks.

> > +		/*
> > +		 * Failed to free page table page in mirror page table and
> > +		 * there is nothing to do further.
> > +		 * Intentionally leak the page to prevent the kernel from
> > +		 * accessing the encrypted page.
> > +		 */
> > +		sp->mirrored_spt = NULL;
> >  	}
> >
> >  	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> > @@ -778,9 +817,11 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
> >  	role.level = level;
> >  	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);
> >
> > -	/* Don't support setting for the non-atomic case */
> > -	if (is_mirror_sptep(sptep))
> > +	if (is_mirror_sptep(sptep)) {
> > +		/* Only support zapping for the non-atomic case */
>
> Like for patch 10, this comment should point out why we never get here
> for mirror SPs.

Ok.
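Likewise, with the base_gfn/level change Rick agreed to just above, the
mirror free path in handle_removed_pt() might look roughly like this; a
sketch assuming the free_external_spt rename, not code from a posted patch:

	if (is_mirror_sp(sp) &&
	    WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, level,
							   kvm_mmu_mirrored_spt(sp)))) {
		/*
		 * Freeing the page in the external page table failed;
		 * intentionally leak the page so the kernel never
		 * accesses the encrypted page.
		 */
		sp->mirrored_spt = NULL;
	}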
On Fri, Jun 7, 2024 at 11:46 PM Edgecombe, Rick P
<rick.p.edgecombe@intel.com> wrote:
> > Also, please rename the last argument to pfn_for_gfn. I'm not proud of
> > it, but it took me over 10 minutes to understand if the pfn referred
> > to the gfn itself, or to the external SP that holds the spte...
> > There's a possibility that it isn't just me. :)
>
> Ah, I see how that could be confusing.
>
> > (In general, this patch took me a _lot_ to review... there were a
> > couple of places that left me incomprehensibly puzzled, more on this
> > below).
>
> Sorry for that. Thanks for taking the time to weed through it anyway.

Oh, don't worry, I have no idea what made this patch stick out as the
hardest one... Maybe it's really that there is such a thing as too many
comments in some cases, and also the mutual recursion between
handle_removed_pt() and handle_changed_spte(). Nothing that can't be
fixed.

> > > +	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> > > +	int ret;
> > > +
> > > +	/*
> > > +	 * Allow only leaf page to be zapped. Reclaim non-leaf page tables page
> >
> > This comment left me confused, so I'll try to rephrase and see if I
> > can explain what happens. Correct me if I'm wrong.
> >
> > The only paths to handle_removed_pt() are:
> > - kvm_tdp_mmu_zap_leafs()
> > - kvm_tdp_mmu_zap_invalidated_roots()
> >
> > but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
> > the latter can only happen at VM destruction time.
> >
> > But it's not clear why it's worth mentioning it here, or even why it
> > is special at all. Isn't that just what handle_removed_pt() does at
> > the end? Why does it matter that it's only done at VM destruction
> > time?
> >
> > In other words, it seems to me that this comment is TMI. And if I am
> > wrong (which may well be), the extra information should explain the
> > "why" in more detail, and it should be around the call to
> > reflect_free_spt, not here.
>
> TDX of course has the limitation around the ordering of the zapping S-EPT. So I
> read the comment to be referring to how the implementation avoids zapping any
> non-leaf PTEs during TD runtime.

Ok, I think then you can just replace it with a comment that explains
why TDX needs it, as it's one of those places where we let TDX
assumptions creep into common code - which is not a big deal per se,
it's just worth mentioning in a comment.

Unlike the tdp_mmu_get_root_for_fault() remark I have just sent for
patch 9/15, here the assumption is more algorithmic and less about a
specific line of code, and I think that makes it okay.

Paolo

> > > -	/* Don't support setting for the non-atomic case */
> > > -	if (is_mirror_sptep(sptep))
> > > +	if (is_mirror_sptep(sptep)) {
> > > +		/* Only support zapping for the non-atomic case */
> >
> > Like for patch 10, this comment should point out why we never get here
> > for mirror SPs.
>
> Ok.
On Fri, Jun 07, 2024 at 09:46:27PM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:
> > 	/* Here we only care about zapping the external leaf PTEs. */
> > 	if (!is_last_spte(old_spte, level))
> >
> > > +	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> > > +	int ret;
> > > +
> > > +	/*
> > > +	 * Allow only leaf page to be zapped. Reclaim non-leaf page tables page
> >
> > This comment left me confused, so I'll try to rephrase and see if I
> > can explain what happens. Correct me if I'm wrong.
> >
> > The only paths to handle_removed_pt() are:
> > - kvm_tdp_mmu_zap_leafs()
> > - kvm_tdp_mmu_zap_invalidated_roots()
> >
> > but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
> > the latter can only happen at VM destruction time.
> >
> > But it's not clear why it's worth mentioning it here, or even why it
> > is special at all. Isn't that just what handle_removed_pt() does at
> > the end? Why does it matter that it's only done at VM destruction
> > time?
> >
> > In other words, it seems to me that this comment is TMI. And if I am
> > wrong (which may well be), the extra information should explain the
> > "why" in more detail, and it should be around the call to
> > reflect_free_spt, not here.
>
> TDX of course has the limitation around the ordering of the zapping S-EPT. So I
> read the comment to be referring to how the implementation avoids zapping any
> non-leaf PTEs during TD runtime.
>
> But I'm going to have to circle back here after investigating a bit more. Isaku,
> any comments on this comment and conditional?

It's for large page merge/split. At this point, it seems only confusing.
We need only leaf zapping. Maybe reflect_removed_leaf_spte() or something.
Later we can come back to it when large page support is added.
On Wed, 2024-06-12 at 11:39 -0700, Isaku Yamahata wrote:
> > TDX of course has the limitation around the ordering of the zapping S-EPT.
> > So I read the comment to be referring to how the implementation avoids
> > zapping any non-leaf PTEs during TD runtime.
> >
> > But I'm going to have to circle back here after investigating a bit more.
> > Isaku, any comments on this comment and conditional?
>
> It's for large page merge/split. At this point, it seems only confusing.
> We need only leaf zapping. Maybe reflect_removed_leaf_spte() or something.
> Later we can come back to it when large page support is added.

Yes, I don't see the need for the is_shadow_present_pte() check in this
function. The leaf check is needed, but actually it could be done well
enough in TDX code for now (for as long as we have no huge pages):

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0155b8330e15..e54dd2355005 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1721,6 +1721,10 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 {
 	int ret;

+	/* Only 4k pages currently, nothing to do for other levels */
+	if (level != PG_LEVEL_4K)
+		return 0;
+
 	ret = tdx_sept_zap_private_spte(kvm, gfn, level);
 	if (ret)
 		return ret;

The benefit would be that we have less mirror logic in core MMU code that
is actually about TDX specifics unrelated to the mirror concept. The
downside is that returning success there feels kind of wrong.

I'm going to move forward with dropping the is_shadow_present_pte() check
and adding a comment (see below), but I thought the alternative was worth
mentioning because I was tempted.

/*
 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
 * PTs are removed in a special order, involving free_external_spt().
 * But remove_external_spte() will be called on non-leaf PTEs via
 * __tdp_mmu_zap_root(), so avoid the error the former would return
 * in this case.
 */
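Putting that plan together, the common-MMU zap helper would presumably end
up along these lines. This is a sketch combining the renames discussed
earlier in the thread with the comment quoted above; the exact form in the
next revision may differ:

static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
				 int level)
{
	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
	int ret;

	/*
	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
	 * PTs are removed in a special order, involving free_external_spt().
	 * But remove_external_spte() will be called on non-leaf PTEs via
	 * __tdp_mmu_zap_root(), so avoid the error the former would return
	 * in this case.
	 */
	if (!is_last_spte(old_spte, level))
		return;

	/* Zapping a leaf SPTE is only done while holding the write lock. */
	lockdep_assert_held_write(&kvm->mmu_lock);

	ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
	KVM_BUG_ON(ret, kvm);
}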
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 1877d6a77525..dae06afc6038 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -97,6 +97,8 @@ KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
 KVM_X86_OP(load_mmu_pgd)
 KVM_X86_OP_OPTIONAL(reflect_link_spt)
 KVM_X86_OP_OPTIONAL(reflect_set_spte)
+KVM_X86_OP_OPTIONAL(reflect_free_spt)
+KVM_X86_OP_OPTIONAL(reflect_remove_spte)
 KVM_X86_OP(has_wbinvd_exit)
 KVM_X86_OP(get_l2_tsc_offset)
 KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 20bb10f22ca6..0df4a31a0df9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1755,6 +1755,14 @@ struct kvm_x86_ops {
 	int (*reflect_set_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 				kvm_pfn_t pfn);

+	/* Update mirrored page tables for page table about to be freed */
+	int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+				void *mirrored_spt);
+
+	/* Update mirrored page table from spte getting removed, and flush TLB */
+	int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+				   kvm_pfn_t pfn);
+
 	bool (*has_wbinvd_exit)(void);

 	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 41b1d3f26597..1245f6a48dbe 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -346,6 +346,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }

+static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
+				 u64 old_spte, u64 new_spte,
+				 int level)
+{
+	bool was_present = is_shadow_present_pte(old_spte);
+	bool was_leaf = was_present && is_last_spte(old_spte, level);
+	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
+	int ret;
+
+	/*
+	 * Allow only leaf page to be zapped. Reclaim non-leaf page tables page
+	 * at destroying VM.
+	 */
+	if (!was_leaf)
+		return;
+
+	/* Zapping leaf spte is allowed only when write lock is held. */
+	lockdep_assert_held_write(&kvm->mmu_lock);
+	/* Because write lock is held, operation should success. */
+	ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level, old_pfn);
+	KVM_BUG_ON(ret, kvm);
+}
+
 /**
  * handle_removed_pt() - handle a page table removed from the TDP structure
  *
@@ -441,6 +464,22 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
 				    old_spte, REMOVED_SPTE, sp->role, shared);
+		if (is_mirror_sp(sp)) {
+			KVM_BUG_ON(shared, kvm);
+			reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
+		}
+	}
+
+	if (is_mirror_sp(sp) &&
+	    WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp->role.level,
+							  kvm_mmu_mirrored_spt(sp)))) {
+		/*
+		 * Failed to free page table page in mirror page table and
+		 * there is nothing to do further.
+		 * Intentionally leak the page to prevent the kernel from
+		 * accessing the encrypted page.
+		 */
+		sp->mirrored_spt = NULL;
 	}

 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -778,9 +817,11 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	role.level = level;
 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);

-	/* Don't support setting for the non-atomic case */
-	if (is_mirror_sptep(sptep))
+	if (is_mirror_sptep(sptep)) {
+		/* Only support zapping for the non-atomic case */
 		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+		reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
+	}

 	return old_spte;
 }
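For context on the backend side, the TDX implementation of the remove hook,
based on the tdx_sept_remove_private_spte() fragment quoted earlier in the
thread, could plausibly look like the sketch below. tdx_sept_drop_private_spte()
and its signature are assumptions for illustration, not taken from a posted
patch:

static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
					enum pg_level level, kvm_pfn_t pfn)
{
	int ret;

	/* Block the S-EPT entry (zap and flush the TLB) before reclaiming. */
	ret = tdx_sept_zap_private_spte(kvm, gfn, level);
	if (ret)
		return ret;

	/* Assumed helper: remove the page from the S-EPT and reclaim it. */
	return tdx_sept_drop_private_spte(kvm, gfn, level, pfn);
}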