Message ID: 20211110223010.1392399-3-bgardon@google.com (mailing list archive)
State: New, archived
Series: KVM: x86/mmu: Optimize disabling dirty logging
On Wed, Nov 10, 2021 at 02:29:53PM -0800, Ben Gardon wrote:
> When recursively handling a removed TDP page table, the TDP MMU will
> flush the TLBs and queue an RCU callback to free the PT. If the original
> change zapped a non-leaf SPTE at PG_LEVEL_1G or above, that change will
> result in many unnecessary TLB flushes when one would suffice. Queue all
> the PTs which need to be freed on a list and wait to queue RCU callbacks
> to free them until after all the recursive callbacks are done.
>
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 88 ++++++++++++++++++++++++++++++--------
>  1 file changed, 70 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 866c2b191e1e..5b31d046df78 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -220,7 +220,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
>  
>  static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				u64 old_spte, u64 new_spte, int level,
> -				bool shared);
> +				bool shared,
> +				struct list_head *disconnected_sps);
>  
>  static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
>  {
> @@ -302,6 +303,11 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>   * @shared: This operation may not be running under the exclusive use
>   *	    of the MMU lock and the operation must synchronize with other
>   *	    threads that might be modifying SPTEs.
> + * @disconnected_sps: If null, the TLBs will be flushed and the disconnected
> + *		      TDP MMU page will be queued to be freed after an RCU
> + *		      callback. If non-null the page will be added to the list
> + *		      and flushing the TLBs and queueing an RCU callback to
> + *		      free the page will be the caller's responsibility.
>   *
>   * Given a page table that has been removed from the TDP paging structure,
>   * iterates through the page table to clear SPTEs and free child page tables.
> @@ -312,7 +318,8 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>   * early rcu_dereferences in the function.
>   */
>  static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
> -					bool shared)
> +					bool shared,
> +					struct list_head *disconnected_sps)
>  {
>  	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
>  	int level = sp->role.level;
> @@ -371,13 +378,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
>  		}
>  		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
>  				    old_child_spte, REMOVED_SPTE, level,
> -				    shared);
> +				    shared, disconnected_sps);
>  	}
>  
> -	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
> -					   KVM_PAGES_PER_HPAGE(level + 1));
> -
> -	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> +	if (disconnected_sps) {
> +		list_add_tail(&sp->link, disconnected_sps);
> +	} else {
> +		kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
> +						   KVM_PAGES_PER_HPAGE(level + 1));
> +		call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> +	}
>  }
>  
>  /**
> @@ -391,13 +401,21 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
>   * @shared: This operation may not be running under the exclusive use of
>   *	    the MMU lock and the operation must synchronize with other
>   *	    threads that might be modifying SPTEs.
> + * @disconnected_sps: Only used if a page of page table memory has been
> + *		      removed from the paging structure by this change.
> + *		      If null, the TLBs will be flushed and the disconnected
> + *		      TDP MMU page will be queued to be freed after an RCU
> + *		      callback. If non-null the page will be added to the list
> + *		      and flushing the TLBs and queueing an RCU callback to
> + *		      free the page will be the caller's responsibility.
>   *
>   * Handle bookkeeping that might result from the modification of a SPTE.
>   * This function must be called for all TDP SPTE modifications.
>   */
>  static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				  u64 old_spte, u64 new_spte, int level,
> -				  bool shared)
> +				  bool shared,
> +				  struct list_head *disconnected_sps)
>  {
>  	bool was_present = is_shadow_present_pte(old_spte);
>  	bool is_present = is_shadow_present_pte(new_spte);
> @@ -475,22 +493,39 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  	 */
>  	if (was_present && !was_leaf && (pfn_changed || !is_present))
>  		handle_removed_tdp_mmu_page(kvm,
> -				spte_to_child_pt(old_spte, level), shared);
> +				spte_to_child_pt(old_spte, level), shared,
> +				disconnected_sps);
>  }
>  
>  static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				u64 old_spte, u64 new_spte, int level,
> -				bool shared)
> +				bool shared, struct list_head *disconnected_sps)
>  {
>  	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
> -			      shared);
> +			      shared, disconnected_sps);
>  	handle_changed_spte_acc_track(old_spte, new_spte, level);
>  	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
>  				      new_spte, level);
>  }
>  
>  /*
> - * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
> + * The TLBs must be flushed between the pages linked from disconnected_sps
> + * being removed from the paging structure and this function being called.
> + */
> +static void handle_disconnected_sps(struct kvm *kvm,
> +				    struct list_head *disconnected_sps)

handle_disconnected_sps() does a very specific task so I think we could go
with a more specific function name to make the code more readable. How about
free_sps_rcu() or call_rcu_free_sps()?

> +{
> +	struct kvm_mmu_page *sp;
> +	struct kvm_mmu_page *next;
> +
> +	list_for_each_entry_safe(sp, next, disconnected_sps, link) {
> +		list_del(&sp->link);
> +		call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> +	}
> +}
> +
> +/*
> + * __tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
>   * and handle the associated bookkeeping. Do not mark the page dirty
>   * in KVM's dirty bitmaps.
>   *
> @@ -500,9 +535,10 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>   * Returns: true if the SPTE was set, false if it was not. If false is returned,
>   *	    this function will have no side-effects.
>   */
> -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
> -					   struct tdp_iter *iter,
> -					   u64 new_spte)
> +static inline bool __tdp_mmu_set_spte_atomic(struct kvm *kvm,
> +					     struct tdp_iter *iter,
> +					     u64 new_spte,
> +					     struct list_head *disconnected_sps)
>  {
>  	lockdep_assert_held_read(&kvm->mmu_lock);
>  
> @@ -522,22 +558,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
>  		return false;
>  
>  	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
> -			      new_spte, iter->level, true);
> +			      new_spte, iter->level, true, disconnected_sps);
>  	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
>  
>  	return true;
>  }
>  
> +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
> +					   struct tdp_iter *iter,
> +					   u64 new_spte)
> +{
> +	return __tdp_mmu_set_spte_atomic(kvm, iter, new_spte, NULL);

Why not leverage disconnected_sps here as well? Then you can remove the
NULL case (and comments) from handle_removed_tdp_mmu_page.

> +}
> +
>  static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  					   struct tdp_iter *iter)
>  {
> +	LIST_HEAD(disconnected_sps);
> +
>  	/*
>  	 * Freeze the SPTE by setting it to a special,
>  	 * non-present value. This will stop other threads from
>  	 * immediately installing a present entry in its place
>  	 * before the TLBs are flushed.
>  	 */
> -	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
> +	if (!__tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE,
> +				       &disconnected_sps))
>  		return false;
>  
>  	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
> @@ -553,6 +599,8 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  	 */
>  	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
>  
> +	handle_disconnected_sps(kvm, &disconnected_sps);
> +
>  	return true;
>  }
>  
> @@ -577,6 +625,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
>  			  u64 new_spte, bool record_acc_track,
>  			  bool record_dirty_log)
>  {
> +	LIST_HEAD(disconnected_sps);
> +
>  	lockdep_assert_held_write(&kvm->mmu_lock);
>  
>  	/*
> @@ -591,7 +641,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
>  	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
>  
>  	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
> -			      new_spte, iter->level, false);
> +			      new_spte, iter->level, false, &disconnected_sps);
>  	if (record_acc_track)
>  		handle_changed_spte_acc_track(iter->old_spte, new_spte,
>  					      iter->level);
> @@ -599,6 +649,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
>  		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
>  					      iter->old_spte, new_spte,
>  					      iter->level);
> +
> +	handle_disconnected_sps(kvm, &disconnected_sps);

Where is the TLB flush for these disconnected_sps?

>  }
>  
>  static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
> --
> 2.34.0.rc0.344.g81b53c2807-goog
>
On Wed, Nov 10, 2021, Ben Gardon wrote:
> When recursively handling a removed TDP page table, the TDP MMU will
> flush the TLBs and queue an RCU callback to free the PT. If the original
> change zapped a non-leaf SPTE at PG_LEVEL_1G or above, that change will
> result in many unnecessary TLB flushes when one would suffice. Queue all
> the PTs which need to be freed on a list and wait to queue RCU callbacks
> to free them until after all the recursive callbacks are done.

I'm pretty sure we can do this without tracking disconnected SPs. The whole
point of protecting the TDP MMU with RCU is to wait until _all_ CPUs are
guaranteed to have dropped references. Emphasis on "all" because that also
includes the CPU that's doing the zapping/replacement! And since the current
CPU is required to hold RCU, we can use its RCU lock as a proxy for all vCPUs
executing in the guest. That will require either flushing in zap_gfn_range()
or requiring callers to hold RCU, or more likely a mix of both so that flows
that zap multiple roots or both TDP and legacy MMU pages can batch flushes.

If this doesn't sound completely bonkers, I'd like to pick this up next week;
I wandered into KVM's handling of invalidated roots and have patches that
would conflict in weird ways with this idea.

So I think this can simply be (sans zap_gfn_range() changes):

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 4e226cdb40d9..d2303bca4449 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -431,9 +431,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
 					    shared);
 	}
 
-	kvm_flush_remote_tlbs_with_address(kvm, gfn,
-					   KVM_PAGES_PER_HPAGE(level + 1));
-
 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
 
@@ -716,11 +713,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
 		return false;
 
 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-		rcu_read_unlock();
-
 		if (flush)
 			kvm_flush_remote_tlbs(kvm);
 
+		rcu_read_unlock();
+
 		if (shared)
 			cond_resched_rwlock_read(&kvm->mmu_lock);
 		else
@@ -817,7 +814,6 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 	}
 
 	rcu_read_unlock();
-
 	return flush;
 }
 
@@ -954,6 +950,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 		ret = RET_PF_SPURIOUS;
 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
 		return RET_PF_RETRY;
+	else if (<old spte was present shadow page>)
+		kvm_flush_remote_tlbs(kvm);
 
 	/*
 	 * If the page fault was caused by a write but the page is write

> +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
> +					   struct tdp_iter *iter,
> +					   u64 new_spte)
> +{
> +	return __tdp_mmu_set_spte_atomic(kvm, iter, new_spte, NULL);

This helper and refactoring belongs in patch 19. It is impossible to review
without the context of its user(s).
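The ordering Sean's diff relies on can be distilled into a few lines. This is
an illustrative sketch, not code from any posted patch, and the helper name is
made up: because the zapping CPU queues the RCU callback and flushes the TLBs
while still inside its own RCU read-side critical section, the grace period
cannot complete, and therefore the free cannot run, until after the flush.

static void zap_sp_sketch(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	rcu_read_lock();

	/* Disconnect sp from the paging structure and queue its free. */
	/* ... zap the parent SPTE ... */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);

	/*
	 * Flush before rcu_read_unlock(): the grace period for the
	 * call_rcu() above cannot complete while this CPU is still in
	 * its read-side critical section, so sp cannot be freed until
	 * every TLB has dropped its stale translations.
	 */
	kvm_flush_remote_tlbs(kvm);

	rcu_read_unlock();
}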
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 866c2b191e1e..5b31d046df78 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -220,7 +220,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 				u64 old_spte, u64 new_spte, int level,
-				bool shared);
+				bool shared,
+				struct list_head *disconnected_sps);
 
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 {
@@ -302,6 +303,11 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
  * @shared: This operation may not be running under the exclusive use
  *	    of the MMU lock and the operation must synchronize with other
  *	    threads that might be modifying SPTEs.
+ * @disconnected_sps: If null, the TLBs will be flushed and the disconnected
+ *		      TDP MMU page will be queued to be freed after an RCU
+ *		      callback. If non-null the page will be added to the list
+ *		      and flushing the TLBs and queueing an RCU callback to
+ *		      free the page will be the caller's responsibility.
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
@@ -312,7 +318,8 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
  * early rcu_dereferences in the function.
  */
 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
-					bool shared)
+					bool shared,
+					struct list_head *disconnected_sps)
 {
 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
 	int level = sp->role.level;
@@ -371,13 +378,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
 		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
 				    old_child_spte, REMOVED_SPTE, level,
-				    shared);
+				    shared, disconnected_sps);
 	}
 
-	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
-					   KVM_PAGES_PER_HPAGE(level + 1));
-
-	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+	if (disconnected_sps) {
+		list_add_tail(&sp->link, disconnected_sps);
+	} else {
+		kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
+						   KVM_PAGES_PER_HPAGE(level + 1));
+		call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+	}
 }
 
 /**
@@ -391,13 +401,21 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
  * @shared: This operation may not be running under the exclusive use of
  *	    the MMU lock and the operation must synchronize with other
  *	    threads that might be modifying SPTEs.
+ * @disconnected_sps: Only used if a page of page table memory has been
+ *		      removed from the paging structure by this change.
+ *		      If null, the TLBs will be flushed and the disconnected
+ *		      TDP MMU page will be queued to be freed after an RCU
+ *		      callback. If non-null the page will be added to the list
+ *		      and flushing the TLBs and queueing an RCU callback to
+ *		      free the page will be the caller's responsibility.
  *
  * Handle bookkeeping that might result from the modification of a SPTE.
  * This function must be called for all TDP SPTE modifications.
  */
 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 				  u64 old_spte, u64 new_spte, int level,
-				  bool shared)
+				  bool shared,
+				  struct list_head *disconnected_sps)
 {
 	bool was_present = is_shadow_present_pte(old_spte);
 	bool is_present = is_shadow_present_pte(new_spte);
@@ -475,22 +493,39 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 	 */
 	if (was_present && !was_leaf && (pfn_changed || !is_present))
 		handle_removed_tdp_mmu_page(kvm,
-				spte_to_child_pt(old_spte, level), shared);
+				spte_to_child_pt(old_spte, level), shared,
+				disconnected_sps);
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 				u64 old_spte, u64 new_spte, int level,
-				bool shared)
+				bool shared, struct list_head *disconnected_sps)
 {
 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
-			      shared);
+			      shared, disconnected_sps);
 	handle_changed_spte_acc_track(old_spte, new_spte, level);
 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 				      new_spte, level);
 }
 
 /*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
+ * The TLBs must be flushed between the pages linked from disconnected_sps
+ * being removed from the paging structure and this function being called.
+ */
+static void handle_disconnected_sps(struct kvm *kvm,
+				    struct list_head *disconnected_sps)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_mmu_page *next;
+
+	list_for_each_entry_safe(sp, next, disconnected_sps, link) {
+		list_del(&sp->link);
+		call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+	}
+}
+
+/*
+ * __tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
  * and handle the associated bookkeeping. Do not mark the page dirty
  * in KVM's dirty bitmaps.
  *
@@ -500,9 +535,10 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
  * Returns: true if the SPTE was set, false if it was not. If false is returned,
  *	    this function will have no side-effects.
  */
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
-					   struct tdp_iter *iter,
-					   u64 new_spte)
+static inline bool __tdp_mmu_set_spte_atomic(struct kvm *kvm,
+					     struct tdp_iter *iter,
+					     u64 new_spte,
+					     struct list_head *disconnected_sps)
 {
 	lockdep_assert_held_read(&kvm->mmu_lock);
 
@@ -522,22 +558,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
 		return false;
 
 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
-			      new_spte, iter->level, true);
+			      new_spte, iter->level, true, disconnected_sps);
 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
 
 	return true;
 }
 
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+					   struct tdp_iter *iter,
+					   u64 new_spte)
+{
+	return __tdp_mmu_set_spte_atomic(kvm, iter, new_spte, NULL);
+}
+
 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 					   struct tdp_iter *iter)
 {
+	LIST_HEAD(disconnected_sps);
+
 	/*
 	 * Freeze the SPTE by setting it to a special,
 	 * non-present value. This will stop other threads from
 	 * immediately installing a present entry in its place
 	 * before the TLBs are flushed.
 	 */
-	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+	if (!__tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE,
+				       &disconnected_sps))
 		return false;
 
 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -553,6 +599,8 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 	 */
 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
 
+	handle_disconnected_sps(kvm, &disconnected_sps);
+
 	return true;
 }
 
@@ -577,6 +625,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 			  u64 new_spte, bool record_acc_track,
 			  bool record_dirty_log)
 {
+	LIST_HEAD(disconnected_sps);
+
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
 	/*
@@ -591,7 +641,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
-			      new_spte, iter->level, false);
+			      new_spte, iter->level, false, &disconnected_sps);
 	if (record_acc_track)
 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
 					      iter->level);
@@ -599,6 +649,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
 					      iter->old_spte, new_spte,
 					      iter->level);
+
+	handle_disconnected_sps(kvm, &disconnected_sps);
 }
 
 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
When recursively handling a removed TDP page table, the TDP MMU will
flush the TLBs and queue an RCU callback to free the PT. If the original
change zapped a non-leaf SPTE at PG_LEVEL_1G or above, that change will
result in many unnecessary TLB flushes when one would suffice. Queue all
the PTs which need to be freed on a list and wait to queue RCU callbacks
to free them until after all the recursive callbacks are done.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 88 ++++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 18 deletions(-)
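Distilled from the message above, the batched teardown amounts to: accumulate
the disconnected page tables, flush once, then free. A sketch follows; the
list walk mirrors handle_disconnected_sps() in the patch, while the helper
name and signature are illustrative, not part of the series.

static void flush_and_free_disconnected(struct kvm *kvm, gfn_t gfn, int level,
					struct list_head *disconnected_sps)
{
	struct kvm_mmu_page *sp, *next;

	/* One flush covers every PT zapped under the original SPTE. */
	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	/* Only after the flush is it safe to queue the RCU frees. */
	list_for_each_entry_safe(sp, next, disconnected_sps, link) {
		list_del(&sp->link);
		call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
	}
}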