Message ID | 1517583559-424-1-git-send-email-dwmw@amazon.co.uk (mailing list archive) |
---|---|
State | New, archived |
> On 2. Feb 2018, at 15:59, David Woodhouse <dwmw@amazon.co.uk> wrote:
>
> With retpoline, tight loops of "call this function for every XXX" are
> very much pessimised by taking a prediction miss *every* time.
>
> This one showed up very high in our early testing, and it only has five
> things it'll ever call so make it take an 'op' enum instead of a
> function pointer and let's see how that works out...
>
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
>
> [...]

Let's add more context.

vmx_slot_disable_log_dirty() was already one of the bottlenecks on instance
launch (at least with our setup). With retpoline, it became horribly slow
(roughly twice as slow).

Up to now, we're using an ugly workaround that works for us but of course
isn't acceptable in the long run. I'm going to explore the issue further
early next week.

Filippo


Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
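For context, the hot pattern referred to above is the walker in slot_handle_level_range(): one indirect call through a slot_level_handler pointer per rmap head. The sketch below is not kernel code, just a minimal user-space illustration with hypothetical names of the same shape; with retpoline, the indirect call in the loop body goes through a thunk that defeats branch prediction on every iteration, so the cost scales with the number of rmap heads walked.

```c
#include <stdbool.h>
#include <stddef.h>

struct rmap_head { unsigned long val; };

/* Stands in for __rmap_clear_dirty(), __rmap_write_protect(), ... */
typedef bool (*slot_level_handler)(struct rmap_head *rmap_head);

static bool clear_dirty(struct rmap_head *r)
{
	return r->val != 0;	/* pretend some SPTEs were dirty */
}

/*
 * Stand-in for slot_handle_level_range(): one indirect call per rmap head.
 * With retpoline, every one of these calls is effectively a mispredicted
 * branch, paid on every trip around the loop.
 */
static bool walk_slot(struct rmap_head *heads, size_t n, slot_level_handler fn)
{
	bool flush = false;
	size_t i;

	for (i = 0; i < n; i++)
		flush |= fn(&heads[i]);

	return flush;
}
```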
On Fri, Feb 2, 2018 at 6:59 AM, David Woodhouse <dwmw@amazon.co.uk> wrote:
> With retpoline, tight loops of "call this function for every XXX" are
> very much pessimised by taking a prediction miss *every* time.
>
> This one showed up very high in our early testing, and it only has five
> things it'll ever call so make it take an 'op' enum instead of a
> function pointer and let's see how that works out...

Umm. May I suggest a different workaround?

Honestly, if this is so performance-critical, the *real* fix is to
actually just mark all those "slot_handle_*()" functions as
"always_inline".

Because that will *really* improve performance, by simply removing the
indirection entirely - since then the functions involved will become
static.

You might get other code improvements too, because I suspect it will end
up removing an extra level of function call due to those trivial wrapper
functions. And there's a couple of "bool lock_flush_tlb" arguments that
will simply become constant and generate much better code that way.

And maybe you don't want to inline all of the slot_handle_*() functions,
and it's only one or two of them that matter because they loop over a lot
of entries, but honestly, most of those slot_handle_xyz() functions seem
to have just a couple of call sites anyway. slot_handle_large_level() is
probably already inlined by the compiler because it only has a single
call-site.

Will it make for bigger code? Yes. But probably not really all *that*
much bigger, because of how it also will allow the compiler to simplify
some things. And if this really is so critical that those non-predicted
calls were that noticeable, those other simplifications probably also
matter.

And then you get rid of all run-time conditionals, and all the indirect
jumps entirely. Plus the patch will be smaller and simpler too.

Hmm?

Linus
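A rough user-space sketch of the suggestion above (hypothetical names, not the actual mmu.c code; in the kernel the annotation would be __always_inline): once the walker is forced inline into each caller, the handler pointer and the lock_flush_tlb flag become compile-time constants at that call site, so the compiler can emit a direct call (or inline the handler outright) and drop the dead flush branch.

```c
#include <stdbool.h>
#include <stddef.h>

struct rmap_head { unsigned long val; };

static bool clear_dirty(struct rmap_head *r)
{
	return r->val != 0;
}

/*
 * Forcing the walker inline makes 'fn' and 'lock_flush_tlb' constants at
 * every call site: the indirect call becomes a direct (or inlined) call and
 * the 'flush && lock_flush_tlb' test is resolved at compile time.
 */
static inline __attribute__((always_inline)) bool
walk_slot(struct rmap_head *heads, size_t n,
	  bool (*fn)(struct rmap_head *), bool lock_flush_tlb)
{
	bool flush = false;
	size_t i;

	for (i = 0; i < n; i++) {
		flush |= fn(&heads[i]);
		if (flush && lock_flush_tlb) {
			/* the real code flushes TLBs and yields the lock here */
			flush = false;
		}
	}
	return flush;
}

static bool clear_dirty_all(struct rmap_head *heads, size_t n)
{
	/* compiles down to a direct call to clear_dirty(), no retpoline thunk */
	return walk_slot(heads, n, clear_dirty, false);
}
```

The same effect is what makes the trivial wrapper functions essentially free once everything is inlined: the extra call level and the constant bool arguments simply disappear.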
> > On 2. Feb 2018, at 15:59, David Woodhouse <dwmw@amazon.co.uk> wrote:
> >
> > With retpoline, tight loops of "call this function for every XXX" are
> > very much pessimised by taking a prediction miss *every* time.
> >
> > This one showed up very high in our early testing, and it only has five
> > things it'll ever call so make it take an 'op' enum instead of a
> > function pointer and let's see how that works out...
> >
> > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>

What about __always_inline instead?

Thanks,

Paolo

> > [...]
>
> Let's add more context.
>
> vmx_slot_disable_log_dirty() was already one of the bottlenecks on instance
> launch (at least with our setup). With retpoline, it became horribly slow
> (roughly twice as slow).
>
> Up to now, we're using an ugly workaround that works for us but of course
> isn't acceptable in the long run. I'm going to explore the issue further
> early next week.
>
> Filippo
>
> [...]
On Fri, 2018-02-02 at 16:10 -0500, Paolo Bonzini wrote:
> > > On 2. Feb 2018, at 15:59, David Woodhouse <dwmw@amazon.co.uk> wrote:
> > >
> > > With retpoline, tight loops of "call this function for every XXX" are
> > > very much pessimised by taking a prediction miss *every* time.
> > >
> > > This one showed up very high in our early testing, and it only has five
> > > things it'll ever call so make it take an 'op' enum instead of a
> > > function pointer and let's see how that works out...
> > >
> > > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
>
> What about __always_inline instead?

Yeah, Linus suggested that and it looks like it generates sane (in fact
better) code. Will do some more testing and send it out for real probably
on Monday.

Thanks.
On 02/02/2018 19:50, Linus Torvalds wrote:
> On Fri, Feb 2, 2018 at 6:59 AM, David Woodhouse <dwmw@amazon.co.uk> wrote:
>> With retpoline, tight loops of "call this function for every XXX" are
>> very much pessimised by taking a prediction miss *every* time.
>>
>> This one showed up very high in our early testing, and it only has five
>> things it'll ever call so make it take an 'op' enum instead of a
>> function pointer and let's see how that works out...
>
> Umm. May I suggest a different workaround?
>
> Honestly, if this is so performance-critical, the *real* fix is to
> actually just mark all those "slot_handle_*()" functions as
> "always_inline".

I replied quickly from the phone before reading the rest of the thread ---
yeah, always_inline is the way to go.

I see the same differences as Linus and David (slight improvement for
slot_handle_*, +1k if you add kvm_handle_hva and kvm_handle_hva_range).
At least for slot_handle_* it's a no-brainer.

The others are basically the MMU notifier implementation; in the perfect
case it should actually never be called (or at least it ought to be very
rare), so I think we can keep the indirect calls for now.

Paolo
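To make that split concrete, here is a hedged user-space sketch (hypothetical names; the kernel's __always_inline is approximated with the GCC attribute): the hot walker is forced inline so its callers end up with direct calls, while a rarely-used path keeps the ordinary function-pointer walk and therefore is not duplicated into every caller.

```c
#include <stdbool.h>
#include <stddef.h>

/* Approximation of the kernel's __always_inline annotation. */
#define force_inline inline __attribute__((__always_inline__))

struct rmap_head { unsigned long val; };
typedef bool (*slot_level_handler)(struct rmap_head *rmap_head);

/* Hot path (slot_handle_* style): inlined, so each caller gets a direct call. */
static force_inline bool walk_hot(struct rmap_head *heads, size_t n,
				  slot_level_handler fn)
{
	bool flush = false;
	size_t i;

	for (i = 0; i < n; i++)
		flush |= fn(&heads[i]);
	return flush;
}

/*
 * Cold path (MMU-notifier style): expected to run rarely, so the indirect
 * call stays and the walker body is not copied into every caller.
 */
static bool walk_cold(struct rmap_head *heads, size_t n, slot_level_handler fn)
{
	bool flush = false;
	size_t i;

	for (i = 0; i < n; i++)
		flush |= fn(&heads[i]);
	return flush;
}
```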
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2b8eb4d..44f9de7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5055,12 +5055,21 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+enum slot_handler_op {
+        SLOT_RMAP_CLEAR_DIRTY,
+        SLOT_RMAP_SET_DIRTY,
+        SLOT_RMAP_WRITE_PROTECT,
+        SLOT_ZAP_RMAPP,
+        SLOT_ZAP_COLLAPSIBLE_SPTE,
+};
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+                                         struct kvm_rmap_head *rmap_head);
 
 /* The caller should hold mmu-lock before calling this function. */
 static bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                        slot_level_handler fn, int start_level, int end_level,
+                        enum slot_handler_op op, int start_level, int end_level,
                         gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
 {
         struct slot_rmap_walk_iterator iterator;
@@ -5068,8 +5077,29 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
                         end_gfn, &iterator) {
-                if (iterator.rmap)
-                        flush |= fn(kvm, iterator.rmap);
+                if (iterator.rmap) {
+                        switch (op) {
+                        case SLOT_RMAP_CLEAR_DIRTY:
+                                flush |= __rmap_clear_dirty(kvm, iterator.rmap);
+                                break;
+
+                        case SLOT_RMAP_SET_DIRTY:
+                                flush |= __rmap_set_dirty(kvm, iterator.rmap);
+                                break;
+
+                        case SLOT_RMAP_WRITE_PROTECT:
+                                flush |= __rmap_write_protect(kvm, iterator.rmap, false);
+                                break;
+
+                        case SLOT_ZAP_RMAPP:
+                                flush |= kvm_zap_rmapp(kvm, iterator.rmap);
+                                break;
+
+                        case SLOT_ZAP_COLLAPSIBLE_SPTE:
+                                flush |= kvm_mmu_zap_collapsible_spte(kvm, iterator.rmap);
+                                break;
+                        }
+                }
 
                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                         if (flush && lock_flush_tlb) {
@@ -5090,10 +5120,10 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 static bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                  slot_level_handler fn, int start_level, int end_level,
+                  enum slot_handler_op op, int start_level, int end_level,
                   bool lock_flush_tlb)
 {
-        return slot_handle_level_range(kvm, memslot, fn, start_level,
+        return slot_handle_level_range(kvm, memslot, op, start_level,
                         end_level, memslot->base_gfn,
                         memslot->base_gfn + memslot->npages - 1,
                         lock_flush_tlb);
@@ -5101,25 +5131,25 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 static bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                      slot_level_handler fn, bool lock_flush_tlb)
+                      enum slot_handler_op op, bool lock_flush_tlb)
 {
-        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+        return slot_handle_level(kvm, memslot, op, PT_PAGE_TABLE_LEVEL,
                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                        slot_level_handler fn, bool lock_flush_tlb)
+                        enum slot_handler_op op, bool lock_flush_tlb)
 {
-        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+        return slot_handle_level(kvm, memslot, op, PT_PAGE_TABLE_LEVEL + 1,
                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                 slot_level_handler fn, bool lock_flush_tlb)
+                 enum slot_handler_op op, bool lock_flush_tlb)
 {
-        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+        return slot_handle_level(kvm, memslot, op, PT_PAGE_TABLE_LEVEL,
                                  PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
 }
 
@@ -5140,7 +5170,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                 if (start >= end)
                         continue;
 
-                slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+                slot_handle_level_range(kvm, memslot, SLOT_ZAP_RMAPP,
                                 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
                                 start, end - 1, true);
         }
@@ -5149,19 +5179,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
         spin_unlock(&kvm->mmu_lock);
 }
 
-static bool slot_rmap_write_protect(struct kvm *kvm,
-                                    struct kvm_rmap_head *rmap_head)
-{
-        return __rmap_write_protect(kvm, rmap_head, false);
-}
-
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                                       struct kvm_memory_slot *memslot)
 {
         bool flush;
 
         spin_lock(&kvm->mmu_lock);
-        flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+        flush = slot_handle_all_level(kvm, memslot, SLOT_RMAP_WRITE_PROTECT,
                                       false);
         spin_unlock(&kvm->mmu_lock);
 
@@ -5226,7 +5250,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
         /* FIXME: const-ify all uses of struct kvm_memory_slot. */
         spin_lock(&kvm->mmu_lock);
         slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-                         kvm_mmu_zap_collapsible_spte, true);
+                         SLOT_ZAP_COLLAPSIBLE_SPTE, true);
         spin_unlock(&kvm->mmu_lock);
 }
 
@@ -5236,7 +5260,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
         bool flush;
 
         spin_lock(&kvm->mmu_lock);
-        flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+        flush = slot_handle_leaf(kvm, memslot, SLOT_RMAP_CLEAR_DIRTY, false);
         spin_unlock(&kvm->mmu_lock);
 
         lockdep_assert_held(&kvm->slots_lock);
@@ -5258,7 +5282,7 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
         bool flush;
 
         spin_lock(&kvm->mmu_lock);
-        flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+        flush = slot_handle_large_level(kvm, memslot, SLOT_RMAP_WRITE_PROTECT,
                                         false);
         spin_unlock(&kvm->mmu_lock);
 
@@ -5276,7 +5300,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
         bool flush;
 
         spin_lock(&kvm->mmu_lock);
-        flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+        flush = slot_handle_all_level(kvm, memslot, SLOT_RMAP_SET_DIRTY, false);
         spin_unlock(&kvm->mmu_lock);
 
         lockdep_assert_held(&kvm->slots_lock);
With retpoline, tight loops of "call this function for every XXX" are
very much pessimised by taking a prediction miss *every* time.

This one showed up very high in our early testing, and it only has five
things it'll ever call so make it take an 'op' enum instead of a
function pointer and let's see how that works out...

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
Not sure I like this. Better suggestions welcomed...

In the general case, we have a few things we can do with the calls that
retpoline turns into bottlenecks. This is one of them.

Another option, if there are just one or two "likely" functions, is
something along the lines of

        if (func == likelyfunc)
                likelyfunc()
        else
                (*func)();              // GCC does retpoline for this

For things like kvm_x86_ops we really could just turn *all* of those
into direct calls at runtime, like pvops does.

There are some which land somewhere in the middle, like the default
dma_ops. We probably want something like the 'likelyfunc' version
above, except that we *also* want to flip the likelyfunc between the
Intel and AMD IOMMU ops functions, at early boot. I'll see what I can
come up with...

 arch/x86/kvm/mmu.c | 72 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 24 deletions(-)
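The 'likelyfunc' alternative mentioned in the notes above, written out as a self-contained sketch (hypothetical names; the candidates actually discussed for that treatment are kvm_x86_ops and the default dma_ops): one expected target is compared and called directly, and only unexpected targets fall back to the indirect call that GCC compiles into a retpoline.

```c
#include <stdbool.h>

struct kvm_rmap_head { unsigned long val; };

typedef bool (*slot_level_handler)(struct kvm_rmap_head *rmap_head);

static bool likelyfunc(struct kvm_rmap_head *r)
{
	return r->val != 0;
}

/*
 * Conditional direct call: the common case is a compare plus a direct call,
 * which the branch predictor handles well; only unexpected targets fall back
 * to the indirect call that GCC turns into a retpoline thunk.
 */
static bool dispatch(slot_level_handler func, struct kvm_rmap_head *r)
{
	if (func == likelyfunc)
		return likelyfunc(r);	/* direct call, no retpoline */
	else
		return func(r);		/* (*func)() -- retpolined by GCC */
}
```

The pattern only pays off when one target really is dominant; with several equally likely targets, something like the enum/switch conversion in the patch or forced inlining of the caller is the more natural fix.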