Message ID | 1475163356-3463-7-git-send-email-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Sep 29, 2016 at 04:35:49PM +0100, Tvrtko Ursulin wrote: > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > Instead of the existing linear search, now that we have sorted > range tables, we can do a binary search on them for some > potential minuscule performance gain, but more importantly > for elegance and code size. Hopefully the performance gain is > sufficient to offset the function calls which were not there > before. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > drivers/gpu/drm/i915/intel_uncore.c | 28 ++++++++++++++++++++-------- > 1 file changed, 20 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c > index bee1482a5ece..ae5edaea16f7 100644 > --- a/drivers/gpu/drm/i915/intel_uncore.c > +++ b/drivers/gpu/drm/i915/intel_uncore.c > @@ -26,6 +26,7 @@ > #include "i915_vgpu.h" > > #include <linux/pm_runtime.h> > +#include <linux/bsearch.h> > > #define FORCEWAKE_ACK_TIMEOUT_MS 50 > > @@ -589,20 +590,31 @@ struct intel_forcewake_range > enum forcewake_domains domains; > }; > > +static int fw_range_cmp(const void *key, const void *elt) > +{ > + struct intel_forcewake_range *entry = > + (struct intel_forcewake_range *)elt; > + u32 offset = (u32)((unsigned long)key); > + > + if (offset < entry->start) > + return -1; > + else if (offset > entry->end) > + return 1; > + else > + return 0; > +} > + > static enum forcewake_domains > find_fw_domain(u32 offset, const struct intel_forcewake_range *ranges, > unsigned int num_ranges) > { > - unsigned int i; > - struct intel_forcewake_range *entry = > - (struct intel_forcewake_range *)ranges; > + struct intel_forcewake_range *entry; > > - for (i = 0; i < num_ranges; i++, entry++) { > - if (offset >= entry->start && offset <= entry->end) > - return entry->domains; > - } > + entry = bsearch((void *)(unsigned long)offset, (const void *)ranges, > + num_ranges, sizeof(struct intel_forcewake_range), > + fw_range_cmp); How much for 
bsearch() to be turned into a generator macro? > - return -1; > + return entry ? entry->domains : -1; > } Looks ok, maybe pass in the default value to return if !entry, saves the double check. -Chris
On 29/09/2016 17:16, Chris Wilson wrote: > On Thu, Sep 29, 2016 at 04:35:49PM +0100, Tvrtko Ursulin wrote: >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> >> Instead of the existing linear search, now that we have sorted >> range tables, we can do a binary search on them for some >> potential minuscule performance gain, but more importantly >> for elegance and code size. Hopefully the performance gain is >> sufficient to offset the function calls which were not there >> before. >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> --- >> drivers/gpu/drm/i915/intel_uncore.c | 28 ++++++++++++++++++++-------- >> 1 file changed, 20 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c >> index bee1482a5ece..ae5edaea16f7 100644 >> --- a/drivers/gpu/drm/i915/intel_uncore.c >> +++ b/drivers/gpu/drm/i915/intel_uncore.c >> @@ -26,6 +26,7 @@ >> #include "i915_vgpu.h" >> >> #include <linux/pm_runtime.h> >> +#include <linux/bsearch.h> >> >> #define FORCEWAKE_ACK_TIMEOUT_MS 50 >> >> @@ -589,20 +590,31 @@ struct intel_forcewake_range >> enum forcewake_domains domains; >> }; >> >> +static int fw_range_cmp(const void *key, const void *elt) >> +{ >> + struct intel_forcewake_range *entry = >> + (struct intel_forcewake_range *)elt; >> + u32 offset = (u32)((unsigned long)key); >> + >> + if (offset < entry->start) >> + return -1; >> + else if (offset > entry->end) >> + return 1; >> + else >> + return 0; >> +} >> + >> static enum forcewake_domains >> find_fw_domain(u32 offset, const struct intel_forcewake_range *ranges, >> unsigned int num_ranges) >> { >> - unsigned int i; >> - struct intel_forcewake_range *entry = >> - (struct intel_forcewake_range *)ranges; >> + struct intel_forcewake_range *entry; >> >> - for (i = 0; i < num_ranges; i++, entry++) { >> - if (offset >= entry->start && offset <= entry->end) >> - return entry->domains; >> - } >> + entry = bsearch((void *)(unsigned long)offset, (const 
void *)ranges, >> + num_ranges, sizeof(struct intel_forcewake_range), >> + fw_range_cmp); > How much for bsearch() to be turned into a generator macro? By default it is a small code size win (128 bytes). It makes find_fw_domain a function with an inlined comparator (so one function call less per search iteration than using library bsearch) and inlines is_gen8_shadowed completely. Forcing find_fw_domain to be fully inline adds approximately 1k. I am not sure - you think it is worth doing some of the above? Function calls are supposed to be cheap so perhaps just with the default inlining, but then it is either pushing the core patch or having a local copy of a macro. >> - return -1; >> + return entry ? entry->domains : -1; >> } > Looks ok, maybe pass in the default value to return if !entry, saves the > double check. It goes away later in the series so it is fine. I just wanted to be gradual so it is easy to review. Regards, Tvrtko
On Fri, Sep 30, 2016 at 12:08:26PM +0100, Tvrtko Ursulin wrote: > > On 29/09/2016 17:16, Chris Wilson wrote: > >On Thu, Sep 29, 2016 at 04:35:49PM +0100, Tvrtko Ursulin wrote: > >>+ entry = bsearch((void *)(unsigned long)offset, (const void *)ranges, > >>+ num_ranges, sizeof(struct intel_forcewake_range), > >>+ fw_range_cmp); > >How much for bsearch() to be turned into a generator macro? > > By default it is a small code size win (128 bytes). It makes > find_fw_domain a function with an inlined comparator (so one > function call less per search iteration than using library bsearch) > and inlines is_gen8_shadowed completely. > > Forcing find_fw_domain to be fully inline adds approximately 1k. > > I am not sure - you think it is worth doing some of the above? > Function calls are supposed to be cheap so perhaps just with the > default inlining, but then it is either pushing the core patch or > having a local copy of a macro. But I wonder if we get better branch prediction from inlining. Happy enough with the default inlining; whilst we are in the noise compared to the actual fw and mmio, we might as well try to keep the irq paths trim at least. -Chris
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index bee1482a5ece..ae5edaea16f7 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -26,6 +26,7 @@ #include "i915_vgpu.h" #include <linux/pm_runtime.h> +#include <linux/bsearch.h> #define FORCEWAKE_ACK_TIMEOUT_MS 50 @@ -589,20 +590,31 @@ struct intel_forcewake_range enum forcewake_domains domains; }; +static int fw_range_cmp(const void *key, const void *elt) +{ + struct intel_forcewake_range *entry = + (struct intel_forcewake_range *)elt; + u32 offset = (u32)((unsigned long)key); + + if (offset < entry->start) + return -1; + else if (offset > entry->end) + return 1; + else + return 0; +} + static enum forcewake_domains find_fw_domain(u32 offset, const struct intel_forcewake_range *ranges, unsigned int num_ranges) { - unsigned int i; - struct intel_forcewake_range *entry = - (struct intel_forcewake_range *)ranges; + struct intel_forcewake_range *entry; - for (i = 0; i < num_ranges; i++, entry++) { - if (offset >= entry->start && offset <= entry->end) - return entry->domains; - } + entry = bsearch((void *)(unsigned long)offset, (const void *)ranges, + num_ranges, sizeof(struct intel_forcewake_range), + fw_range_cmp); - return -1; + return entry ? entry->domains : -1; } static void