diff mbox

[06/13] drm/i915: Use binary search when looking up forcewake domains

Message ID 1475163356-3463-7-git-send-email-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tvrtko Ursulin Sept. 29, 2016, 3:35 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Instead of the existing linear search, now that we have sorted
range tables, we can do a binary search on them for some
potential minuscule performance gain, but more importantly
for elegance and code size. Hopefully the performance gain is
sufficient to offset the function calls which were not there
before.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_uncore.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

Comments

Chris Wilson Sept. 29, 2016, 4:16 p.m. UTC | #1
On Thu, Sep 29, 2016 at 04:35:49PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Instead of the existing linear search, now that we have sorted
> range tables, we can do a binary search on them for some
> potential minuscule performance gain, but more importantly
> for elegance and code size. Hopefully the performance gain is
> sufficient to offset the function calls which were not there
> before.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_uncore.c | 28 ++++++++++++++++++++--------
>  1 file changed, 20 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index bee1482a5ece..ae5edaea16f7 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -26,6 +26,7 @@
>  #include "i915_vgpu.h"
>  
>  #include <linux/pm_runtime.h>
> +#include <linux/bsearch.h>
>  
>  #define FORCEWAKE_ACK_TIMEOUT_MS 50
>  
> @@ -589,20 +590,31 @@ struct intel_forcewake_range
>  	enum forcewake_domains domains;
>  };
>  
> +static int fw_range_cmp(const void *key, const void *elt)
> +{
> +	struct intel_forcewake_range *entry =
> +		(struct intel_forcewake_range *)elt;
> +	u32 offset = (u32)((unsigned long)key);
> +
> +	if (offset < entry->start)
> +		return -1;
> +	else if (offset > entry->end)
> +		return 1;
> +	else
> +		return 0;
> +}
> +
>  static enum forcewake_domains
>  find_fw_domain(u32 offset, const struct intel_forcewake_range *ranges,
>  	       unsigned int num_ranges)
>  {
> -	unsigned int i;
> -	struct intel_forcewake_range *entry =
> -		(struct intel_forcewake_range *)ranges;
> +	struct intel_forcewake_range *entry;
>  
> -	for (i = 0; i < num_ranges; i++, entry++) {
> -		if (offset >= entry->start && offset <= entry->end)
> -			return entry->domains;
> -	}
> +	entry = bsearch((void *)(unsigned long)offset, (const void *)ranges,
> +			num_ranges, sizeof(struct intel_forcewake_range),
> +			fw_range_cmp);

How much for bsearch() to be turned into a generator macro?

> -	return -1;
> +	return entry ? entry->domains : -1;
>  }

Looks ok, maybe pass in the default value to return if !entry, saves the
double check.
-Chris
Tvrtko Ursulin Sept. 30, 2016, 11:08 a.m. UTC | #2
On 29/09/2016 17:16, Chris Wilson wrote:
> On Thu, Sep 29, 2016 at 04:35:49PM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Instead of the existing linear search, now that we have sorted
>> range tables, we can do a binary search on them for some
>> potential minuscule performance gain, but more importantly
>> for elegance and code size. Hopefully the performance gain is
>> sufficient to offset the function calls which were not there
>> before.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/intel_uncore.c | 28 ++++++++++++++++++++--------
>>   1 file changed, 20 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
>> index bee1482a5ece..ae5edaea16f7 100644
>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>> @@ -26,6 +26,7 @@
>>   #include "i915_vgpu.h"
>>   
>>   #include <linux/pm_runtime.h>
>> +#include <linux/bsearch.h>
>>   
>>   #define FORCEWAKE_ACK_TIMEOUT_MS 50
>>   
>> @@ -589,20 +590,31 @@ struct intel_forcewake_range
>>   	enum forcewake_domains domains;
>>   };
>>   
>> +static int fw_range_cmp(const void *key, const void *elt)
>> +{
>> +	struct intel_forcewake_range *entry =
>> +		(struct intel_forcewake_range *)elt;
>> +	u32 offset = (u32)((unsigned long)key);
>> +
>> +	if (offset < entry->start)
>> +		return -1;
>> +	else if (offset > entry->end)
>> +		return 1;
>> +	else
>> +		return 0;
>> +}
>> +
>>   static enum forcewake_domains
>>   find_fw_domain(u32 offset, const struct intel_forcewake_range *ranges,
>>   	       unsigned int num_ranges)
>>   {
>> -	unsigned int i;
>> -	struct intel_forcewake_range *entry =
>> -		(struct intel_forcewake_range *)ranges;
>> +	struct intel_forcewake_range *entry;
>>   
>> -	for (i = 0; i < num_ranges; i++, entry++) {
>> -		if (offset >= entry->start && offset <= entry->end)
>> -			return entry->domains;
>> -	}
>> +	entry = bsearch((void *)(unsigned long)offset, (const void *)ranges,
>> +			num_ranges, sizeof(struct intel_forcewake_range),
>> +			fw_range_cmp);
> How much for bsearch() to be turned into a generator macro?

By default it is a small code size win (128 bytes). It makes 
find_fw_domain a function with an inlined comparator (so one function 
call less per search iteration than using library bsearch) and inlines 
is_gen8_shadowed completely.

Forcing find_fw_domain to be fully inline adds approximately 1k.

I am not sure - do you think it is worth doing some of the above?
Function calls are supposed to be cheap, so perhaps just go with the
default inlining, but then it means either pushing a patch to the core
bsearch code or having a local copy of a macro.

>> -	return -1;
>> +	return entry ? entry->domains : -1;
>>   }
> Looks ok, maybe pass in the default value to return if !entry, saves the
> double check.

It goes away later in the series so it is fine. I just wanted to be 
gradual so it is easy to review.

Regards,

Tvrtko
Chris Wilson Sept. 30, 2016, 11:22 a.m. UTC | #3
On Fri, Sep 30, 2016 at 12:08:26PM +0100, Tvrtko Ursulin wrote:
> 
> On 29/09/2016 17:16, Chris Wilson wrote:
> >On Thu, Sep 29, 2016 at 04:35:49PM +0100, Tvrtko Ursulin wrote:
> >>+	entry = bsearch((void *)(unsigned long)offset, (const void *)ranges,
> >>+			num_ranges, sizeof(struct intel_forcewake_range),
> >>+			fw_range_cmp);
> >How much for bsearch() to be turned into a generator macro?
> 
> By default it is a small code size win (128 bytes). It makes
> find_fw_domain a function with an inlined comparator (so one
> function call less per search iteration than using library bsearch)
> and inlines is_gen8_shadowed completely.
> 
> Forcing find_fw_domain to be fully inline adds approximately 1k.
> 
> I am not sure - you think it is worth doing some of the above?
> Function calls are supposed to be cheap so perhaps just with the
> default inlining, but then it is either pushing the core patch or
> having a local copy of a macro.

But I wonder if we get better branch prediction from inlining.

Happy enough with the default inlining, whilst we are in the noise
compared to the actual fw and mmio, we might as well try to keep the irq
paths trim at least.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index bee1482a5ece..ae5edaea16f7 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -26,6 +26,7 @@ 
 #include "i915_vgpu.h"
 
 #include <linux/pm_runtime.h>
+#include <linux/bsearch.h>
 
 #define FORCEWAKE_ACK_TIMEOUT_MS 50
 
@@ -589,20 +590,31 @@  struct intel_forcewake_range
 	enum forcewake_domains domains;
 };
 
+static int fw_range_cmp(const void *key, const void *elt)
+{
+	struct intel_forcewake_range *entry =
+		(struct intel_forcewake_range *)elt;
+	u32 offset = (u32)((unsigned long)key);
+
+	if (offset < entry->start)
+		return -1;
+	else if (offset > entry->end)
+		return 1;
+	else
+		return 0;
+}
+
 static enum forcewake_domains
 find_fw_domain(u32 offset, const struct intel_forcewake_range *ranges,
 	       unsigned int num_ranges)
 {
-	unsigned int i;
-	struct intel_forcewake_range *entry =
-		(struct intel_forcewake_range *)ranges;
+	struct intel_forcewake_range *entry;
 
-	for (i = 0; i < num_ranges; i++, entry++) {
-		if (offset >= entry->start && offset <= entry->end)
-			return entry->domains;
-	}
+	entry = bsearch((void *)(unsigned long)offset, (const void *)ranges,
+			num_ranges, sizeof(struct intel_forcewake_range),
+			fw_range_cmp);
 
-	return -1;
+	return entry ? entry->domains : -1;
 }
 
 static void