diff mbox

of: use hash based search in of_find_node_by_phandle

Message ID 1516875247-19599-1-git-send-email-cpandya@codeaurora.org (mailing list archive)
State Not Applicable, archived
Delegated to: Andy Gross
Headers show

Commit Message

Chintan Pandya Jan. 25, 2018, 10:14 a.m. UTC
of_find_node_by_phandle() takes a lot of time to find the
right node when the intended device sits deep (far to the
right) in the fdt. The reason is that we search every device
serially in the fdt, from left-most to right-most.

Implement the device-phandle relation in a hash table so
that lookups can be faster.

Change-Id: I4a2bc7eff6de142e4f91a7bf474893a45e61c128
Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
---
 drivers/of/base.c  |  9 +++++++--
 drivers/of/fdt.c   | 18 ++++++++++++++++++
 include/linux/of.h |  6 ++++++
 3 files changed, 31 insertions(+), 2 deletions(-)

Comments

Rob Herring Jan. 25, 2018, 2:50 p.m. UTC | #1
On Thu, Jan 25, 2018 at 4:14 AM, Chintan Pandya <cpandya@codeaurora.org> wrote:
> of_find_node_by_phandle() takes a lot of time finding

Got some numbers for what is "a lot of time"?

> right node when your intended device is too right-side
> in the fdt. Reason is, we search each device serially
> from the fdt, starting from left-most to right-most.

By right side, you mean a deep path?

>
> Implement, device-phandle relation in hash-table so
> that look up can be faster.
>
> Change-Id: I4a2bc7eff6de142e4f91a7bf474893a45e61c128

Run checkpatch.pl

> Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
> ---
>  drivers/of/base.c  |  9 +++++++--
>  drivers/of/fdt.c   | 18 ++++++++++++++++++
>  include/linux/of.h |  6 ++++++
>  3 files changed, 31 insertions(+), 2 deletions(-)

[...]

> diff --git a/include/linux/of.h b/include/linux/of.h
> index 299aeb1..5b3f4f1 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -25,6 +25,7 @@
>  #include <linux/notifier.h>
>  #include <linux/property.h>
>  #include <linux/list.h>
> +#include <linux/hashtable.h>
>
>  #include <asm/byteorder.h>
>  #include <asm/errno.h>
> @@ -61,6 +62,7 @@ struct device_node {
>         struct  kobject kobj;
>         unsigned long _flags;
>         void    *data;
> +       struct hlist_node hash;

Always base your patches on the latest -rc at least. This won't apply.

This grows struct device_node for every single node which we recently
worked on to shrink (which is why this won't apply). So I'm now
sensitive to anything that grows it. I'd really prefer something out
of band.

I'd guess that there's really only a few phandle lookups that occur
over and over. The clock controller, interrupt controller, etc. What
if you just had a simple array of previously found nodes for a cache
and of_find_node_by_phandle can check that array first. Probably 8-16
entries would be enough. If that still has too much thrashing, you
could also have a lookup count for each entry and expel the least used
first. Or maybe the list_lru would work here.

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Frank Rowand Jan. 25, 2018, 7:54 p.m. UTC | #2
On 01/25/18 02:14, Chintan Pandya wrote:
> of_find_node_by_phandle() takes a lot of time finding
> right node when your intended device is too right-side
> in the fdt. Reason is, we search each device serially
> from the fdt, starting from left-most to right-most.

Please give me a pointer to the code that is doing
this search.

-Frank


> 
> Implement, device-phandle relation in hash-table so
> that look up can be faster.
> 
> Change-Id: I4a2bc7eff6de142e4f91a7bf474893a45e61c128
> Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
> ---
>  drivers/of/base.c  |  9 +++++++--
>  drivers/of/fdt.c   | 18 ++++++++++++++++++
>  include/linux/of.h |  6 ++++++
>  3 files changed, 31 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index a0bccb5..3e06316 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -25,6 +25,7 @@
>  #include <linux/cpu.h>
>  #include <linux/module.h>
>  #include <linux/of.h>
> +#include <linux/hashtable.h>
>  #include <linux/of_graph.h>
>  #include <linux/spinlock.h>
>  #include <linux/slab.h>
> @@ -1099,10 +1100,14 @@ struct device_node *of_find_node_by_phandle(phandle handle)
>  	if (!handle)
>  		return NULL;
>  
> -	raw_spin_lock_irqsave(&devtree_lock, flags);
> -	for_each_of_allnodes(np)
> +	spin_lock(&dt_hash_spinlock);
> +	hash_for_each_possible(dt_hash_table, np, hash, handle)
>  		if (np->phandle == handle)
>  			break;
> +
> +	spin_unlock(&dt_hash_spinlock);
> +
> +	raw_spin_lock_irqsave(&devtree_lock, flags);
>  	of_node_get(np);
>  	raw_spin_unlock_irqrestore(&devtree_lock, flags);
>  	return np;
> diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
> index c0914fb..f0e78a7 100644
> --- a/drivers/of/fdt.c
> +++ b/drivers/of/fdt.c
> @@ -31,6 +31,10 @@
>  #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
>  #include <asm/page.h>
>  
> +static bool dt_hash_needs_init = true;
> +DECLARE_HASHTABLE(dt_hash_table, DT_HASH_BITS);
> +DEFINE_SPINLOCK(dt_hash_spinlock);
> +
>  /*
>   * of_fdt_limit_memory - limit the number of regions in the /memory node
>   * @limit: maximum entries
> @@ -227,6 +231,20 @@ static void populate_properties(const void *blob,
>  		pprev      = &pp->next;
>  	}
>  
> +	/*
> +	 * In 'dryrun = true' cases, np is some non-NULL junk. So, protect
> +	 * against those cases.
> +	 */
> +	if (!dryrun && np->phandle) {
> +		spin_lock(&dt_hash_spinlock);
> +		if (dt_hash_needs_init) {
> +			dt_hash_needs_init = false;
> +			hash_init(dt_hash_table);
> +		}
> +		hash_add(dt_hash_table, &np->hash, np->phandle);
> +		spin_unlock(&dt_hash_spinlock);
> +	}
> +
>  	/* With version 0x10 we may not have the name property,
>  	 * recreate it here from the unit name if absent
>  	 */
> diff --git a/include/linux/of.h b/include/linux/of.h
> index 299aeb1..5b3f4f1 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -25,6 +25,7 @@
>  #include <linux/notifier.h>
>  #include <linux/property.h>
>  #include <linux/list.h>
> +#include <linux/hashtable.h>
>  
>  #include <asm/byteorder.h>
>  #include <asm/errno.h>
> @@ -61,6 +62,7 @@ struct device_node {
>  	struct	kobject kobj;
>  	unsigned long _flags;
>  	void	*data;
> +	struct hlist_node hash;
>  #if defined(CONFIG_SPARC)
>  	const char *path_component_name;
>  	unsigned int unique_id;
> @@ -68,6 +70,10 @@ struct device_node {
>  #endif
>  };
>  
> +#define DT_HASH_BITS 6
> +extern DECLARE_HASHTABLE(dt_hash_table, DT_HASH_BITS);
> +extern spinlock_t dt_hash_spinlock;
> +
>  #define MAX_PHANDLE_ARGS 16
>  struct of_phandle_args {
>  	struct device_node *np;
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chintan Pandya Jan. 26, 2018, 7:22 a.m. UTC | #3
On 1/25/2018 8:20 PM, Rob Herring wrote:
> On Thu, Jan 25, 2018 at 4:14 AM, Chintan Pandya <cpandya@codeaurora.org> wrote:
>> of_find_node_by_phandle() takes a lot of time finding
> Got some numbers for what is "a lot of time"?
On my SDM device, I see a total saving of 400ms during boot time. For
some clients whose nodes are quite deep, this API takes 1ms per call.
>
>> right node when your intended device is too right-side
>> in the fdt. Reason is, we search each device serially
>> from the fdt, starting from left-most to right-most.
> By right side, you mean a deep path?
Yes, will correct this if original is confusing.
>
>> Implement, device-phandle relation in hash-table so
>> that look up can be faster.
>>
>> Change-Id: I4a2bc7eff6de142e4f91a7bf474893a45e61c128
> Run checkpatch.pl
Sure. My bad.
>
>> @@ -61,6 +62,7 @@ struct device_node {
>>          struct  kobject kobj;
>>          unsigned long _flags;
>>          void    *data;
>> +       struct hlist_node hash;
> Always base your patches on the latest -rc at least. This won't apply.
Ok, sure.
>
> This grows struct device_node for every single node which we recently
> worked on to shrink (which is why this won't apply). So I'm now
> sensitive to anything that grows it. I'd really prefer something out
> of band.
>
> I'd guess that there's really only a few phandle lookups that occur
> over and over.
On my system, there are ~6.7k calls of this API during boot.

> The clock controller, interrupt controller, etc. What
> if you just had a simple array of previously found nodes for a cache
> and of_find_node_by_phandle can check that array first. Probably 8-16
> entries would be enough.
I clearly see repeated calls with the same phandle. But I have a few
hundred nodes.
I see hashing as a generic optimization which applies equally well to
DTs of all sizes.
Using ~4KB more memory to save 400ms is a good trade-off, I believe.


Chintan Pandya
Chintan Pandya Jan. 26, 2018, 8:22 a.m. UTC | #4
On 1/26/2018 1:24 AM, Frank Rowand wrote:
> On 01/25/18 02:14, Chintan Pandya wrote:
>> of_find_node_by_phandle() takes a lot of time finding
>> right node when your intended device is too right-side
>> in the fdt. Reason is, we search each device serially
>> from the fdt, starting from left-most to right-most.
> Please give me a pointer to the code that is doing
> this search.
>
> -Frank
You can refer include/linux/of.h

#define for_each_of_allnodes_from(from, dn) \
         for (dn = __of_find_all_nodes(from); dn; dn = 
__of_find_all_nodes(dn))
#define for_each_of_allnodes(dn) for_each_of_allnodes_from(NULL, dn)

where __of_find_all_nodes() does

struct device_node *__of_find_all_nodes(struct device_node *prev)
{
         struct device_node *np;
         if (!prev) {
                 np = of_root;
         } else if (prev->child) {
                 np = prev->child;
         } else {
                 /* Walk back up looking for a sibling, or the end of 
the structure */
                 np = prev;
                 while (np->parent && !np->sibling)
                         np = np->parent;
                 np = np->sibling; /* Might be null at the end of the 
tree */
         }
         return np;
}
Rob Herring Jan. 26, 2018, 3:21 p.m. UTC | #5
On Fri, Jan 26, 2018 at 1:22 AM, Chintan Pandya <cpandya@codeaurora.org> wrote:
> On 1/25/2018 8:20 PM, Rob Herring wrote:
>>
>> On Thu, Jan 25, 2018 at 4:14 AM, Chintan Pandya <cpandya@codeaurora.org>
>> wrote:
>>>

[...]

>> I'd guess that there's really only a few phandle lookups that occur
>> over and over.
>
> On my system, there are ~6.7k calls of this API during boot.

And after boot it will be near 0, yet we carry the memory usage forever.

>> The clock controller, interrupt controller, etc. What
>> if you just had a simple array of previously found nodes for a cache
>> and of_find_node_by_phandle can check that array first. Probably 8-16
>> entries would be enough.
>
> I clearly see repeat calling with same phandle. But I have few hundreds of
> nodes.
> I see hashing as generic optimization which applies equally good to all
> sized DT.
> Using ~4KB more size to save 400 ms is a good trade-off, I believe.

But if you can use 200 bytes and save 350 ms, that would be a better
trade off IMO. But we don't know because we have no data.

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Frank Rowand Jan. 26, 2018, 9:27 p.m. UTC | #6
On 01/26/18 00:22, Chintan Pandya wrote:
> 
> 
> On 1/26/2018 1:24 AM, Frank Rowand wrote:
>> On 01/25/18 02:14, Chintan Pandya wrote:
>>> of_find_node_by_phandle() takes a lot of time finding
>>> right node when your intended device is too right-side
>>> in the fdt. Reason is, we search each device serially
>>> from the fdt, starting from left-most to right-most.
>> Please give me a pointer to the code that is doing
>> this search.
>>
>> -Frank
> You can refer include/linux/of.h
> 
> #define for_each_of_allnodes_from(from, dn) \
>         for (dn = __of_find_all_nodes(from); dn; dn = __of_find_all_nodes(dn))
> #define for_each_of_allnodes(dn) for_each_of_allnodes_from(NULL, dn)
> 
> where __of_find_all_nodes() does
> 
> struct device_node *__of_find_all_nodes(struct device_node *prev)
> {
>         struct device_node *np;
>         if (!prev) {
>                 np = of_root;
>         } else if (prev->child) {
>                 np = prev->child;
>         } else {
>                 /* Walk back up looking for a sibling, or the end of the structure */
>                 np = prev;
>                 while (np->parent && !np->sibling)
>                         np = np->parent;
>                 np = np->sibling; /* Might be null at the end of the tree */
>         }
>         return np;
> }
> 

Let me restate my question.

Can you point me to the driver code that is invoking
the search?

-Frank
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Frank Rowand Jan. 26, 2018, 9:29 p.m. UTC | #7
On 01/26/18 13:27, Frank Rowand wrote:
> On 01/26/18 00:22, Chintan Pandya wrote:
>>
>>
>> On 1/26/2018 1:24 AM, Frank Rowand wrote:
>>> On 01/25/18 02:14, Chintan Pandya wrote:
>>>> of_find_node_by_phandle() takes a lot of time finding
>>>> right node when your intended device is too right-side
>>>> in the fdt. Reason is, we search each device serially
>>>> from the fdt, starting from left-most to right-most.
>>> Please give me a pointer to the code that is doing
>>> this search.
>>>
>>> -Frank
>> You can refer include/linux/of.h
>>
>> #define for_each_of_allnodes_from(from, dn) \
>>         for (dn = __of_find_all_nodes(from); dn; dn = __of_find_all_nodes(dn))
>> #define for_each_of_allnodes(dn) for_each_of_allnodes_from(NULL, dn)
>>
>> where __of_find_all_nodes() does
>>
>> struct device_node *__of_find_all_nodes(struct device_node *prev)
>> {
>>         struct device_node *np;
>>         if (!prev) {
>>                 np = of_root;
>>         } else if (prev->child) {
>>                 np = prev->child;
>>         } else {
>>                 /* Walk back up looking for a sibling, or the end of the structure */
>>                 np = prev;
>>                 while (np->parent && !np->sibling)
>>                         np = np->parent;
>>                 np = np->sibling; /* Might be null at the end of the tree */
>>         }
>>         return np;
>> }
>>
> 
> Let me restate my question.
> 
> Can you point me to the driver code that is invoking
> the search?
> 
> -Frank
> 

And also the .dts devicetree source file that you are seeing
large overhead with.
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Frank Rowand Jan. 26, 2018, 9:34 p.m. UTC | #8
On 01/26/18 13:29, Frank Rowand wrote:
> On 01/26/18 13:27, Frank Rowand wrote:
>> On 01/26/18 00:22, Chintan Pandya wrote:
>>>
>>>
>>> On 1/26/2018 1:24 AM, Frank Rowand wrote:
>>>> On 01/25/18 02:14, Chintan Pandya wrote:
>>>>> of_find_node_by_phandle() takes a lot of time finding
>>>>> right node when your intended device is too right-side
>>>>> in the fdt. Reason is, we search each device serially
>>>>> from the fdt, starting from left-most to right-most.
>>>> Please give me a pointer to the code that is doing
>>>> this search.
>>>>
>>>> -Frank
>>> You can refer include/linux/of.h
>>>
>>> #define for_each_of_allnodes_from(from, dn) \
>>>         for (dn = __of_find_all_nodes(from); dn; dn = __of_find_all_nodes(dn))
>>> #define for_each_of_allnodes(dn) for_each_of_allnodes_from(NULL, dn)
>>>
>>> where __of_find_all_nodes() does
>>>
>>> struct device_node *__of_find_all_nodes(struct device_node *prev)
>>> {
>>>         struct device_node *np;
>>>         if (!prev) {
>>>                 np = of_root;
>>>         } else if (prev->child) {
>>>                 np = prev->child;
>>>         } else {
>>>                 /* Walk back up looking for a sibling, or the end of the structure */
>>>                 np = prev;
>>>                 while (np->parent && !np->sibling)
>>>                         np = np->parent;
>>>                 np = np->sibling; /* Might be null at the end of the tree */
>>>         }
>>>         return np;
>>> }
>>>
>>
>> Let me restate my question.
>>
>> Can you point me to the driver code that is invoking
>> the search?
>>
>> -Frank
>>
> 
> And also the .dts devicetree source file that you are seeing
> large overhead with.
> 

Sorry about dribbling out questions instead of all at once....

What is the hardware you are testing this on?
Processor?
Cache size?
Memory size?
Processor frequency?
Any other attribute of the system that will help me understand
the boot performance you are seeing?

Thanks,

Frank
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/of/base.c b/drivers/of/base.c
index a0bccb5..3e06316 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -25,6 +25,7 @@ 
 #include <linux/cpu.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/hashtable.h>
 #include <linux/of_graph.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -1099,10 +1100,14 @@  struct device_node *of_find_node_by_phandle(phandle handle)
 	if (!handle)
 		return NULL;
 
-	raw_spin_lock_irqsave(&devtree_lock, flags);
-	for_each_of_allnodes(np)
+	spin_lock(&dt_hash_spinlock);
+	hash_for_each_possible(dt_hash_table, np, hash, handle)
 		if (np->phandle == handle)
 			break;
+
+	spin_unlock(&dt_hash_spinlock);
+
+	raw_spin_lock_irqsave(&devtree_lock, flags);
 	of_node_get(np);
 	raw_spin_unlock_irqrestore(&devtree_lock, flags);
 	return np;
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index c0914fb..f0e78a7 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -31,6 +31,10 @@ 
 #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
 #include <asm/page.h>
 
+static bool dt_hash_needs_init = true;
+DECLARE_HASHTABLE(dt_hash_table, DT_HASH_BITS);
+DEFINE_SPINLOCK(dt_hash_spinlock);
+
 /*
  * of_fdt_limit_memory - limit the number of regions in the /memory node
  * @limit: maximum entries
@@ -227,6 +231,20 @@  static void populate_properties(const void *blob,
 		pprev      = &pp->next;
 	}
 
+	/*
+	 * In 'dryrun = true' cases, np is some non-NULL junk. So, protect
+	 * against those cases.
+	 */
+	if (!dryrun && np->phandle) {
+		spin_lock(&dt_hash_spinlock);
+		if (dt_hash_needs_init) {
+			dt_hash_needs_init = false;
+			hash_init(dt_hash_table);
+		}
+		hash_add(dt_hash_table, &np->hash, np->phandle);
+		spin_unlock(&dt_hash_spinlock);
+	}
+
 	/* With version 0x10 we may not have the name property,
 	 * recreate it here from the unit name if absent
 	 */
diff --git a/include/linux/of.h b/include/linux/of.h
index 299aeb1..5b3f4f1 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -25,6 +25,7 @@ 
 #include <linux/notifier.h>
 #include <linux/property.h>
 #include <linux/list.h>
+#include <linux/hashtable.h>
 
 #include <asm/byteorder.h>
 #include <asm/errno.h>
@@ -61,6 +62,7 @@  struct device_node {
 	struct	kobject kobj;
 	unsigned long _flags;
 	void	*data;
+	struct hlist_node hash;
 #if defined(CONFIG_SPARC)
 	const char *path_component_name;
 	unsigned int unique_id;
@@ -68,6 +70,10 @@  struct device_node {
 #endif
 };
 
+#define DT_HASH_BITS 6
+extern DECLARE_HASHTABLE(dt_hash_table, DT_HASH_BITS);
+extern spinlock_t dt_hash_spinlock;
+
 #define MAX_PHANDLE_ARGS 16
 struct of_phandle_args {
 	struct device_node *np;