diff mbox

[v4,19/21] drivers/of: Support adding sub-tree

Message ID 1430460188-31343-20-git-send-email-gwshan@linux.vnet.ibm.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Gavin Shan May 1, 2015, 6:03 a.m. UTC
The requirement is raised when developing the PCI hotplug feature
for PowerPC PowerNV platform, which runs on top of skiboot firmware.
When plugging PCI adapter to one PCI slot, the firmware rescans the
slot and build FDT (Flat Device Tree) blob, which is sent to the
PowerNV PCI hotplug driver for processing. The new constructed device
nodes from the FDT blob are expected to be attached to the device
node of the PCI slot. Unfortunately, it seems we don't have a API
to support the scenario. The patch intends to support it by newly
introduced function of_fdt_add_subtree(), the design behind it is
shown as below:

   * When the sub-tree FDT blob, which is owned by firmware, is
     received by kernel. It's copied over to the blob, which is
     dynamically allocated. Since then, the FDT blob owned by
     firmware isn't touched.
   * Rework unflatten_dt_node() so that the device nodes in current
     and deeper depth have been constructed from the FDT blob. All
     device nodes are marked with flag OF_DYNAMIC_HYBIRD, which is
     similar to OF_DYNAMIC. However, device node with the flag set
     can be free'd, but in the way other than that for OF_DYNAMIC
     device nodes.
   * of_fdt_add_subtree() is the introduced API to do the work.

Cc: Grant Likely <grant.likely@linaro.org>
Cc: Rob Herring <robh+dt@kernel.org>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 drivers/of/dynamic.c   |  19 +++++--
 drivers/of/fdt.c       | 133 ++++++++++++++++++++++++++++++++++++++++---------
 include/linux/of.h     |   2 +
 include/linux/of_fdt.h |   1 +
 4 files changed, 127 insertions(+), 28 deletions(-)

Comments

Rob Herring May 1, 2015, 12:54 p.m. UTC | #1
+dt list

On Fri, May 1, 2015 at 1:03 AM, Gavin Shan <gwshan@linux.vnet.ibm.com> wrote:
> The requirement is raised when developing the PCI hotplug feature
> for PowerPC PowerNV platform, which runs on top of skiboot firmware.
> When plugging PCI adapter to one PCI slot, the firmware rescans the
> slot and build FDT (Flat Device Tree) blob, which is sent to the
> PowerNV PCI hotplug driver for processing. The new constructed device
> nodes from the FDT blob are expected to be attached to the device
> node of the PCI slot. Unfortunately, it seems we don't have a API
> to support the scenario. The patch intends to support it by newly
> introduced function of_fdt_add_subtree(), the design behind it is
> shown as below:
>
>    * When the sub-tree FDT blob, which is owned by firmware, is
>      received by kernel. It's copied over to the blob, which is
>      dynamically allocated. Since then, the FDT blob owned by
>      firmware isn't touched.
>    * Rework unflatten_dt_node() so that the device nodes in current
>      and deeper depth have been constructed from the FDT blob. All
>      device nodes are marked with flag OF_DYNAMIC_HYBIRD, which is

Perhaps you meant HYBRID?

>      similar to OF_DYNAMIC. However, device node with the flag set
>      can be free'd, but in the way other than that for OF_DYNAMIC
>      device nodes.

The difference seems to be whether you allocate space or just point to
the FDT for various strings/data. Is that right?

>    * of_fdt_add_subtree() is the introduced API to do the work.

Have you looked at overlays and if so why do they not work for your purposes?

Why do you need to do this with the flattened tree?

Rob

>
> Cc: Grant Likely <grant.likely@linaro.org>
> Cc: Rob Herring <robh+dt@kernel.org>
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
>  drivers/of/dynamic.c   |  19 +++++--
>  drivers/of/fdt.c       | 133 ++++++++++++++++++++++++++++++++++++++++---------
>  include/linux/of.h     |   2 +
>  include/linux/of_fdt.h |   1 +
>  4 files changed, 127 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> index 3351ef4..f562080 100644
> --- a/drivers/of/dynamic.c
> +++ b/drivers/of/dynamic.c
> @@ -330,13 +330,22 @@ void of_node_release(struct kobject *kobj)
>                 return;
>         }
>
> -       if (!of_node_check_flag(node, OF_DYNAMIC))
> +       /* Release the subtree */
> +       if (node->subtree) {
> +               kfree(node->subtree);
> +               node->subtree = NULL;
> +       }
> +
> +       if (!of_node_check_flag(node, OF_DYNAMIC) &&
> +           !of_node_check_flag(node, OF_DYNAMIC_HYBIRD))
>                 return;
>
>         while (prop) {
>                 struct property *next = prop->next;
> -               kfree(prop->name);
> -               kfree(prop->value);
> +               if (of_node_check_flag(node, OF_DYNAMIC)) {
> +                       kfree(prop->name);
> +                       kfree(prop->value);
> +               }
>                 kfree(prop);
>                 prop = next;
>
> @@ -345,7 +354,9 @@ void of_node_release(struct kobject *kobj)
>                         node->deadprops = NULL;
>                 }
>         }
> -       kfree(node->full_name);
> +
> +       if (of_node_check_flag(node, OF_DYNAMIC))
> +               kfree(node->full_name);
>         kfree(node->data);
>         kfree(node);
>  }
> diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
> index cde35c5d01..7659560 100644
> --- a/drivers/of/fdt.c
> +++ b/drivers/of/fdt.c
> @@ -28,6 +28,10 @@
>  #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
>  #include <asm/page.h>
>
> +#include "of_private.h"
> +
> +static int cur_node_depth;
> +
>  /*
>   * of_fdt_limit_memory - limit the number of regions in the /memory node
>   * @limit: maximum entries
> @@ -168,20 +172,20 @@ static void *unflatten_dt_alloc(void **mem, unsigned long size,
>   * @dad: Parent struct device_node
>   * @fpsize: Size of the node path up at the current depth.
>   */
> -static void * unflatten_dt_node(void *blob,
> -                               void *mem,
> -                               int *poffset,
> -                               struct device_node *dad,
> -                               struct device_node **nodepp,
> -                               unsigned long fpsize,
> -                               bool dryrun)
> +static void *unflatten_dt_node(void *blob,
> +                              void *mem,
> +                              int *poffset,
> +                              struct device_node *dad,
> +                              struct device_node **nodepp,
> +                              unsigned long fpsize,
> +                              bool dryrun,
> +                              bool dynamic)
>  {
>         const __be32 *p;
>         struct device_node *np;
>         struct property *pp, **prev_pp = NULL;
>         const char *pathp;
>         unsigned int l, allocl;
> -       static int depth = 0;
>         int old_depth;
>         int offset;
>         int has_name = 0;
> @@ -219,12 +223,18 @@ static void * unflatten_dt_node(void *blob,
>                 }
>         }
>
> -       np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl,
> +       if (dynamic)
> +               np = kzalloc(sizeof(struct device_node) + allocl, GFP_KERNEL);
> +       else
> +               np = unflatten_dt_alloc(&mem,
> +                               sizeof(struct device_node) + allocl,
>                                 __alignof__(struct device_node));
>         if (!dryrun) {
>                 char *fn;
>                 of_node_init(np);
>                 np->full_name = fn = ((char *)np) + sizeof(*np);
> +               if (dynamic)
> +                       of_node_set_flag(np, OF_DYNAMIC_HYBIRD);
>                 if (new_format) {
>                         /* rebuild full path for new format */
>                         if (dad && dad->parent) {
> @@ -267,8 +277,12 @@ static void * unflatten_dt_node(void *blob,
>                 }
>                 if (strcmp(pname, "name") == 0)
>                         has_name = 1;
> -               pp = unflatten_dt_alloc(&mem, sizeof(struct property),
> -                                       __alignof__(struct property));
> +
> +               if (dynamic)
> +                       pp = kzalloc(sizeof(struct property), GFP_KERNEL);
> +               else
> +                       pp = unflatten_dt_alloc(&mem, sizeof(struct property),
> +                                               __alignof__(struct property));
>                 if (!dryrun) {
>                         /* We accept flattened tree phandles either in
>                          * ePAPR-style "phandle" properties, or the
> @@ -309,8 +323,13 @@ static void * unflatten_dt_node(void *blob,
>                 if (pa < ps)
>                         pa = p1;
>                 sz = (pa - ps) + 1;
> -               pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz,
> -                                       __alignof__(struct property));
> +
> +               if (dynamic)
> +                       pp = kzalloc(sizeof(struct property) + sz, GFP_KERNEL);
> +               else
> +                       pp = unflatten_dt_alloc(&mem,
> +                                               sizeof(struct property) + sz,
> +                                               __alignof__(struct property));
>                 if (!dryrun) {
>                         pp->name = "name";
>                         pp->length = sz;
> @@ -334,13 +353,21 @@ static void * unflatten_dt_node(void *blob,
>                         np->type = "<NULL>";
>         }
>
> -       old_depth = depth;
> -       *poffset = fdt_next_node(blob, *poffset, &depth);
> -       if (depth < 0)
> -               depth = 0;
> -       while (*poffset > 0 && depth > old_depth)
> -               mem = unflatten_dt_node(blob, mem, poffset, np, NULL,
> -                                       fpsize, dryrun);
> +       old_depth = cur_node_depth;
> +       *poffset = fdt_next_node(blob, *poffset, &cur_node_depth);
> +       while (*poffset > 0) {
> +               if (cur_node_depth < old_depth)
> +                       break;
> +
> +               if (cur_node_depth == old_depth)
> +                       mem = unflatten_dt_node(blob, mem, poffset,
> +                                               dad, NULL, fpsize,
> +                                               dryrun, dynamic);
> +               else if (cur_node_depth > old_depth)
> +                       mem = unflatten_dt_node(blob, mem, poffset,
> +                                               np, NULL, fpsize,
> +                                               dryrun, dynamic);
> +       }
>
>         if (*poffset < 0 && *poffset != -FDT_ERR_NOTFOUND)
>                 pr_err("unflatten: error %d processing FDT\n", *poffset);
> @@ -379,8 +406,8 @@ static void * unflatten_dt_node(void *blob,
>   * for the resulting tree
>   */
>  static void __unflatten_device_tree(void *blob,
> -                            struct device_node **mynodes,
> -                            void * (*dt_alloc)(u64 size, u64 align))
> +                               struct device_node **mynodes,
> +                               void * (*dt_alloc)(u64 size, u64 align))
>  {
>         unsigned long size;
>         int start;
> @@ -405,7 +432,9 @@ static void __unflatten_device_tree(void *blob,
>
>         /* First pass, scan for size */
>         start = 0;
> -       size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL, NULL, 0, true);
> +       cur_node_depth = 1;
> +       size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL,
> +                                               NULL, 0, true, false);
>         size = ALIGN(size, 4);
>
>         pr_debug("  size is %lx, allocating...\n", size);
> @@ -420,7 +449,8 @@ static void __unflatten_device_tree(void *blob,
>
>         /* Second pass, do actual unflattening */
>         start = 0;
> -       unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false);
> +       cur_node_depth = 1;
> +       unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false, false);
>         if (be32_to_cpup(mem + size) != 0xdeadbeef)
>                 pr_warning("End of tree marker overwritten: %08x\n",
>                            be32_to_cpup(mem + size));
> @@ -448,6 +478,61 @@ void of_fdt_unflatten_tree(unsigned long *blob,
>  }
>  EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree);
>
> +static void populate_sysfs_for_child_nodes(struct device_node *parent)
> +{
> +       struct device_node *child;
> +
> +       for_each_child_of_node(parent, child) {
> +               __of_attach_node_sysfs(child);
> +               populate_sysfs_for_child_nodes(child);
> +       }
> +}
> +
> +/**
> + * of_fdt_add_substree - Create sub-tree of device nodes
> + * @parent: parent device node to which the sub-tree will attach
> + * @blob: flat device tree blob representing the sub-tree
> + *
> + * Copy over the FDT blob, which passed from firmware, and then
> + * unflatten the sub-tree.
> + */
> +void of_fdt_add_subtree(struct device_node *parent, void *blob)
> +{
> +       int start = 0;
> +
> +       /* Validate the header */
> +       if (!blob || fdt_check_header(blob)) {
> +               pr_err("%s: Invalid device-tree blob header at 0x%p\n",
> +                      __func__, blob);
> +               return;
> +       }
> +
> +       /* Free the flat blob for last time lazily */
> +       if (parent->subtree) {
> +               kfree(parent->subtree);
> +               parent->subtree = NULL;
> +       }
> +
> +       /* Copy over the flat blob */
> +       parent->subtree = kzalloc(fdt_totalsize(blob), GFP_KERNEL);
> +       if (!parent->subtree) {
> +               pr_err("%s: Cannot copy over device-tree blob\n",
> +                      __func__);
> +               return;
> +       }
> +
> +       memcpy(parent->subtree, blob, fdt_totalsize(blob));
> +
> +       /* Unflatten it */
> +       mutex_lock(&of_mutex);
> +       cur_node_depth = 1;
> +       unflatten_dt_node(parent->subtree, NULL, &start, parent, NULL,
> +                         strlen(parent->full_name), false, true);
> +       populate_sysfs_for_child_nodes(parent);
> +       mutex_unlock(&of_mutex);
> +}
> +EXPORT_SYMBOL(of_fdt_add_subtree);
> +
>  /* Everything below here references initial_boot_params directly. */
>  int __initdata dt_root_addr_cells;
>  int __initdata dt_root_size_cells;
> diff --git a/include/linux/of.h b/include/linux/of.h
> index ddeaae6..ac50b02 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -60,6 +60,7 @@ struct device_node {
>         struct  device_node *sibling;
>         struct  kobject kobj;
>         unsigned long _flags;
> +       void    *subtree;
>         void    *data;
>  #if defined(CONFIG_SPARC)
>         const char *path_component_name;
> @@ -222,6 +223,7 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size)
>  #define OF_DETACHED    2 /* node has been detached from the device tree */
>  #define OF_POPULATED   3 /* device already created for the node */
>  #define OF_POPULATED_BUS       4 /* of_platform_populate recursed to children of this node */
> +#define OF_DYNAMIC_HYBIRD      5 /* similar to OF_DYNAMIC, but partially */
>
>  #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
>  #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
> diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
> index 587ee50..1fb47d7 100644
> --- a/include/linux/of_fdt.h
> +++ b/include/linux/of_fdt.h
> @@ -39,6 +39,7 @@ extern int of_fdt_match(const void *blob, unsigned long node,
>                         const char *const *compat);
>  extern void of_fdt_unflatten_tree(unsigned long *blob,
>                                struct device_node **mynodes);
> +extern void of_fdt_add_subtree(struct device_node *parent, void *blob);
>
>  /* TBD: Temporary export of fdt globals - remove when code fully merged */
>  extern int __initdata dt_root_addr_cells;
> --
> 2.1.0
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 1, 2015, 3:22 p.m. UTC | #2
On Fri, 2015-05-01 at 07:54 -0500, Rob Herring wrote:

> The difference seems to be whether you allocate space or just point to
> the FDT for various strings/data. Is that right?
> 
> >    * of_fdt_add_subtree() is the introduced API to do the work.
> 
> Have you looked at overlays and if so why do they not work for your purposes?
> 
> Why do you need to do this with the flattened tree?

The basic idea I asked Gavin to implement is that since the FW needs to
provide a bunch of DT updates to Linux at runtime in the form of new
nodes below an existing one, rather than doing it via some new/custom
format, instead, have it send a bit of FDT blob to expand under an
existing node.

As for the details of Gavin implementation, I haven't looked at it in
details yet so there might be issues there, however I don't know what
you mean by "overlays", any pointer ?

Cheers,
Ben.




--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rob Herring May 1, 2015, 6:46 p.m. UTC | #3
On Fri, May 1, 2015 at 10:22 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Fri, 2015-05-01 at 07:54 -0500, Rob Herring wrote:
>
>> The difference seems to be whether you allocate space or just point to
>> the FDT for various strings/data. Is that right?
>>
>> >    * of_fdt_add_subtree() is the introduced API to do the work.
>>
>> Have you looked at overlays and if so why do they not work for your purposes?
>>
>> Why do you need to do this with the flattened tree?
>
> The basic idea I asked Gavin to implement is that since the FW needs to
> provide a bunch of DT updates to Linux at runtime in the form of new
> nodes below an existing one, rather than doing it via some new/custom
> format, instead, have it send a bit of FDT blob to expand under an
> existing node.

Overlay = an FDT blob to graft into a live running system. Sounds like
the same thing.

> As for the details of Gavin implementation, I haven't looked at it in
> details yet so there might be issues there, however I don't know what
> you mean by "overlays", any pointer ?

CONFIG_OF_OVERLAY

http://events.linuxfoundation.org/sites/events/files/slides/dynamic-dt-keynote-v3.pdf

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 1, 2015, 10:57 p.m. UTC | #4
On Fri, 2015-05-01 at 13:46 -0500, Rob Herring wrote:
> On Fri, May 1, 2015 at 10:22 AM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Fri, 2015-05-01 at 07:54 -0500, Rob Herring wrote:
> >
> >> The difference seems to be whether you allocate space or just point to
> >> the FDT for various strings/data. Is that right?
> >>
> >> >    * of_fdt_add_subtree() is the introduced API to do the work.
> >>
> >> Have you looked at overlays and if so why do they not work for your purposes?
> >>
> >> Why do you need to do this with the flattened tree?
> >
> > The basic idea I asked Gavin to implement is that since the FW needs to
> > provide a bunch of DT updates to Linux at runtime in the form of new
> > nodes below an existing one, rather than doing it via some new/custom
> > format, instead, have it send a bit of FDT blob to expand under an
> > existing node.
> 
> Overlay = an FDT blob to graft into a live running system. Sounds like
> the same thing.
> 
> > As for the details of Gavin implementation, I haven't looked at it in
> > details yet so there might be issues there, however I don't know what
> > you mean by "overlays", any pointer ?
> 
> CONFIG_OF_OVERLAY
> 
> http://events.linuxfoundation.org/sites/events/files/slides/dynamic-dt-keynote-v3.pdf

Well, that looks horrendously complicated, poorly documented and totally
unused in-tree outside of the unittest stuff, yay ! It has all sort of
"features" that I don't really care about.

I still don't see what it buys me other than making my FW a lot more
complex having to generate all that additional fixup etc... crap that I
don't totally get yet.

What's wrong with just unflattening the nodes in place ? The DT comes
from the FW in the first place so all the phandles are already good in
the new added blob. Internally, the FW created new nodes in its internal
representation and flattened the subtree and sends that subtree to
Linux.

I don't plan to play "revert" either, if you unplug, I do need to remove
what's under the slot but that's true of boot time devices, not just
"new" ones, so the overlay stuff won't do the trick and I certainly
don't want to keep track...

Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 1, 2015, 11:29 p.m. UTC | #5
On Sat, 2015-05-02 at 08:57 +1000, Benjamin Herrenschmidt wrote:

> > Overlay = an FDT blob to graft into a live running system. Sounds like
> > the same thing.
> > 
> > > As for the details of Gavin implementation, I haven't looked at it in
> > > details yet so there might be issues there, however I don't know what
> > > you mean by "overlays", any pointer ?
> > 
> > CONFIG_OF_OVERLAY
> > 
> > http://events.linuxfoundation.org/sites/events/files/slides/dynamic-dt-keynote-v3.pdf
> 
> Well, that looks horrendously complicated, poorly documented and totally
> unused in-tree outside of the unittest stuff, yay ! It has all sort of
> "features" that I don't really care about.

Looking a bit more at it, I don't quite see how I can attach a subtree
using that stuff.

Instead, each node in the overlay seems to need extra nodes and
properties to refer to the original.

So the FW would essentially have to create something a lot more complex
than just reflattening a bit of its internal tree. For each internal
node, it will need to add all those __overlay__ nodes and properties.

That is not going to fly for me at all. It's order of magnitudes more
complex than the solution we are pursuing.

So I think for our use case, we should continue in the direction of
having a helper to unflatten a piece of FDT underneath an existing
node. I don't like the "HYBRID" stuff though, we should not refer to
the original FDT, we should just make them normal dynamic nodes.

Ben.



--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 2, 2015, 2:48 a.m. UTC | #6
On Sat, 2015-05-02 at 09:29 +1000, Benjamin Herrenschmidt wrote:

> Looking a bit more at it, I don't quite see how I can attach a subtree
> using that stuff.
> 
> Instead, each node in the overlay seems to need extra nodes and
> properties to refer to the original.
> 
> So the FW would essentially have to create something a lot more complex
> than just reflattening a bit of its internal tree. For each internal
> node, it will need to add all those __overlay__ nodes and properties.
> 
> That is not going to fly for me at all. It's order of magnitudes more
> complex than the solution we are pursuing.
> 
> So I think for our use case, we should continue in the direction of
> having a helper to unflatten a piece of FDT underneath an existing
> node. I don't like the "HYBRID" stuff though, we should not refer to
> the original FDT, we should just make them normal dynamic nodes.

A bit more thought... if we were to use the overlay stuff, Gavin, what
we *could* do is add to OPAL FW internal representation a generation
count to every node and property.

That way we could essentially know whenever something's changed from
what we flattened originally for the kernel.

We can then create a generic (not PCI specific) call that generates
an overlay tree for every node and property that has a generation
count that is newer than what was flattened (or passed by the OS).

It's still a LOT more complex than what we need though...

Cheers,
Ben.



--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan May 3, 2015, 11:28 p.m. UTC | #7
On Fri, May 01, 2015 at 07:54:03AM -0500, Rob Herring wrote:
>+dt list
>
>On Fri, May 1, 2015 at 1:03 AM, Gavin Shan <gwshan@linux.vnet.ibm.com> wrote:
>> The requirement is raised when developing the PCI hotplug feature
>> for PowerPC PowerNV platform, which runs on top of skiboot firmware.
>> When plugging PCI adapter to one PCI slot, the firmware rescans the
>> slot and build FDT (Flat Device Tree) blob, which is sent to the
>> PowerNV PCI hotplug driver for processing. The new constructed device
>> nodes from the FDT blob are expected to be attached to the device
>> node of the PCI slot. Unfortunately, it seems we don't have a API
>> to support the scenario. The patch intends to support it by newly
>> introduced function of_fdt_add_subtree(), the design behind it is
>> shown as below:
>>
>>    * When the sub-tree FDT blob, which is owned by firmware, is
>>      received by kernel. It's copied over to the blob, which is
>>      dynamically allocated. Since then, the FDT blob owned by
>>      firmware isn't touched.
>>    * Rework unflatten_dt_node() so that the device nodes in current
>>      and deeper depth have been constructed from the FDT blob. All
>>      device nodes are marked with flag OF_DYNAMIC_HYBIRD, which is
>
>Perhaps you meant HYBRID?
>

Yeah, It should be "HYBRID".

>>      similar to OF_DYNAMIC. However, device node with the flag set
>>      can be free'd, but in the way other than that for OF_DYNAMIC
>>      device nodes.
>
>The difference seems to be whether you allocate space or just point to
>the FDT for various strings/data. Is that right?
>

It's correct. The FDT blob passed from firmware is copied by kernel to
the memory chunk, which is allocated from slab. That means the FDT blob
managed by firmware can be released in time. In kernel, the instances of
"struct device_node" and "struct property" are allocated from slab
dynamically, but some of their fields are points to the (copied) FDT
blob. It indicates the (copied) FDT can only be released when the sub-tree
is cut off completely.


>>    * of_fdt_add_subtree() is the introduced API to do the work.
>
>Have you looked at overlays and if so why do they not work for your purposes?
>
>Why do you need to do this with the flattened tree?
>

It seems that Ben already helped answering the questions. I'll reply
in other threads if necessary. Rob, thanks for review.

Thanks,
Gavin

>Rob
>
>>
>> Cc: Grant Likely <grant.likely@linaro.org>
>> Cc: Rob Herring <robh+dt@kernel.org>
>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> ---
>>  drivers/of/dynamic.c   |  19 +++++--
>>  drivers/of/fdt.c       | 133 ++++++++++++++++++++++++++++++++++++++++---------
>>  include/linux/of.h     |   2 +
>>  include/linux/of_fdt.h |   1 +
>>  4 files changed, 127 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>> index 3351ef4..f562080 100644
>> --- a/drivers/of/dynamic.c
>> +++ b/drivers/of/dynamic.c
>> @@ -330,13 +330,22 @@ void of_node_release(struct kobject *kobj)
>>                 return;
>>         }
>>
>> -       if (!of_node_check_flag(node, OF_DYNAMIC))
>> +       /* Release the subtree */
>> +       if (node->subtree) {
>> +               kfree(node->subtree);
>> +               node->subtree = NULL;
>> +       }
>> +
>> +       if (!of_node_check_flag(node, OF_DYNAMIC) &&
>> +           !of_node_check_flag(node, OF_DYNAMIC_HYBIRD))
>>                 return;
>>
>>         while (prop) {
>>                 struct property *next = prop->next;
>> -               kfree(prop->name);
>> -               kfree(prop->value);
>> +               if (of_node_check_flag(node, OF_DYNAMIC)) {
>> +                       kfree(prop->name);
>> +                       kfree(prop->value);
>> +               }
>>                 kfree(prop);
>>                 prop = next;
>>
>> @@ -345,7 +354,9 @@ void of_node_release(struct kobject *kobj)
>>                         node->deadprops = NULL;
>>                 }
>>         }
>> -       kfree(node->full_name);
>> +
>> +       if (of_node_check_flag(node, OF_DYNAMIC))
>> +               kfree(node->full_name);
>>         kfree(node->data);
>>         kfree(node);
>>  }
>> diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
>> index cde35c5d01..7659560 100644
>> --- a/drivers/of/fdt.c
>> +++ b/drivers/of/fdt.c
>> @@ -28,6 +28,10 @@
>>  #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
>>  #include <asm/page.h>
>>
>> +#include "of_private.h"
>> +
>> +static int cur_node_depth;
>> +
>>  /*
>>   * of_fdt_limit_memory - limit the number of regions in the /memory node
>>   * @limit: maximum entries
>> @@ -168,20 +172,20 @@ static void *unflatten_dt_alloc(void **mem, unsigned long size,
>>   * @dad: Parent struct device_node
>>   * @fpsize: Size of the node path up at the current depth.
>>   */
>> -static void * unflatten_dt_node(void *blob,
>> -                               void *mem,
>> -                               int *poffset,
>> -                               struct device_node *dad,
>> -                               struct device_node **nodepp,
>> -                               unsigned long fpsize,
>> -                               bool dryrun)
>> +static void *unflatten_dt_node(void *blob,
>> +                              void *mem,
>> +                              int *poffset,
>> +                              struct device_node *dad,
>> +                              struct device_node **nodepp,
>> +                              unsigned long fpsize,
>> +                              bool dryrun,
>> +                              bool dynamic)
>>  {
>>         const __be32 *p;
>>         struct device_node *np;
>>         struct property *pp, **prev_pp = NULL;
>>         const char *pathp;
>>         unsigned int l, allocl;
>> -       static int depth = 0;
>>         int old_depth;
>>         int offset;
>>         int has_name = 0;
>> @@ -219,12 +223,18 @@ static void * unflatten_dt_node(void *blob,
>>                 }
>>         }
>>
>> -       np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl,
>> +       if (dynamic)
>> +               np = kzalloc(sizeof(struct device_node) + allocl, GFP_KERNEL);
>> +       else
>> +               np = unflatten_dt_alloc(&mem,
>> +                               sizeof(struct device_node) + allocl,
>>                                 __alignof__(struct device_node));
>>         if (!dryrun) {
>>                 char *fn;
>>                 of_node_init(np);
>>                 np->full_name = fn = ((char *)np) + sizeof(*np);
>> +               if (dynamic)
>> +                       of_node_set_flag(np, OF_DYNAMIC_HYBIRD);
>>                 if (new_format) {
>>                         /* rebuild full path for new format */
>>                         if (dad && dad->parent) {
>> @@ -267,8 +277,12 @@ static void * unflatten_dt_node(void *blob,
>>                 }
>>                 if (strcmp(pname, "name") == 0)
>>                         has_name = 1;
>> -               pp = unflatten_dt_alloc(&mem, sizeof(struct property),
>> -                                       __alignof__(struct property));
>> +
>> +               if (dynamic)
>> +                       pp = kzalloc(sizeof(struct property), GFP_KERNEL);
>> +               else
>> +                       pp = unflatten_dt_alloc(&mem, sizeof(struct property),
>> +                                               __alignof__(struct property));
>>                 if (!dryrun) {
>>                         /* We accept flattened tree phandles either in
>>                          * ePAPR-style "phandle" properties, or the
>> @@ -309,8 +323,13 @@ static void * unflatten_dt_node(void *blob,
>>                 if (pa < ps)
>>                         pa = p1;
>>                 sz = (pa - ps) + 1;
>> -               pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz,
>> -                                       __alignof__(struct property));
>> +
>> +               if (dynamic)
>> +                       pp = kzalloc(sizeof(struct property) + sz, GFP_KERNEL);
>> +               else
>> +                       pp = unflatten_dt_alloc(&mem,
>> +                                               sizeof(struct property) + sz,
>> +                                               __alignof__(struct property));
>>                 if (!dryrun) {
>>                         pp->name = "name";
>>                         pp->length = sz;
>> @@ -334,13 +353,21 @@ static void * unflatten_dt_node(void *blob,
>>                         np->type = "<NULL>";
>>         }
>>
>> -       old_depth = depth;
>> -       *poffset = fdt_next_node(blob, *poffset, &depth);
>> -       if (depth < 0)
>> -               depth = 0;
>> -       while (*poffset > 0 && depth > old_depth)
>> -               mem = unflatten_dt_node(blob, mem, poffset, np, NULL,
>> -                                       fpsize, dryrun);
>> +       old_depth = cur_node_depth;
>> +       *poffset = fdt_next_node(blob, *poffset, &cur_node_depth);
>> +       while (*poffset > 0) {
>> +               if (cur_node_depth < old_depth)
>> +                       break;
>> +
>> +               if (cur_node_depth == old_depth)
>> +                       mem = unflatten_dt_node(blob, mem, poffset,
>> +                                               dad, NULL, fpsize,
>> +                                               dryrun, dynamic);
>> +               else if (cur_node_depth > old_depth)
>> +                       mem = unflatten_dt_node(blob, mem, poffset,
>> +                                               np, NULL, fpsize,
>> +                                               dryrun, dynamic);
>> +       }
>>
>>         if (*poffset < 0 && *poffset != -FDT_ERR_NOTFOUND)
>>                 pr_err("unflatten: error %d processing FDT\n", *poffset);
>> @@ -379,8 +406,8 @@ static void * unflatten_dt_node(void *blob,
>>   * for the resulting tree
>>   */
>>  static void __unflatten_device_tree(void *blob,
>> -                            struct device_node **mynodes,
>> -                            void * (*dt_alloc)(u64 size, u64 align))
>> +                               struct device_node **mynodes,
>> +                               void * (*dt_alloc)(u64 size, u64 align))
>>  {
>>         unsigned long size;
>>         int start;
>> @@ -405,7 +432,9 @@ static void __unflatten_device_tree(void *blob,
>>
>>         /* First pass, scan for size */
>>         start = 0;
>> -       size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL, NULL, 0, true);
>> +       cur_node_depth = 1;
>> +       size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL,
>> +                                               NULL, 0, true, false);
>>         size = ALIGN(size, 4);
>>
>>         pr_debug("  size is %lx, allocating...\n", size);
>> @@ -420,7 +449,8 @@ static void __unflatten_device_tree(void *blob,
>>
>>         /* Second pass, do actual unflattening */
>>         start = 0;
>> -       unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false);
>> +       cur_node_depth = 1;
>> +       unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false, false);
>>         if (be32_to_cpup(mem + size) != 0xdeadbeef)
>>                 pr_warning("End of tree marker overwritten: %08x\n",
>>                            be32_to_cpup(mem + size));
>> @@ -448,6 +478,61 @@ void of_fdt_unflatten_tree(unsigned long *blob,
>>  }
>>  EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree);
>>
>> +static void populate_sysfs_for_child_nodes(struct device_node *parent)
>> +{
>> +       struct device_node *child;
>> +
>> +       for_each_child_of_node(parent, child) {
>> +               __of_attach_node_sysfs(child);
>> +               populate_sysfs_for_child_nodes(child);
>> +       }
>> +}
>> +
>> +/**
>> + * of_fdt_add_substree - Create sub-tree of device nodes
>> + * @parent: parent device node to which the sub-tree will attach
>> + * @blob: flat device tree blob representing the sub-tree
>> + *
>> + * Copy over the FDT blob, which passed from firmware, and then
>> + * unflatten the sub-tree.
>> + */
>> +void of_fdt_add_subtree(struct device_node *parent, void *blob)
>> +{
>> +       int start = 0;
>> +
>> +       /* Validate the header */
>> +       if (!blob || fdt_check_header(blob)) {
>> +               pr_err("%s: Invalid device-tree blob header at 0x%p\n",
>> +                      __func__, blob);
>> +               return;
>> +       }
>> +
>> +       /* Free the flat blob for last time lazily */
>> +       if (parent->subtree) {
>> +               kfree(parent->subtree);
>> +               parent->subtree = NULL;
>> +       }
>> +
>> +       /* Copy over the flat blob */
>> +       parent->subtree = kzalloc(fdt_totalsize(blob), GFP_KERNEL);
>> +       if (!parent->subtree) {
>> +               pr_err("%s: Cannot copy over device-tree blob\n",
>> +                      __func__);
>> +               return;
>> +       }
>> +
>> +       memcpy(parent->subtree, blob, fdt_totalsize(blob));
>> +
>> +       /* Unflatten it */
>> +       mutex_lock(&of_mutex);
>> +       cur_node_depth = 1;
>> +       unflatten_dt_node(parent->subtree, NULL, &start, parent, NULL,
>> +                         strlen(parent->full_name), false, true);
>> +       populate_sysfs_for_child_nodes(parent);
>> +       mutex_unlock(&of_mutex);
>> +}
>> +EXPORT_SYMBOL(of_fdt_add_subtree);
>> +
>>  /* Everything below here references initial_boot_params directly. */
>>  int __initdata dt_root_addr_cells;
>>  int __initdata dt_root_size_cells;
>> diff --git a/include/linux/of.h b/include/linux/of.h
>> index ddeaae6..ac50b02 100644
>> --- a/include/linux/of.h
>> +++ b/include/linux/of.h
>> @@ -60,6 +60,7 @@ struct device_node {
>>         struct  device_node *sibling;
>>         struct  kobject kobj;
>>         unsigned long _flags;
>> +       void    *subtree;
>>         void    *data;
>>  #if defined(CONFIG_SPARC)
>>         const char *path_component_name;
>> @@ -222,6 +223,7 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size)
>>  #define OF_DETACHED    2 /* node has been detached from the device tree */
>>  #define OF_POPULATED   3 /* device already created for the node */
>>  #define OF_POPULATED_BUS       4 /* of_platform_populate recursed to children of this node */
>> +#define OF_DYNAMIC_HYBIRD      5 /* similar to OF_DYNAMIC, but partially */
>>
>>  #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
>>  #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
>> diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
>> index 587ee50..1fb47d7 100644
>> --- a/include/linux/of_fdt.h
>> +++ b/include/linux/of_fdt.h
>> @@ -39,6 +39,7 @@ extern int of_fdt_match(const void *blob, unsigned long node,
>>                         const char *const *compat);
>>  extern void of_fdt_unflatten_tree(unsigned long *blob,
>>                                struct device_node **mynodes);
>> +extern void of_fdt_add_subtree(struct device_node *parent, void *blob);
>>
>>  /* TBD: Temporary export of fdt globals - remove when code fully merged */
>>  extern int __initdata dt_root_addr_cells;
>> --
>> 2.1.0
>>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan May 4, 2015, 12:23 a.m. UTC | #8
On Sat, May 02, 2015 at 09:29:36AM +1000, Benjamin Herrenschmidt wrote:
>On Sat, 2015-05-02 at 08:57 +1000, Benjamin Herrenschmidt wrote:
>
>> > Overlay = an FDT blob to graft into a live running system. Sounds like
>> > the same thing.
>> > 
>> > > As for the details of Gavin implementation, I haven't looked at it in
>> > > details yet so there might be issues there, however I don't know what
>> > > you mean by "overlays", any pointer ?
>> > 
>> > CONFIG_OF_OVERLAY
>> > 
>> > http://events.linuxfoundation.org/sites/events/files/slides/dynamic-dt-keynote-v3.pdf
>> 
>> Well, that looks horrendously complicated, poorly documented and totally
>> unused in-tree outside of the unittest stuff, yay ! It has all sort of
>> "features" that I don't really care about.
>
>Looking a bit more at it, I don't quite see how I can attach a subtree
>using that stuff.
>
>Instead, each node in the overlay seems to need extra nodes and
>properties to refer to the original.
>
>So the FW would essentially have to create something a lot more complex
>than just reflattening a bit of its internal tree. For each internal
>node, it will need to add all those __overlay__ nodes and properties.
>
>That is not going to fly for me at all. It's order of magnitudes more
>complex than the solution we are pursuing.
>
>So I think for our use case, we should continue in the direction of
>having a helper to unflatten a piece of FDT underneath an existing
>node. I don't like the "HYBRID" stuff though, we should not refer to
>the original FDT, we should just make them normal dynamic nodes.
>

The original FDT from firmware is copied over to the memory chunk
allocated from slab by kernel. So we refer to the copy of the FDT,
not original one. Yeah, "HYBRID" wouldn't be a good idea. If we
want make all device nodes and properties of the sub-tree "DYNAMIC",
the FDT hasn't to be copied over from skiboot to kernel, indicating
those dynamic device nodes and properties in the subtree can be
figured out directly from the FDT blob, which is owned by firmware.

Also, it just need small changes to code what we have. Not too much
changes needed.

Thanks,
Gavin

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan May 4, 2015, 1:30 a.m. UTC | #9
On Sat, May 02, 2015 at 12:48:26PM +1000, Benjamin Herrenschmidt wrote:
>On Sat, 2015-05-02 at 09:29 +1000, Benjamin Herrenschmidt wrote:
>
>> Looking a bit more at it, I don't quite see how I can attach a subtree
>> using that stuff.
>> 
>> Instead, each node in the overlay seems to need extra nodes and
>> properties to refer to the original.
>> 
>> So the FW would essentially have to create something a lot more complex
>> than just reflattening a bit of its internal tree. For each internal
>> node, it will need to add all those __overlay__ nodes and properties.
>> 
>> That is not going to fly for me at all. It's order of magnitudes more
>> complex than the solution we are pursuing.
>> 
>> So I think for our use case, we should continue in the direction of
>> having a helper to unflatten a piece of FDT underneath an existing
>> node. I don't like the "HYBRID" stuff though, we should not refer to
>> the original FDT, we should just make them normal dynamic nodes.

Just took a close look on the overlay code. Hopefully I understand
how it works completely. Yeah, there is one questions according to my
understanding. The "overlay" device node should have been in child list
of the device node, who also has the indicator to "target" node. That
means some one else has to create "overlay" node and figure out the
"target" node in advance, then invokes overlay module to apply the
changes. From this perspective, the mechanism is something used to
apply the changes to device-tree, not parsing and create device nodes
from input. It does gurantee all the changes will be applied or none
of them. So I agree on what Ben suggested: to continue the direction
of having a helper to unflatten FDT blobk underneath the existing node,
and "HYBRID" should be replaced with "OF_DYNAMIC".

>
>A bit more thought... if we were to use the overlay stuff, Gavin, what
>we *could* do is add to OPAL FW internal representation a generation
>count to every node and property.
>
>That way we could essentially know whenever something's changed from
>what we flattened originally for the kernel.
>
>We can then create a generic (not PCI specific) call that generates
>an overlay tree for every node and property that has a generation
>count that is newer than what was flattened (or passed by the OS).
>
>It's still a LOT more complex than what we need though...
>

Thanks, Ben. If we really need utilize overlay to support our case,
we need some one to parse the input (device-tree changes) from firmware
and create "overlay" device node and "target" node as I mentioned above.
It's not simpler than the way we had to support our case. I'm not sure
if we really need utilize overlay for our case.

Thanks,
Gavin

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 4, 2015, 4:51 a.m. UTC | #10
On Mon, 2015-05-04 at 11:30 +1000, Gavin Shan wrote:
> Thanks, Ben. If we really need utilize overlay to support our case,
> we need some one to parse the input (device-tree changes) from
> firmware
> and create "overlay" device node and "target" node as I mentioned
> above.
> It's not simpler than the way we had to support our case. I'm not sure
> if we really need utilize overlay for our case.

No, if we decide to go down that path, then the FW needs to create
the overlay.

This could be done by having some kind of versioning to all nodes and
properties using a global generation count.

Ie, if we "know" what we passed to Linux, we can generate an overlay
that contains everything that changed since then using the version
numbers.

However, we should probably encode the version in the tree itself and
have specific APIs to retrieve "from" a given version to properly deal
with kexec'ing a kernel since in that case, the new kernel will have
something that isn't version 0 but version N where N is the latest
applied overlay.

Also I don't know how removing nodes works with overlay. IE the overlay
system is designed around the idea of removing the overlay to retrieve
the original tree.

In our case, our overlays are meant to be fully committed, and I don't
know whether there's a way to keep track. On unplug, we will just remove
all the nodes below the slot.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pantelis Antoniou May 4, 2015, 4:41 p.m. UTC | #11
Hi Ben,

> On May 2, 2015, at 01:57 , Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> On Fri, 2015-05-01 at 13:46 -0500, Rob Herring wrote:
>> On Fri, May 1, 2015 at 10:22 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>>> On Fri, 2015-05-01 at 07:54 -0500, Rob Herring wrote:
>>> 
>>>> The difference seems to be whether you allocate space or just point to
>>>> the FDT for various strings/data. Is that right?
>>>> 
>>>>>   * of_fdt_add_subtree() is the introduced API to do the work.
>>>> 
>>>> Have you looked at overlays and if so why do they not work for your purposes?
>>>> 
>>>> Why do you need to do this with the flattened tree?
>>> 
>>> The basic idea I asked Gavin to implement is that since the FW needs to
>>> provide a bunch of DT updates to Linux at runtime in the form of new
>>> nodes below an existing one, rather than doing it via some new/custom
>>> format, instead, have it send a bit of FDT blob to expand under an
>>> existing node.
>> 
>> Overlay = an FDT blob to graft into a live running system. Sounds like
>> the same thing.
>> 
>>> As for the details of Gavin implementation, I haven't looked at it in
>>> details yet so there might be issues there, however I don't know what
>>> you mean by "overlays", any pointer ?
>> 
>> CONFIG_OF_OVERLAY
>> 
>> http://events.linuxfoundation.org/sites/events/files/slides/dynamic-dt-keynote-v3.pdf
> 
> Well, that looks horrendously complicated, poorly documented and totally
> unused in-tree outside of the unittest stuff, yay ! It has all sort of
> "features" that I don't really care about.
> 

If it was easy to get stuff in, it would get more of the real-use drivers
in.

> I still don't see what it buys me other than making my FW a lot more
> complex having to generate all that additional fixup etc... crap that I
> don't totally get yet.
> 

You don’t generate any additional fixups. You just compile with the option
that generates all the fixups for you.

> What's wrong with just unflattening the nodes in place ? The DT comes
> from the FW in the first place so all the phandles are already good in
> the new added blob. Internally, the FW created new nodes in its internal
> representation and flattened the subtree and sends that subtree to
> Linux.
> 
> I don't plan to play "revert" either, if you unplug, I do need to remove
> what's under the slot but that's true of boot time devices, not just
> "new" ones, so the overlay stuff won't do the trick and I certainly
> don't want to keep track…
> 

You get all of the corner cases handled for free. Perhaps it works for your
case too.

Perhaps you can educate me on what you need supported and we can make sure
it’s included.

> Ben.
> 
> 

Regards

— Pantelis

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 4, 2015, 9:14 p.m. UTC | #12
On Mon, 2015-05-04 at 19:41 +0300, Pantelis Antoniou wrote:
> 
> You get all of the corner cases handled for free. Perhaps it works for
> your case too.
> 
> Perhaps you can educate me on what you need supported and we can make
> sure it’s included.

Which corner cases ?

IE, what I want is simply "update" the device-tree below a PCIe slot on
PCI hotplug.

The DT isn't "compiled" from a dts (it's amazing how many people seem to
believe this is the only way you get fdt's nowadays). It's dynamically
(ie programatically) generated by firmware at boot time and contains
whatever PCIe devices happen to be plugged during boot.

When doing PCIe hotplug operations, the kernel does various FW calls
(among others to control slot power), and during these, the FW re-probes
underneath the slot and refreshes its internal representation. So the
phandles remain fully consistent, there is no fixup needed.

We want the kernel to also update his copy as wee in order to avoid
keeping stale nodes that don't match what's there anymore. Also, when
plugging specific kind of IO drawers, the FW can provide additional node
and properties that will be used to control slots inside the drawers.

So what we need is:

  - On PCIe unplug, remove all old nodes below the slot
  - On PCIe plug, get all the new nodes from FW

Note that there is no need to do anything like platform device probing
etc... the PCI layer takes care of that, we will remove the old nodes
after the pci_dev are gone and create the new ones before Linux
re-probes the PCIe bus subtree.

So what we need is very simple: The removal can be handled without FW
help, and the plug case is a matter of just transferring all those new
nodes to Linux to re-expand.

Since the phandle etc... are all consistent with the original tree,
there is no fixups required.

So the "trivial" way to do it (and the way we have implemented the FW
side so far) is to have the FW simply "flatten" the subtree below the
slot and pass it to Linux, with the intent of expanding it back below
the slot node.

This is what Gavin proposed patches do.

The overlay mechanism adds all sorts of features that we don't seen to
need and would make the above more complex.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 13, 2015, 11:35 p.m. UTC | #13
On Tue, 2015-05-05 at 07:14 +1000, Benjamin Herrenschmidt wrote:
> So the "trivial" way to do it (and the way we have implemented the FW
> side so far) is to have the FW simply "flatten" the subtree below the
> slot and pass it to Linux, with the intent of expanding it back below
> the slot node.
> 
> This is what Gavin proposed patches do.
> 
> The overlay mechanism adds all sorts of features that we don't seen to
> need and would make the above more complex.

Guys, I never got a final answer from you on this. Are we ok with adding
the way to just expand a subtree or are you insistent we need to use the
overlap mechanism ?

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rob Herring May 14, 2015, 12:18 a.m. UTC | #14
On Wed, May 13, 2015 at 6:35 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Tue, 2015-05-05 at 07:14 +1000, Benjamin Herrenschmidt wrote:
>> So the "trivial" way to do it (and the way we have implemented the FW
>> side so far) is to have the FW simply "flatten" the subtree below the
>> slot and pass it to Linux, with the intent of expanding it back below
>> the slot node.
>>
>> This is what Gavin proposed patches do.
>>
>> The overlay mechanism adds all sorts of features that we don't seen to
>> need and would make the above more complex.
>
> Guys, I never got a final answer from you on this. Are we ok with adding
> the way to just expand a subtree or are you insistent we need to use the
> overlap mechanism ?

I haven't decided really.

The main thing with the current patch is I don't really like the added
complexity to unflatten_dt_node. It is already a fairly complex
function. Perhaps removing of "hybrid" as discussed will help?

If there are things we can do to make overlays easier to use in your
use case, I'd like to hear ideas. I don't really buy that being more
complex than needed is an obstacle. That is very often the case to
have common, scale-able solutions. I want to see a simple case be
simple to support.

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 14, 2015, 12:54 a.m. UTC | #15
On Wed, 2015-05-13 at 19:18 -0500, Rob Herring wrote:

> I haven't decided really.
> 
> The main thing with the current patch is I don't really like the added
> complexity to unflatten_dt_node. It is already a fairly complex
> function. Perhaps removing of "hybrid" as discussed will help?

I agree, we should be able to make that much simpler, I was planning on
sorting that out with Gavin.

> If there are things we can do to make overlays easier to use in your
> use case, I'd like to hear ideas. I don't really buy that being more
> complex than needed is an obstacle. That is very often the case to
> have common, scale-able solutions. I want to see a simple case be
> simple to support.

Well, it's a LOT more complex from the FW perspective for a bunch of
features we don't really need, in a way because the DT update in our
case is just purely informational to avoid keeping wrong/outdated DT
bits, it has little functional impact (it might have a bit for interrupt
routing through bridges though).

However, I am also pursuing an approach on FW side using a generation
count in our nodes and properties which we could use to generate
arbitrary overlays if we know what generation linux has.

There might actual be a usage scenario for a generic way for our
firwmare to convey DT updates to Linux for other reasons.

A few things that I don't find in the overlay code (but maybe I haven't
looked at it hard enough):

 - Can it remove nodes/properties ?

 - Can it "commit" a changeset so it's permanently part of the main DT ?
We will never have a concept of "revertable" changesets, if we need a
subsequent update, we will get a new overlay from FW that will remove
what needs to be removed and add what needs to be added.

IE, our current mechanism without overlay is fairly simple:

  - On PCI unplug, we remove all nodes below the slot (from linux),
the FW does the equivalent internally.

  - On PCI re-plug, the FW internally builds new nodes and sends a
new subtree as an FDT that we can expand/attach.

Now we could consider that subtree as a changeset that can be undone,
but that wouldn't work for boot time. And subsequent updates wouldn't
have that concept of "undoing" anyway.

IE. conceptually, what overlays do today is quite rooted around the idea
of having a fixed "base" DT and some pre-compiled DTB overlays that
get added/removed. The design completely ignore the idea of a FW that
maintains a "live" tree which we want to keep in sync, which is what we
want to do here, or what we could do with a "live" open firmware
implementation.

Now we might be able to reconcile them, but it feels to me that the
overlay/changeset stuff is too rooted in the first concept...

Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pantelis Antoniou May 14, 2015, 6:23 a.m. UTC | #16
Hi Ben,

Sorry for taking this long to respond, but I am working on the same problem right
now. I thought I might have something to show, but not yet :)

My PCI overlay case is different. In my case there is no firmware and there
is the blob is provided as an overlay.

The idea is that for a given PCI bus, when a PCI device with a matching
device id, vendor id is probed a matching overlay should be applied.

The trickiness lies in the way that the way that the target is different
each time and how to handle generational issues (i.e. what happens if the pci device
is removed before the application of the overlay occurs, what happens when multiple
applications should happen in parallel, etc.)


> On May 14, 2015, at 03:54 , Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> On Wed, 2015-05-13 at 19:18 -0500, Rob Herring wrote:
> 
>> I haven't decided really.
>> 
>> The main thing with the current patch is I don't really like the added
>> complexity to unflatten_dt_node. It is already a fairly complex
>> function. Perhaps removing of "hybrid" as discussed will help?
> 
> I agree, we should be able to make that much simpler, I was planning on
> sorting that out with Gavin.
> 

I think using overlays should cover your case without any issues.
I don’t like messing with the unflatten method TBH.

>> If there are things we can do to make overlays easier to use in your
>> use case, I'd like to hear ideas. I don't really buy that being more
>> complex than needed is an obstacle. That is very often the case to
>> have common, scale-able solutions. I want to see a simple case be
>> simple to support.
> 
> Well, it's a LOT more complex from the FW perspective for a bunch of
> features we don't really need, in a way because the DT update in our
> case is just purely informational to avoid keeping wrong/outdated DT
> bits, it has little functional impact (it might have a bit for interrupt
> routing through bridges though).
> 
> However, I am also pursuing an approach on FW side using a generation
> count in our nodes and properties which we could use to generate
> arbitrary overlays if we know what generation linux has.
> 
> There might actual be a usage scenario for a generic way for our
> firwmare to convey DT updates to Linux for other reasons.
> 
> A few things that I don't find in the overlay code (but maybe I haven't
> looked at it hard enough):
> 
> - Can it remove nodes/properties ?
> 

Yes.

> - Can it "commit" a changeset so it's permanently part of the main DT ?
> We will never have a concept of "revertable" changesets, if we need a
> subsequent update, we will get a new overlay from FW that will remove
> what needs to be removed and add what needs to be added.
> 

The overlay when applied is a part of the kernel DT tree.
It is trivial to add a mechanism that simply commits everything and
tosses away the revert information.

Note that in that case you have to make provisions for the unflatten
blob to not be freed or for the device tree nodes/properties to be
dynamically allocated.

> IE, our current mechanism without overlay is fairly simple:
> 
>  - On PCI unplug, we remove all nodes below the slot (from linux),
> the FW does the equivalent internally.
> 

If you use an overlay, you just revert it and everything would
be as it was before, without anything hanging below the slot node.

Note that the ‘remove all nodes below the slot’ does not work for my case.

That is because there are devices being instantiated under the slot
(i2c busses, i2c devices, FPGAs etc) that need to be removed from the
system.

>  - On PCI re-plug, the FW internally builds new nodes and sends a
> new subtree as an FDT that we can expand/attach.
> 

You can easily send a DT blob containing an overlay from firmware.

It can be even easy, since you might not have to recreate the full blob
each time, but instead using flat device tree methods to populate the
few properties that change each time.

> Now we could consider that subtree as a changeset that can be undone,
> but that wouldn't work for boot time. And subsequent updates wouldn't
> have that concept of "undoing" anyway.
> 

I have posted another patch that does boot-time DT quirk which are
non-revertable.

https://lkml.org/lkml/2015/2/18/258

> IE. conceptually, what overlays do today is quite rooted around the idea
> of having a fixed "base" DT and some pre-compiled DTB overlays that
> get added/removed. The design completely ignore the idea of a FW that
> maintains a "live" tree which we want to keep in sync, which is what we
> want to do here, or what we could do with a "live" open firmware
> implementation.
> 
> Now we might be able to reconcile them, but it feels to me that the
> overlay/changeset stuff is too rooted in the first concept…
> 

The first DT overlays use case (beaglebone capes) is what got the concept
started.

Right now is a generic mechanism to apply modifications to the kernel
live tree, with the possibility to revert them.

> Ben.
> 
> 

Regards

— Pantelis

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 14, 2015, 6:46 a.m. UTC | #17
On Thu, 2015-05-14 at 09:23 +0300, Pantelis Antoniou wrote:

> > A few things that I don't find in the overlay code (but maybe I haven't
> > looked at it hard enough):
> > 
> > - Can it remove nodes/properties ?
> > 
> 
> Yes.

Ok, I've missed that when looking at the overlay code then, I'll have to
give it a closer look.

> > - Can it "commit" a changeset so it's permanently part of the main DT ?
> > We will never have a concept of "revertable" changesets, if we need a
> > subsequent update, we will get a new overlay from FW that will remove
> > what needs to be removed and add what needs to be added.
> > 
> 
> The overlay when applied is a part of the kernel DT tree.
> It is trivial to add a mechanism that simply commits everything and
> tosses away the revert information.
> 
> Note that in that case you have to make provisions for the unflatten
> blob to not be freed or for the device tree nodes/properties to be
> dynamically allocated.

I think it makes sense to do the dynamic thing anyway...

> > IE, our current mechanism without overlay is fairly simple:
> > 
> >  - On PCI unplug, we remove all nodes below the slot (from linux),
> > the FW does the equivalent internally.
> > 
> 
> If you use an overlay, you just revert it and everything would
> be as it was before, without anything hanging below the slot node.

Except that doesn't work for the boot time content which we get
from the firmware as part of the initial FDT (and we can't change that
without breaking backward compatibility).

> Note that the ‘remove all nodes below the slot’ does not work for my case.
> 
> That is because there are devices being instantiated under the slot
> (i2c busses, i2c devices, FPGAs etc) that need to be removed from the
> system.

Right while in my case, there isn't, it's just the standard OF PCI
representation generated by FW, the main thing is that it might have
some enriched properties for some known cable cards of external drawers
that are good to have.

> >  - On PCI re-plug, the FW internally builds new nodes and sends a
> > new subtree as an FDT that we can expand/attach.
> > 
> 
> You can easily send a DT blob containing an overlay from firmware.

Sending one is easy. Building it is not :-)

> It can be even easy, since you might not have to recreate the full blob
> each time, but instead using flat device tree methods to populate the
> few properties that change each time.

No, we basically have our internal tree in the firmware in a format
similar to Linux, ie, a pointer based tree. We can "flatten" it of
course, but generating an overlay is trickier. We can, it's just more
work and we are running out of time (I basically have to cut that FW in
the next few days, then we'll be stuck with whatever interfaces we
created, I have a big of time to fix bugs after that but that's about
it).

> > Now we could consider that subtree as a changeset that can be undone,
> > but that wouldn't work for boot time. And subsequent updates wouldn't
> > have that concept of "undoing" anyway.
> > 
> 
> I have posted another patch that does boot-time DT quirk which are
> non-revertable.
> 
> https://lkml.org/lkml/2015/2/18/258

Not sure how that applies in my case ... I can't change the
representation of the PCI subtree, this is standard OFW representation,
I can't change the FW to make it an overlay-like thing at boot time,
that would break existing kernels.

> > IE. conceptually, what overlays do today is quite rooted around the idea
> > of having a fixed "base" DT and some pre-compiled DTB overlays that
> > get added/removed. The design completely ignore the idea of a FW that
> > maintains a "live" tree which we want to keep in sync, which is what we
> > want to do here, or what we could do with a "live" open firmware
> > implementation.
> > 
> > Now we might be able to reconcile them, but it feels to me that the
> > overlay/changeset stuff is too rooted in the first concept…
> > 
> 
> The first DT overlays use case (beaglebone capes) is what got the concept
> started.
> 
> Right now is a generic mechanism to apply modifications to the kernel
> live tree, with the possibility to revert them.

Yes but as I said it's not really thought in term of keeping the kernel
tree in sync with an external dynamically generated tree. Maybe we can
fix it, but it's more complex...

Ben.

> > Ben.
> > 
> > 
> 
> Regards
> 
> — Pantelis


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pantelis Antoniou May 14, 2015, 7:04 a.m. UTC | #18
Hi Ben,

> On May 14, 2015, at 09:46 , Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> On Thu, 2015-05-14 at 09:23 +0300, Pantelis Antoniou wrote:
> 
>>> A few things that I don't find in the overlay code (but maybe I haven't
>>> looked at it hard enough):
>>> 
>>> - Can it remove nodes/properties ?
>>> 
>> 
>> Yes.
> 
> Ok, I've missed that when looking at the overlay code then, I'll have to
> give it a closer look.
> 

Ok, let me be more specific. It used to be able to do it ;)
The problem was the format used (a ‘-‘ prefix to the name).

Since I didn’t have clear use for it, I was asked to drop it by Grant.

The removal capability is of-course there for the revert case.

>>> - Can it "commit" a changeset so it's permanently part of the main DT ?
>>> We will never have a concept of "revertable" changesets, if we need a
>>> subsequent update, we will get a new overlay from FW that will remove
>>> what needs to be removed and add what needs to be added.
>>> 
>> 
>> The overlay when applied is a part of the kernel DT tree.
>> It is trivial to add a mechanism that simply commits everything and
>> tosses away the revert information.
>> 
>> Note that in that case you have to make provisions for the unflatten
>> blob to not be freed or for the device tree nodes/properties to be
>> dynamically allocated.
> 
> I think it makes sense to do the dynamic thing anyway...
> 
>>> IE, our current mechanism without overlay is fairly simple:
>>> 
>>> - On PCI unplug, we remove all nodes below the slot (from linux),
>>> the FW does the equivalent internally.
>>> 
>> 
>> If you use an overlay, you just revert it and everything would
>> be as it was before, without anything hanging below the slot node.
> 
> Except that doesn't work for the boot time content which we get
> from the firmware as part of the initial FDT (and we can't change that
> without breaking backward compatibility).
> 

OK

>> Note that the ‘remove all nodes below the slot’ does not work for my case.
>> 
>> That is because there are devices being instantiated under the slot
>> (i2c busses, i2c devices, FPGAs etc) that need to be removed from the
>> system.
> 
> Right while in my case, there isn't, it's just the standard OF PCI
> representation generated by FW, the main thing is that it might have
> some enriched properties for some known cable cards of external drawers
> that are good to have.
> 

I see.

>>> - On PCI re-plug, the FW internally builds new nodes and sends a
>>> new subtree as an FDT that we can expand/attach.
>>> 
>> 
>> You can easily send a DT blob containing an overlay from firmware.
> 
> Sending one is easy. Building it is not :-)
> 

Heh, true ;)

>> It can be even easy, since you might not have to recreate the full blob
>> each time, but instead using flat device tree methods to populate the
>> few properties that change each time.
> 
> No, we basically have our internal tree in the firmware in a format
> similar to Linux, ie, a pointer based tree. We can "flatten" it of
> course, but generating an overlay is trickier. We can, it's just more
> work and we are running out of time (I basically have to cut that FW in
> the next few days, then we'll be stuck with whatever interfaces we
> created, I have a big of time to fix bugs after that but that's about
> it).
> 

Hmm, since you just want to transmit a whole subtree things are a bit
simpler.

You don’t need any of the fixups, and your target node is known.

So your overlay is simply:

/ {
	fragment@0 {
		target-path = “/foo”;
		__overlay__ {
			/* contents of the slot */
		};
	}; 
};

I think it’s possible to just bit-mangle a blob (in pseudo code).

	const u8 template_overlay_blob[] = { <compiled blob of the above> };

	flatten_slot(slot_blob);

	overlay_blob = allocate_new_blob(template_overlay_blob, slot_blob);

	overlay_node = find_node(overlay_blob, “/fragment@0/__overlay__);
	target_prop = find_prop(overlay_blob, “/fragment@0/target-path”);

	inject_slot_blob(overlay_blob, overlay_node, slot_blob);
	modify_slot_target(overlay_blob, target_prop, slot_target);
	
I don’t think you need to re-flatten anything, shuffling bits around with
memmove should work.

>>> Now we could consider that subtree as a changeset that can be undone,
>>> but that wouldn't work for boot time. And subsequent updates wouldn't
>>> have that concept of "undoing" anyway.
>>> 
>> 
>> I have posted another patch that does boot-time DT quirk which are
>> non-revertable.
>> 
>> https://lkml.org/lkml/2015/2/18/258
> 
> Not sure how that applies in my case ... I can't change the
> representation of the PCI subtree, this is standard OFW representation,
> I can't change the FW to make it an overlay-like thing at boot time,
> that would break existing kernels.
> 

The idea is to append the ‘quirk’ to the already booting device tree blob.

Another idea floating around was to simple concatenate the booting blob with
any overlay blobs you want applied at boot time.

>>> IE. conceptually, what overlays do today is quite rooted around the idea
>>> of having a fixed "base" DT and some pre-compiled DTB overlays that
>>> get added/removed. The design completely ignore the idea of a FW that
>>> maintains a "live" tree which we want to keep in sync, which is what we
>>> want to do here, or what we could do with a "live" open firmware
>>> implementation.
>>> 
>>> Now we might be able to reconcile them, but it feels to me that the
>>> overlay/changeset stuff is too rooted in the first concept…
>>> 
>> 
>> The first DT overlays use case (beaglebone capes) is what got the concept
>> started.
>> 
>> Right now is a generic mechanism to apply modifications to the kernel
>> live tree, with the possibility to revert them.
> 
> Yes but as I said it's not really thought in term of keeping the kernel
> tree in sync with an external dynamically generated tree. Maybe we can
> fix it, but it's more complex…
> 

Yes it is, unfortunately.

> Ben.
> 
>>> Ben.
>>> 
>>> 
>> 
>> Regards
>> 
>> — Pantelis
> 
> 

Regards

— Pantelis

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 14, 2015, 7:14 a.m. UTC | #19
On Thu, 2015-05-14 at 10:04 +0300, Pantelis Antoniou wrote:

> Hmm, since you just want to transmit a whole subtree things are a bit
> simpler.
> 
> You don’t need any of the fixups, and your target node is known.
> 
> So your overlay is simply:
> 
> / {
> 	fragment@0 {
> 		target-path = “/foo”;
> 		__overlay__ {
> 			/* contents of the slot */
> 		};
> 	}; 
> };
>
> I think it’s possible to just bit-mangle a blob (in pseudo code).
> 
> 	const u8 template_overlay_blob[] = { <compiled blob of the above> };
> 
> 	flatten_slot(slot_blob);
> 
> 	overlay_blob = allocate_new_blob(template_overlay_blob, slot_blob);
> 
> 	overlay_node = find_node(overlay_blob, “/fragment@0/__overlay__);
> 	target_prop = find_prop(overlay_blob, “/fragment@0/target-path”);
> 
> 	inject_slot_blob(overlay_blob, overlay_node, slot_blob);
> 	modify_slot_target(overlay_blob, target_prop, slot_target);
> 	
> I don’t think you need to re-flatten anything, shuffling bits around with
> memmove should work.

Fairly gross :-)

But yeah generating the overlay doesn't necessarily scare me, I can
generate a temp tree that is the overlay in which I "copy" the subtree
(or in my internal ptr-based representation I could have a concept of
alias which I follow while flattening).

That leaves me with these problems:

 - No support for removing of nodes, so that needs to be added back to
the format and to Linux unless I continue removing by hand in the PCI
hotplug code itself

 - No support for "committing" the overlay which needs to be added as
well.

> >>> Now we could consider that subtree as a changeset that can be undone,
> >>> but that wouldn't work for boot time. And subsequent updates wouldn't
> >>> have that concept of "undoing" anyway.
> >>> 
> >> 
> >> I have posted another patch that does boot-time DT quirk which are
> >> non-revertable.
> >> 
> >> https://lkml.org/lkml/2015/2/18/258
> > 
> > Not sure how that applies in my case ... I can't change the
> > representation of the PCI subtree, this is standard OFW representation,
> > I can't change the FW to make it an overlay-like thing at boot time,
> > that would break existing kernels.
> > 
> 
> The idea is to append the ‘quirk’ to the already booting device tree blob.

I know but that's not how things work for me. At boot time the FW passes
me one tree that contains all the PCI stuff it has probed.

> Another idea floating around was to simple concatenate the booting blob with
> any overlay blobs you want applied at boot time.

Sure but I don't get overlay blobs at boot time.

> >>> IE. conceptually, what overlays do today is quite rooted around the idea
> >>> of having a fixed "base" DT and some pre-compiled DTB overlays that
> >>> get added/removed. The design completely ignore the idea of a FW that
> >>> maintains a "live" tree which we want to keep in sync, which is what we
> >>> want to do here, or what we could do with a "live" open firmware
> >>> implementation.
> >>> 
> >>> Now we might be able to reconcile them, but it feels to me that the
> >>> overlay/changeset stuff is too rooted in the first concept…
> >>> 
> >> 
> >> The first DT overlays use case (beaglebone capes) is what got the concept
> >> started.
> >> 
> >> Right now is a generic mechanism to apply modifications to the kernel
> >> live tree, with the possibility to revert them.
> > 
> > Yes but as I said it's not really thought in term of keeping the kernel
> > tree in sync with an external dynamically generated tree. Maybe we can
> > fix it, but it's more complex…
> > 
> 
> Yes it is, unfortunately.

Right. Which makes the solution of just passing my bit of tree as a blob
which I expand in Linux where I want it rather than an overlay tempting
if we can make Gavin patch more palatable (removing the hybrid stuff
etc...).

Cheers,
Ben.

> > Ben.
> > 
> >>> Ben.
> >>> 
> >>> 
> >> 
> >> Regards
> >> 
> >> — Pantelis
> > 
> > 
> 
> Regards
> 
> — Pantelis


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pantelis Antoniou May 14, 2015, 7:19 a.m. UTC | #20
Hi Ben,

> On May 14, 2015, at 10:14 , Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> On Thu, 2015-05-14 at 10:04 +0300, Pantelis Antoniou wrote:
> 
>> Hmm, since you just want to transmit a whole subtree things are a bit
>> simpler.
>> 
>> You don’t need any of the fixups, and your target node is known.
>> 
>> So your overlay is simply:
>> 
>> / {
>> 	fragment@0 {
>> 		target-path = “/foo”;
>> 		__overlay__ {
>> 			/* contents of the slot */
>> 		};
>> 	}; 
>> };
>> 
>> I think it’s possible to just bit-mangle a blob (in pseudo code).
>> 
>> 	const u8 template_overlay_blob[] = { <compiled blob of the above> };
>> 
>> 	flatten_slot(slot_blob);
>> 
>> 	overlay_blob = allocate_new_blob(template_overlay_blob, slot_blob);
>> 
>> 	overlay_node = find_node(overlay_blob, “/fragment@0/__overlay__);
>> 	target_prop = find_prop(overlay_blob, “/fragment@0/target-path”);
>> 
>> 	inject_slot_blob(overlay_blob, overlay_node, slot_blob);
>> 	modify_slot_target(overlay_blob, target_prop, slot_target);
>> 	
>> I don’t think you need to re-flatten anything, shuffling bits around with
>> memmove should work.
> 
> Fairly gross :-)
> 

You don’t want to know how sausages are made, but they are delicious :)

> But yeah generating the overlay doesn't necessarily scare me, I can
> generate a temp tree that is the overlay in which I "copy" the subtree
> (or in my internal ptr-based representation I could have a concept of
> alias which I follow while flattening).
> 
> That leaves me with these problems:
> 
> - No support for removing of nodes, so that needs to be added back to
> the format and to Linux unless I continue removing by hand in the PCI
> hotplug code itself
> 

What kind of nodes/properties you need to remove at _application_ time?

What you describe is inserting a bunch of properties and nodes under
a slot’s device node. Reverting the overlay removes them all just fine.

> - No support for "committing" the overlay which needs to be added as
> well.
> 

That’s the easiest part.

>>>>> Now we could consider that subtree as a changeset that can be undone,
>>>>> but that wouldn't work for boot time. And subsequent updates wouldn't
>>>>> have that concept of "undoing" anyway.
>>>>> 
>>>> 
>>>> I have posted another patch that does boot-time DT quirk which are
>>>> non-revertable.
>>>> 
>>>> https://lkml.org/lkml/2015/2/18/258
>>> 
>>> Not sure how that applies in my case ... I can't change the
>>> representation of the PCI subtree, this is standard OFW representation,
>>> I can't change the FW to make it an overlay-like thing at boot time,
>>> that would break existing kernels.
>>> 
>> 
>> The idea is to append the ‘quirk’ to the already booting device tree blob.
> 
> I know but that's not how things work for me. At boot time the FW passes
> me one tree that contains all the PCI stuff it has probed.
> 
>> Another idea floating around was to simple concatenate the booting blob with
>> any overlay blobs you want applied at boot time.
> 
> Sure but I don't get overlay blobs at boot time.
> 
>>>>> IE. conceptually, what overlays do today is quite rooted around the idea
>>>>> of having a fixed "base" DT and some pre-compiled DTB overlays that
>>>>> get added/removed. The design completely ignore the idea of a FW that
>>>>> maintains a "live" tree which we want to keep in sync, which is what we
>>>>> want to do here, or what we could do with a "live" open firmware
>>>>> implementation.
>>>>> 
>>>>> Now we might be able to reconcile them, but it feels to me that the
>>>>> overlay/changeset stuff is too rooted in the first concept…
>>>>> 
>>>> 
>>>> The first DT overlays use case (beaglebone capes) is what got the concept
>>>> started.
>>>> 
>>>> Right now is a generic mechanism to apply modifications to the kernel
>>>> live tree, with the possibility to revert them.
>>> 
>>> Yes but as I said it's not really thought in term of keeping the kernel
>>> tree in sync with an external dynamically generated tree. Maybe we can
>>> fix it, but it's more complex…
>>> 
>> 
>> Yes it is, unfortunately.
> 
> Right. Which makes the solution of just passing my bit of tree as a blob
> which I expand in Linux where I want it rather than an overlay tempting
> if we can make Gavin patch more palatable (removing the hybrid stuff
> etc…)
> .
> 

I see. Well, how about this?

Who said you have to do the whole blob dance in the firmware?

You can just as easily pass the blob as it is to the linux kernel and
the kernel there can convert it to an overlay and apply it.

> Cheers,
> Ben.
> 
>>> Ben.
>>> 
>>>>> Ben.
>>>>> 
>>>>> 
>>>> 
>>>> Regards
>>>> 
>>>> — Pantelis
>>> 
>>> 
>> 
>> Regards
>> 
>> — Pantelis
> 
> 

Regards

— Pantelis

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 14, 2015, 7:25 a.m. UTC | #21
On Thu, 2015-05-14 at 10:19 +0300, Pantelis Antoniou wrote:

> 
> You don’t want to know how sausages are made, but they are delicious :)

 ... most of the time :)

> > But yeah generating the overlay doesn't necessarily scare me, I can
> > generate a temp tree that is the overlay in which I "copy" the subtree
> > (or in my internal ptr-based representation I could have a concept of
> > alias which I follow while flattening).
> > 
> > That leaves me with these problems:
> > 
> > - No support for removing of nodes, so that needs to be added back to
> > the format and to Linux unless I continue removing by hand in the PCI
> > hotplug code itself
> > 
> 
> What kind of nodes/properties you need to remove at _application_ time?

Well, if we stick to removing by hand in Linux for the unplug case, then
none.

> What you describe is inserting a bunch of properties and nodes under
> a slot’s device node. Reverting the overlay removes them all just fine.

Except that still doesn't work for boot time :-)

So I would have to do a special case on unplug:

	if (slot->dt_is_overlay) /* set to false at boot */
		remove_subtree_myself();
	else
		undo_overlay(slot->overlay);

> > - No support for "committing" the overlay which needs to be added as
> > well.
> > 
> 
> That’s the easiest part.

Yeah, I will need to get my head around the code a bit more but it
doesn't seem too scary.

> I see. Well, how about this?
> 
> Who said you have to do the whole blob dance in the firmware?
> 
> You can just as easily pass the blob as it is to the linux kernel and
> the kernel there can convert it to an overlay and apply it.

That's not that pretty but we can do that too which solve the problem of
fixing the FW interface.

There is however an argument to be made in having the FW be able to
generate arbitrary overlays. If we ever want to pass more "property"
updates or node updates to Linux at runtime.

A few cases have crept up on the radar, like updating the pstate tables
or VPD informations ...

If we go down that path, then I would implement a concept of generation
count in the firmware, so I can generate an overlay that include all the
changes since the last "generation" given to Linux.

However that requires supporting removal of nodes/properties. So I'm
tempted to keep that feature on the back burner and go with an ad-hoc
interface for PCI for now.

Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 14, 2015, 7:29 a.m. UTC | #22
> So I would have to do a special case on unplug:
> 
> 	if (slot->dt_is_overlay) /* set to false at boot */
> 		remove_subtree_myself();
> 	else
> 		undo_overlay(slot->overlay);

Of course I just inverted the polarity of the if () in the example :-)

But you get the idea...


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pantelis Antoniou May 14, 2015, 7:34 a.m. UTC | #23
Hi Ben,

> On May 14, 2015, at 10:25 , Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> On Thu, 2015-05-14 at 10:19 +0300, Pantelis Antoniou wrote:
> 
>> 
>> You don’t want to know how sausages are made, but they are delicious :)
> 
> ... most of the time :)
> 
>>> But yeah generating the overlay doesn't necessarily scare me, I can
>>> generate a temp tree that is the overlay in which I "copy" the subtree
>>> (or in my internal ptr-based representation I could have a concept of
>>> alias which I follow while flattening).
>>> 
>>> That leaves me with these problems:
>>> 
>>> - No support for removing of nodes, so that needs to be added back to
>>> the format and to Linux unless I continue removing by hand in the PCI
>>> hotplug code itself
>>> 
>> 
>> What kind of nodes/properties you need to remove at _application_ time?
> 
> Well, if we stick to removing by hand in Linux for the unplug case, then
> none.
> 

OK

>> What you describe is inserting a bunch of properties and nodes under
>> a slot’s device node. Reverting the overlay removes them all just fine.
> 
> Except that still doesn't work for boot time :-)
> 
> So I would have to do a special case on unplug:
> 
> 	if (slot->dt_is_overlay) /* set to false at boot */
> 		remove_subtree_myself();
> 	else
> 		undo_overlay(slot->overlay);
> 

OK, in that case you do require removal. But in any case it’s the ‘negative’
of an already applied one, either at boot time or not.

Modifying the overlay code to apply a ‘negative’ property should do the trick.

Is that correct?

>>> - No support for "committing" the overlay which needs to be added as
>>> well.
>>> 
>> 
>> That’s the easiest part.
> 
> Yeah, I will need to get my head around the code a bit more but it
> doesn't seem too scary.
> 
>> I see. Well, how about this?
>> 
>> Who said you have to do the whole blob dance in the firmware?
>> 
>> You can just as easily pass the blob as it is to the linux kernel and
>> the kernel there can convert it to an overlay and apply it.
> 
> That's not that pretty but we can do that too which solve the problem of
> fixing the FW interface.
> 
> There is however an argument to be made in having the FW be able to
> generate arbitrary overlays. If we ever want to pass more "property"
> updates or node updates to Linux at runtime.
> 
> A few cases have crept up on the radar, like updating the pstate tables
> or VPD informations ...
> 
> If we go down that path, then I would implement a concept of generation
> count in the firmware, so I can generate an overlay that include all the
> changes since the last "generation" given to Linux.
> 

I will probably need that generation count myself for my PCI use case.

> However that requires supporting removal of nodes/properties. So I'm
> tempted to keep that feature on the back burner and go with an ad-hoc
> interface for PCI for now.
> 

I see. Bonne chance :)

> Ben.
> 
> 

Regards

— Pantelis

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 14, 2015, 7:47 a.m. UTC | #24
On Thu, 2015-05-14 at 10:34 +0300, Pantelis Antoniou wrote:

> >> What you describe is inserting a bunch of properties and nodes under
> >> a slot’s device node. Reverting the overlay removes them all just fine.
> > 
> > Except that still doesn't work for boot time :-)
> > 
> > So I would have to do a special case on unplug:
> > 
> > 	if (slot->dt_is_overlay) /* set to false at boot */
> > 		remove_subtree_myself();
> > 	else
> > 		undo_overlay(slot->overlay);
> > 
> 
> OK, in that case you do require removal. But in any case it’s the ‘negative’
> of an already applied one, either at boot time or not.

Sort-of, unless we have a way in the overlay to simply specify node
removal statements so we don't have to explicitly remove all properties
(or even all children).

> Modifying the overlay code to apply a ‘negative’ property should do the trick.
> 
> Is that correct?

I would do negatives node and let Linux imply the properties (or even
children).

But yes, that would probably do.

 .../...

> I will probably need that generation count myself for my PCI use case.
> 
> > However that requires supporting removal of nodes/properties. So I'm
> > tempted to keep that feature on the back burner and go with an ad-hoc
> > interface for PCI for now.
> > 
> 
> I see. Bonne chance :)

Merci :)

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pantelis Antoniou May 14, 2015, 11:02 a.m. UTC | #25
Hi Ben,

> On May 14, 2015, at 10:47 , Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 

[snip]

So I spend some time thinking about your use case and I think it boils down
to this:

I have a live tree in the firmware, I have made changes and I need to reflect
those changes to the live tree in the kernel.

Sounds like ‘how do I generate a patch for getting those two in sync'. No?

I can see where this might be useful for others as all.

I think we really need to create a liblivedt like we have libfdt since
we have a number of projects going about using/manipulating DT at runtime.

1. The linux kernel, with it’s own live tree implementation.
2. The device tree compiler (it has a live tree) custom implemented.
3. Your weird and wonderful (or wacky) firmware.
4. u-boot does use DT now, but it does with libfdt. I believe this is suboptimal.
5. barebox does DT as well.

Most of what we want to do with DT can be abstracted in a library I think that
all of those projects can use.

What are your thoughts?

Regards

— Pantelis

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt May 14, 2015, 11:25 p.m. UTC | #26
On Thu, 2015-05-14 at 14:02 +0300, Pantelis Antoniou wrote:
> Hi Ben,
> 
> > On May 14, 2015, at 10:47 , Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > 
> 
> [snip]
> 
> So I spend some time thinking about your use case and I think it boils down
> to this:
> 
> I have a live tree in the firmware, I have made changes and I need to reflect
> those changes to the live tree in the kernel.
> 
> Sounds like ‘how do I generate a patch for getting those two in sync'. No?

More or less.

> I can see where this might be useful for others as all.
> 
> I think we really need to create a liblivedt like we have libfdt since
> we have a number of projects going about using/manipulating DT at runtime.
> 
> 1. The linux kernel, with it’s own live tree implementation.
> 2. The device tree compiler (it has a live tree) custom implemented.
> 3. Your weird and wonderful (or wacky) firmware.
> 4. u-boot does use DT now, but it does with libfdt. I believe this is suboptimal.
> 5. barebox does DT as well.
> 
> Most of what we want to do with DT can be abstracted in a library I think that
> all of those projects can use.
> 
> What are your thoughts?

Well, we have at least two implementations, the kernel one and the one
in our OPAL firmware:

https://github.com/open-power/skiboot/blob/master/include/device.h
https://github.com/open-power/skiboot/blob/master/core/device.c

The latter uses some nice Rusty tricks (tm) for multiple argument
functions.

It would make sense to do a library somewhere yes. However, I need to
cut my firmware API pretty much today so I think for now I'll stick
to something Ad-Hoc for the PCI hotplug code that just passes the
bit of FDT with the new devices and leave the "grand project" of live
sync of the tree for later.

There are other implementations of live DT in various Open Firmware
variants out there, most are in Forth which I suggest you don't bother
with unless you enjoy pain, but I think at least one of these is
actually in C.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan May 15, 2015, 1:27 a.m. UTC | #27
On Fri, May 01, 2015 at 04:03:06PM +1000, Gavin Shan wrote:
>The requirement is raised when developing the PCI hotplug feature
>for PowerPC PowerNV platform, which runs on top of skiboot firmware.
>When plugging PCI adapter to one PCI slot, the firmware rescans the
>slot and build FDT (Flat Device Tree) blob, which is sent to the
>PowerNV PCI hotplug driver for processing. The new constructed device
>nodes from the FDT blob are expected to be attached to the device
>node of the PCI slot. Unfortunately, it seems we don't have a API
>to support the scenario. The patch intends to support it by newly
>introduced function of_fdt_add_subtree(), the design behind it is
>shown as below:
>
>   * When the sub-tree FDT blob, which is owned by firmware, is
>     received by kernel. It's copied over to the blob, which is
>     dynamically allocated. Since then, the FDT blob owned by
>     firmware isn't touched.
>   * Rework unflatten_dt_node() so that the device nodes in current
>     and deeper depth have been constructed from the FDT blob. All
>     device nodes are marked with flag OF_DYNAMIC_HYBIRD, which is
>     similar to OF_DYNAMIC. However, device node with the flag set
>     can be free'd, but in the way other than that for OF_DYNAMIC
>     device nodes.
>   * of_fdt_add_subtree() is the introduced API to do the work.
>

There are already lots of discussion on how to reuse overlay for my case.
Thanks to all for your time on this. I spend some time thinking about it
last night and this morning. I would like to summarize it as below. It's
for sure almost all ideas coming from your guys and I'm just documenting
it. If there are obvious problems in the following summary, please let me
know so that I can fix them as early as possible to save more time.

================
SKIBOOT & KERNEL
================

The idea came from Ben and I'm following to implement it as follows:

- One counter is mantained: cur_counter = 0;
- PCI hot plugging happens happens as:
  * Kernel gets skiboot's cur_counter with OPAL API, which is (x).
  * Skiboot does hot plugging and rescans the slot, then populate
    the device-tree nodes with (++cur_counter), which means the
    counter is turned to (x+1).
  * Kernel retrieves the FDT overlay blob on the device-tree changes
    since last time/token ((x)). Kernel unflattens it and applies the
    changes by overlay. The slot simply records the overlay (IDR) ID
    for the device-tree change.
- PCI hot unplugging happens as:
  * Revert the changes simply if the slot had valid IDR ID. Otherwise,
    the device nodes are flatten during bootup time, we just remove them
    as we're doing now. Note that device nodes can't be free'd because
    the memory chunks consumed by them are allocated from memblock or
    reserved by skiboot.
- Some questions/problems:
  * I don't understand how kexec can figure out the device-tree with
    applied changes from overlay. I assume kexec is simply using the
    FDT blob from skiboot as seen by the first kernel during frest
    boot.

Kernel APIs:
  * id = of_overlay_create(); of_overlay_destroy(id)
Skiboot API:
  * int64_t opal_get_overlay_dt(uint64_t *counter, void *blob, uint32_t len)
    blob == NULL, get current counter.
    blob != NULL, get overlay blob since (*counter). Skiboot also
    returns the last counter.
    The memory chunk for "blob" is always owned by kernel, which doesn't know
    the memory size to hold the overlay FDT blob. So we have to try with
    discret 64KB, which is PAGE_SIZE. After the overlay blob is unflatten
    and applied, the memory chunk can be free. We don't have to keep it
    for reverting the changes introduced by the overlay.

=============================
OVERLAY FDT BLOB FROM SKIBOOT
=============================

ROOT {
   A {
      target=<target node's phandle>
      __overlay__ {

      }
   }

   B {
      target=<target node's phandle>
      __overlay__ {

      }
   }
}

Thanks,
Gavin

>Cc: Grant Likely <grant.likely@linaro.org>
>Cc: Rob Herring <robh+dt@kernel.org>
>Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
> drivers/of/dynamic.c   |  19 +++++--
> drivers/of/fdt.c       | 133 ++++++++++++++++++++++++++++++++++++++++---------
> include/linux/of.h     |   2 +
> include/linux/of_fdt.h |   1 +
> 4 files changed, 127 insertions(+), 28 deletions(-)
>
>diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>index 3351ef4..f562080 100644
>--- a/drivers/of/dynamic.c
>+++ b/drivers/of/dynamic.c
>@@ -330,13 +330,22 @@ void of_node_release(struct kobject *kobj)
> 		return;
> 	}
>
>-	if (!of_node_check_flag(node, OF_DYNAMIC))
>+	/* Release the subtree */
>+	if (node->subtree) {
>+		kfree(node->subtree);
>+		node->subtree = NULL;
>+	}
>+
>+	if (!of_node_check_flag(node, OF_DYNAMIC) &&
>+	    !of_node_check_flag(node, OF_DYNAMIC_HYBIRD))
> 		return;
>
> 	while (prop) {
> 		struct property *next = prop->next;
>-		kfree(prop->name);
>-		kfree(prop->value);
>+		if (of_node_check_flag(node, OF_DYNAMIC)) {
>+			kfree(prop->name);
>+			kfree(prop->value);
>+		}
> 		kfree(prop);
> 		prop = next;
>
>@@ -345,7 +354,9 @@ void of_node_release(struct kobject *kobj)
> 			node->deadprops = NULL;
> 		}
> 	}
>-	kfree(node->full_name);
>+
>+	if (of_node_check_flag(node, OF_DYNAMIC))
>+		kfree(node->full_name);
> 	kfree(node->data);
> 	kfree(node);
> }
>diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
>index cde35c5d01..7659560 100644
>--- a/drivers/of/fdt.c
>+++ b/drivers/of/fdt.c
>@@ -28,6 +28,10 @@
> #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
> #include <asm/page.h>
>
>+#include "of_private.h"
>+
>+static int cur_node_depth;
>+
> /*
>  * of_fdt_limit_memory - limit the number of regions in the /memory node
>  * @limit: maximum entries
>@@ -168,20 +172,20 @@ static void *unflatten_dt_alloc(void **mem, unsigned long size,
>  * @dad: Parent struct device_node
>  * @fpsize: Size of the node path up at the current depth.
>  */
>-static void * unflatten_dt_node(void *blob,
>-				void *mem,
>-				int *poffset,
>-				struct device_node *dad,
>-				struct device_node **nodepp,
>-				unsigned long fpsize,
>-				bool dryrun)
>+static void *unflatten_dt_node(void *blob,
>+			       void *mem,
>+			       int *poffset,
>+			       struct device_node *dad,
>+			       struct device_node **nodepp,
>+			       unsigned long fpsize,
>+			       bool dryrun,
>+			       bool dynamic)
> {
> 	const __be32 *p;
> 	struct device_node *np;
> 	struct property *pp, **prev_pp = NULL;
> 	const char *pathp;
> 	unsigned int l, allocl;
>-	static int depth = 0;
> 	int old_depth;
> 	int offset;
> 	int has_name = 0;
>@@ -219,12 +223,18 @@ static void * unflatten_dt_node(void *blob,
> 		}
> 	}
>
>-	np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl,
>+	if (dynamic)
>+		np = kzalloc(sizeof(struct device_node) + allocl, GFP_KERNEL);
>+	else
>+		np = unflatten_dt_alloc(&mem,
>+				sizeof(struct device_node) + allocl,
> 				__alignof__(struct device_node));
> 	if (!dryrun) {
> 		char *fn;
> 		of_node_init(np);
> 		np->full_name = fn = ((char *)np) + sizeof(*np);
>+		if (dynamic)
>+			of_node_set_flag(np, OF_DYNAMIC_HYBIRD);
> 		if (new_format) {
> 			/* rebuild full path for new format */
> 			if (dad && dad->parent) {
>@@ -267,8 +277,12 @@ static void * unflatten_dt_node(void *blob,
> 		}
> 		if (strcmp(pname, "name") == 0)
> 			has_name = 1;
>-		pp = unflatten_dt_alloc(&mem, sizeof(struct property),
>-					__alignof__(struct property));
>+
>+		if (dynamic)
>+			pp = kzalloc(sizeof(struct property), GFP_KERNEL);
>+		else
>+			pp = unflatten_dt_alloc(&mem, sizeof(struct property),
>+						__alignof__(struct property));
> 		if (!dryrun) {
> 			/* We accept flattened tree phandles either in
> 			 * ePAPR-style "phandle" properties, or the
>@@ -309,8 +323,13 @@ static void * unflatten_dt_node(void *blob,
> 		if (pa < ps)
> 			pa = p1;
> 		sz = (pa - ps) + 1;
>-		pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz,
>-					__alignof__(struct property));
>+
>+		if (dynamic)
>+			pp = kzalloc(sizeof(struct property) + sz, GFP_KERNEL);
>+		else
>+			pp = unflatten_dt_alloc(&mem,
>+						sizeof(struct property) + sz,
>+						__alignof__(struct property));
> 		if (!dryrun) {
> 			pp->name = "name";
> 			pp->length = sz;
>@@ -334,13 +353,21 @@ static void * unflatten_dt_node(void *blob,
> 			np->type = "<NULL>";
> 	}
>
>-	old_depth = depth;
>-	*poffset = fdt_next_node(blob, *poffset, &depth);
>-	if (depth < 0)
>-		depth = 0;
>-	while (*poffset > 0 && depth > old_depth)
>-		mem = unflatten_dt_node(blob, mem, poffset, np, NULL,
>-					fpsize, dryrun);
>+	old_depth = cur_node_depth;
>+	*poffset = fdt_next_node(blob, *poffset, &cur_node_depth);
>+	while (*poffset > 0) {
>+		if (cur_node_depth < old_depth)
>+			break;
>+
>+		if (cur_node_depth == old_depth)
>+			mem = unflatten_dt_node(blob, mem, poffset,
>+						dad, NULL, fpsize,
>+						dryrun, dynamic);
>+		else if (cur_node_depth > old_depth)
>+			mem = unflatten_dt_node(blob, mem, poffset,
>+						np, NULL, fpsize,
>+						dryrun, dynamic);
>+	}
>
> 	if (*poffset < 0 && *poffset != -FDT_ERR_NOTFOUND)
> 		pr_err("unflatten: error %d processing FDT\n", *poffset);
>@@ -379,8 +406,8 @@ static void * unflatten_dt_node(void *blob,
>  * for the resulting tree
>  */
> static void __unflatten_device_tree(void *blob,
>-			     struct device_node **mynodes,
>-			     void * (*dt_alloc)(u64 size, u64 align))
>+				struct device_node **mynodes,
>+				void * (*dt_alloc)(u64 size, u64 align))
> {
> 	unsigned long size;
> 	int start;
>@@ -405,7 +432,9 @@ static void __unflatten_device_tree(void *blob,
>
> 	/* First pass, scan for size */
> 	start = 0;
>-	size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL, NULL, 0, true);
>+	cur_node_depth = 1;
>+	size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL,
>+						NULL, 0, true, false);
> 	size = ALIGN(size, 4);
>
> 	pr_debug("  size is %lx, allocating...\n", size);
>@@ -420,7 +449,8 @@ static void __unflatten_device_tree(void *blob,
>
> 	/* Second pass, do actual unflattening */
> 	start = 0;
>-	unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false);
>+	cur_node_depth = 1;
>+	unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false, false);
> 	if (be32_to_cpup(mem + size) != 0xdeadbeef)
> 		pr_warning("End of tree marker overwritten: %08x\n",
> 			   be32_to_cpup(mem + size));
>@@ -448,6 +478,61 @@ void of_fdt_unflatten_tree(unsigned long *blob,
> }
> EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree);
>
>+static void populate_sysfs_for_child_nodes(struct device_node *parent)
>+{
>+	struct device_node *child;
>+
>+	for_each_child_of_node(parent, child) {
>+		__of_attach_node_sysfs(child);
>+		populate_sysfs_for_child_nodes(child);
>+	}
>+}
>+
>+/**
>+ * of_fdt_add_substree - Create sub-tree of device nodes
>+ * @parent: parent device node to which the sub-tree will attach
>+ * @blob: flat device tree blob representing the sub-tree
>+ *
>+ * Copy over the FDT blob, which passed from firmware, and then
>+ * unflatten the sub-tree.
>+ */
>+void of_fdt_add_subtree(struct device_node *parent, void *blob)
>+{
>+	int start = 0;
>+
>+	/* Validate the header */
>+	if (!blob || fdt_check_header(blob)) {
>+		pr_err("%s: Invalid device-tree blob header at 0x%p\n",
>+		       __func__, blob);
>+		return;
>+	}
>+
>+	/* Free the flat blob for last time lazily */
>+	if (parent->subtree) {
>+		kfree(parent->subtree);
>+		parent->subtree = NULL;
>+	}
>+
>+	/* Copy over the flat blob */
>+	parent->subtree = kzalloc(fdt_totalsize(blob), GFP_KERNEL);
>+	if (!parent->subtree) {
>+		pr_err("%s: Cannot copy over device-tree blob\n",
>+		       __func__);
>+		return;
>+	}
>+
>+	memcpy(parent->subtree, blob, fdt_totalsize(blob));
>+
>+	/* Unflatten it */
>+	mutex_lock(&of_mutex);
>+	cur_node_depth = 1;
>+	unflatten_dt_node(parent->subtree, NULL, &start, parent, NULL,
>+			  strlen(parent->full_name), false, true);
>+	populate_sysfs_for_child_nodes(parent);
>+	mutex_unlock(&of_mutex);
>+}
>+EXPORT_SYMBOL(of_fdt_add_subtree);
>+
> /* Everything below here references initial_boot_params directly. */
> int __initdata dt_root_addr_cells;
> int __initdata dt_root_size_cells;
>diff --git a/include/linux/of.h b/include/linux/of.h
>index ddeaae6..ac50b02 100644
>--- a/include/linux/of.h
>+++ b/include/linux/of.h
>@@ -60,6 +60,7 @@ struct device_node {
> 	struct	device_node *sibling;
> 	struct	kobject kobj;
> 	unsigned long _flags;
>+	void	*subtree;
> 	void	*data;
> #if defined(CONFIG_SPARC)
> 	const char *path_component_name;
>@@ -222,6 +223,7 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size)
> #define OF_DETACHED	2 /* node has been detached from the device tree */
> #define OF_POPULATED	3 /* device already created for the node */
> #define OF_POPULATED_BUS	4 /* of_platform_populate recursed to children of this node */
>+#define OF_DYNAMIC_HYBIRD	5 /* similar to OF_DYNAMIC, but partially */
>
> #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
> #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
>diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
>index 587ee50..1fb47d7 100644
>--- a/include/linux/of_fdt.h
>+++ b/include/linux/of_fdt.h
>@@ -39,6 +39,7 @@ extern int of_fdt_match(const void *blob, unsigned long node,
> 			const char *const *compat);
> extern void of_fdt_unflatten_tree(unsigned long *blob,
> 			       struct device_node **mynodes);
>+extern void of_fdt_add_subtree(struct device_node *parent, void *blob);
>
> /* TBD: Temporary export of fdt globals - remove when code fully merged */
> extern int __initdata dt_root_addr_cells;
>-- 
>2.1.0
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Grant Likely June 7, 2015, 7:54 a.m. UTC | #28
Sorry for not weighing in earlier, I've had other work keeping me away.

My short answer: don't use overlays. They're not what you need. Generic
CONFIG_OF_DYNAMIC should be all that is required to make changes in your
use case.

Overlays are a specific api for being able to apply a set of changes in
a revertable way, but as you say, it is a lot more complicated. However,
overlays are built on top of the of_changeset API which is a lot
simpler. It doesn't do any phandle resolution, and it doesn't do any
tracking. It takes a set of changes (attach node, detach node, add
property, remove property), an applies them to the live tree. At that
point the changes are permenant*.

It is documented in Documentation/devicetree/changesets.txt

Ideally, I want all DT changes to go through the changeset API so that
the lifecycle issues are delt with in one place. It also defers firing
notifiers until after the entire changeset is applied. With
of_attach_node/of_detach_node the notifiers are sent immediately after
each change when the tree may be in an inconsistent state. For example,
a driver expecting child nodes, but the child nodes haven't been added
yet.

Comments below...

* There is an API for reverting a changeset, which simply applies the
  changes backwards and in reverse. The overlay code uses it, but you
  won't need it.

On Thu, 14 May 2015 10:54:31 +1000
, Benjamin Herrenschmidt <benh@kernel.crashing.org>
 wrote:
> On Wed, 2015-05-13 at 19:18 -0500, Rob Herring wrote:
> 
> > I haven't decided really.
> > 
> > The main thing with the current patch is I don't really like the added
> > complexity to unflatten_dt_node. It is already a fairly complex
> > function. Perhaps removing of "hybrid" as discussed will help?
> 
> I agree, we should be able to make that much simpler, I was planning on
> sorting that out with Gavin.

Ditto here. I don't want to have any new kinds of nodes created either.
They are either OF_DYNAMIC, and therefore freeable, or they are not and
cannot be freed.

> > If there are things we can do to make overlays easier to use in your
> > use case, I'd like to hear ideas. I don't really buy that being more
> > complex than needed is an obstacle. That is very often the case to
> > have common, scale-able solutions. I want to see a simple case be
> > simple to support.
> 
> Well, it's a LOT more complex from the FW perspective for a bunch of
> features we don't really need, in a way because the DT update in our
> case is just purely informational to avoid keeping wrong/outdated DT
> bits, it has little functional impact (it might have a bit for interrupt
> routing through bridges though).
> 
> However, I am also pursuing an approach on FW side using a generation
> count in our nodes and properties which we could use to generate
> arbitrary overlays if we know what generation linux has.
> 
> There might actual be a usage scenario for a generic way for our
> firwmare to convey DT updates to Linux for other reasons.
> 
> A few things that I don't find in the overlay code (but maybe I haven't
> looked at it hard enough):
> 
>  - Can it remove nodes/properties ?

Overlays: No, because I asked Pantelis to drop them.
Changeset: yes, absolutely

>  - Can it "commit" a changeset so it's permanently part of the main DT ?
> We will never have a concept of "revertable" changesets, if we need a
> subsequent update, we will get a new overlay from FW that will remove
> what needs to be removed and add what needs to be added.

Yes

> IE, our current mechanism without overlay is fairly simple:
> 
>   - On PCI unplug, we remove all nodes below the slot (from linux),
> the FW does the equivalent internally.
> 
>   - On PCI re-plug, the FW internally builds new nodes and sends a
> new subtree as an FDT that we can expand/attach.
> 
> Now we could consider that subtree as a changeset that can be undone,
> but that wouldn't work for boot time. And subsequent updates wouldn't
> have that concept of "undoing" anyway.
> 
> IE. conceptually, what overlays do today is quite rooted around the idea
> of having a fixed "base" DT and some pre-compiled DTB overlays that
> get added/removed. The design completely ignore the idea of a FW that
> maintains a "live" tree which we want to keep in sync, which is what we
> want to do here, or what we could do with a "live" open firmware
> implementation.

Right, which is exactly the reason for the changeset/overlay split.
Overlays assume a fixed base, and that overlays are kind of like plug-in
modules. changeset makes no such assumption.

g.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt June 8, 2015, 8:57 p.m. UTC | #29
On Sun, 2015-06-07 at 08:54 +0100, Grant Likely wrote:
> > IE. conceptually, what overlays do today is quite rooted around the idea
> > of having a fixed "base" DT and some pre-compiled DTB overlays that
> > get added/removed. The design completely ignore the idea of a FW that
> > maintains a "live" tree which we want to keep in sync, which is what we
> > want to do here, or what we could do with a "live" open firmware
> > implementation.
> 
> Right, which is exactly the reason for the changeset/overlay split.
> Overlays assume a fixed base, and that overlays are kind of like plug-in
> modules. changeset makes no such assumption.

So you suggest we create a function that takes an fdt and an "anchor" as input,
and expands that FDT below that anchor, but does so by using the changeset API
under the hood ?

Even that looks somewhat tricky (turn that bit of FDT into a pile of changeset
actions), however, I can see how we could create a new function inside changeset
to attach a subtree.

Ie. of_attach_subtree() (which could have it's own reconfig action but we
don't care that much yet), which takes an expanded subtree and an anchor, and
calls of_attach_node() in effect for all nodes in there.

We could then have a two pass mechanism in our hotplug code:

 - Expand the bit of fdt into a separate tree
 - Use of_attach_subtree to "add" that subtree to the main tree

What do you think ?

Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Grant Likely June 8, 2015, 9:34 p.m. UTC | #30
On Mon, Jun 8, 2015 at 9:57 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Sun, 2015-06-07 at 08:54 +0100, Grant Likely wrote:
>> > IE. conceptually, what overlays do today is quite rooted around the idea
>> > of having a fixed "base" DT and some pre-compiled DTB overlays that
>> > get added/removed. The design completely ignore the idea of a FW that
>> > maintains a "live" tree which we want to keep in sync, which is what we
>> > want to do here, or what we could do with a "live" open firmware
>> > implementation.
>>
>> Right, which is exactly the reason for the changeset/overlay split.
>> Overlays assume a fixed base, and that overlays are kind of like plug-in
>> modules. changeset makes no such assumption.
>
> So you suggest we create a function that takes an fdt and an "anchor" as input,
> and expands that FDT below that anchor, but does so by using the changeset API
> under the hood ?
>
> Even that looks somewhat tricky (turn that bit of FDT into a pile of changeset
> actions), however, I can see how we could create a new function inside changeset
> to attach a subtree.
>
> Ie. of_attach_subtree() (which could have it's own reconfig action but we
> don't care that much yet), which takes an expanded subtree and an anchor, and
> calls of_attach_node() in effect for all nodes in there.
>
> We could then have a two pass mechanism in our hotplug code:
>
>  - Expand the bit of fdt into a separate tree
>  - Use of_attach_subtree to "add" that subtree to the main tree
>
> What do you think ?

I like that.

g.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gavin Shan June 10, 2015, 6:55 a.m. UTC | #31
On Mon, Jun 08, 2015 at 10:34:13PM +0100, Grant Likely wrote:
>On Mon, Jun 8, 2015 at 9:57 PM, Benjamin Herrenschmidt
><benh@kernel.crashing.org> wrote:
>> On Sun, 2015-06-07 at 08:54 +0100, Grant Likely wrote:
>>> > IE. conceptually, what overlays do today is quite rooted around the idea
>>> > of having a fixed "base" DT and some pre-compiled DTB overlays that
>>> > get added/removed. The design completely ignore the idea of a FW that
>>> > maintains a "live" tree which we want to keep in sync, which is what we
>>> > want to do here, or what we could do with a "live" open firmware
>>> > implementation.
>>>
>>> Right, which is exactly the reason for the changeset/overlay split.
>>> Overlays assume a fixed base, and that overlays are kind of like plug-in
>>> modules. changeset makes no such assumption.
>>
>> So you suggest we create a function that takes an fdt and an "anchor" as input,
>> and expands that FDT below that anchor, but does so by using the changeset API
>> under the hood ?
>>
>> Even that looks somewhat tricky (turn that bit of FDT into a pile of changeset
>> actions), however, I can see how we could create a new function inside changeset
>> to attach a subtree.
>>
>> Ie. of_attach_subtree() (which could have it's own reconfig action but we
>> don't care that much yet), which takes an expanded subtree and an anchor, and
>> calls of_attach_node() in effect for all nodes in there.
>>
>> We could then have a two pass mechanism in our hotplug code:
>>
>>  - Expand the bit of fdt into a separate tree
>>  - Use of_attach_subtree to "add" that subtree to the main tree
>>
>> What do you think ?
>
>I like that.
>

Thanks, Grant and Ben. Currently, I'm collecting more feedbacks for v5. So
it's something I will address in v6.

Thanks,
Gavin

>g.
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index 3351ef4..f562080 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -330,13 +330,22 @@  void of_node_release(struct kobject *kobj)
 		return;
 	}
 
-	if (!of_node_check_flag(node, OF_DYNAMIC))
+	/* Release the subtree */
+	if (node->subtree) {
+		kfree(node->subtree);
+		node->subtree = NULL;
+	}
+
+	if (!of_node_check_flag(node, OF_DYNAMIC) &&
+	    !of_node_check_flag(node, OF_DYNAMIC_HYBIRD))
 		return;
 
 	while (prop) {
 		struct property *next = prop->next;
-		kfree(prop->name);
-		kfree(prop->value);
+		if (of_node_check_flag(node, OF_DYNAMIC)) {
+			kfree(prop->name);
+			kfree(prop->value);
+		}
 		kfree(prop);
 		prop = next;
 
@@ -345,7 +354,9 @@  void of_node_release(struct kobject *kobj)
 			node->deadprops = NULL;
 		}
 	}
-	kfree(node->full_name);
+
+	if (of_node_check_flag(node, OF_DYNAMIC))
+		kfree(node->full_name);
 	kfree(node->data);
 	kfree(node);
 }
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index cde35c5d01..7659560 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -28,6 +28,10 @@ 
 #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
 #include <asm/page.h>
 
+#include "of_private.h"
+
+static int cur_node_depth;
+
 /*
  * of_fdt_limit_memory - limit the number of regions in the /memory node
  * @limit: maximum entries
@@ -168,20 +172,20 @@  static void *unflatten_dt_alloc(void **mem, unsigned long size,
  * @dad: Parent struct device_node
  * @fpsize: Size of the node path up at the current depth.
  */
-static void * unflatten_dt_node(void *blob,
-				void *mem,
-				int *poffset,
-				struct device_node *dad,
-				struct device_node **nodepp,
-				unsigned long fpsize,
-				bool dryrun)
+static void *unflatten_dt_node(void *blob,
+			       void *mem,
+			       int *poffset,
+			       struct device_node *dad,
+			       struct device_node **nodepp,
+			       unsigned long fpsize,
+			       bool dryrun,
+			       bool dynamic)
 {
 	const __be32 *p;
 	struct device_node *np;
 	struct property *pp, **prev_pp = NULL;
 	const char *pathp;
 	unsigned int l, allocl;
-	static int depth = 0;
 	int old_depth;
 	int offset;
 	int has_name = 0;
@@ -219,12 +223,18 @@  static void * unflatten_dt_node(void *blob,
 		}
 	}
 
-	np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl,
+	if (dynamic)
+		np = kzalloc(sizeof(struct device_node) + allocl, GFP_KERNEL);
+	else
+		np = unflatten_dt_alloc(&mem,
+				sizeof(struct device_node) + allocl,
 				__alignof__(struct device_node));
 	if (!dryrun) {
 		char *fn;
 		of_node_init(np);
 		np->full_name = fn = ((char *)np) + sizeof(*np);
+		if (dynamic)
+			of_node_set_flag(np, OF_DYNAMIC_HYBIRD);
 		if (new_format) {
 			/* rebuild full path for new format */
 			if (dad && dad->parent) {
@@ -267,8 +277,12 @@  static void * unflatten_dt_node(void *blob,
 		}
 		if (strcmp(pname, "name") == 0)
 			has_name = 1;
-		pp = unflatten_dt_alloc(&mem, sizeof(struct property),
-					__alignof__(struct property));
+
+		if (dynamic)
+			pp = kzalloc(sizeof(struct property), GFP_KERNEL);
+		else
+			pp = unflatten_dt_alloc(&mem, sizeof(struct property),
+						__alignof__(struct property));
 		if (!dryrun) {
 			/* We accept flattened tree phandles either in
 			 * ePAPR-style "phandle" properties, or the
@@ -309,8 +323,13 @@  static void * unflatten_dt_node(void *blob,
 		if (pa < ps)
 			pa = p1;
 		sz = (pa - ps) + 1;
-		pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz,
-					__alignof__(struct property));
+
+		if (dynamic)
+			pp = kzalloc(sizeof(struct property) + sz, GFP_KERNEL);
+		else
+			pp = unflatten_dt_alloc(&mem,
+						sizeof(struct property) + sz,
+						__alignof__(struct property));
 		if (!dryrun) {
 			pp->name = "name";
 			pp->length = sz;
@@ -334,13 +353,21 @@  static void * unflatten_dt_node(void *blob,
 			np->type = "<NULL>";
 	}
 
-	old_depth = depth;
-	*poffset = fdt_next_node(blob, *poffset, &depth);
-	if (depth < 0)
-		depth = 0;
-	while (*poffset > 0 && depth > old_depth)
-		mem = unflatten_dt_node(blob, mem, poffset, np, NULL,
-					fpsize, dryrun);
+	old_depth = cur_node_depth;
+	*poffset = fdt_next_node(blob, *poffset, &cur_node_depth);
+	while (*poffset > 0) {
+		if (cur_node_depth < old_depth)
+			break;
+
+		if (cur_node_depth == old_depth)
+			mem = unflatten_dt_node(blob, mem, poffset,
+						dad, NULL, fpsize,
+						dryrun, dynamic);
+		else if (cur_node_depth > old_depth)
+			mem = unflatten_dt_node(blob, mem, poffset,
+						np, NULL, fpsize,
+						dryrun, dynamic);
+	}
 
 	if (*poffset < 0 && *poffset != -FDT_ERR_NOTFOUND)
 		pr_err("unflatten: error %d processing FDT\n", *poffset);
@@ -379,8 +406,8 @@  static void * unflatten_dt_node(void *blob,
  * for the resulting tree
  */
 static void __unflatten_device_tree(void *blob,
-			     struct device_node **mynodes,
-			     void * (*dt_alloc)(u64 size, u64 align))
+				struct device_node **mynodes,
+				void * (*dt_alloc)(u64 size, u64 align))
 {
 	unsigned long size;
 	int start;
@@ -405,7 +432,9 @@  static void __unflatten_device_tree(void *blob,
 
 	/* First pass, scan for size */
 	start = 0;
-	size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL, NULL, 0, true);
+	cur_node_depth = 1;
+	size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL,
+						NULL, 0, true, false);
 	size = ALIGN(size, 4);
 
 	pr_debug("  size is %lx, allocating...\n", size);
@@ -420,7 +449,8 @@  static void __unflatten_device_tree(void *blob,
 
 	/* Second pass, do actual unflattening */
 	start = 0;
-	unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false);
+	cur_node_depth = 1;
+	unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false, false);
 	if (be32_to_cpup(mem + size) != 0xdeadbeef)
 		pr_warning("End of tree marker overwritten: %08x\n",
 			   be32_to_cpup(mem + size));
@@ -448,6 +478,61 @@  void of_fdt_unflatten_tree(unsigned long *blob,
 }
 EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree);
 
+static void populate_sysfs_for_child_nodes(struct device_node *parent)
+{
+	struct device_node *child;
+
+	for_each_child_of_node(parent, child) {
+		__of_attach_node_sysfs(child);
+		populate_sysfs_for_child_nodes(child);
+	}
+}
+
+/**
+ * of_fdt_add_substree - Create sub-tree of device nodes
+ * @parent: parent device node to which the sub-tree will attach
+ * @blob: flat device tree blob representing the sub-tree
+ *
+ * Copy over the FDT blob, which passed from firmware, and then
+ * unflatten the sub-tree.
+ */
+void of_fdt_add_subtree(struct device_node *parent, void *blob)
+{
+	int start = 0;
+
+	/* Validate the header */
+	if (!blob || fdt_check_header(blob)) {
+		pr_err("%s: Invalid device-tree blob header at 0x%p\n",
+		       __func__, blob);
+		return;
+	}
+
+	/* Free the flat blob for last time lazily */
+	if (parent->subtree) {
+		kfree(parent->subtree);
+		parent->subtree = NULL;
+	}
+
+	/* Copy over the flat blob */
+	parent->subtree = kzalloc(fdt_totalsize(blob), GFP_KERNEL);
+	if (!parent->subtree) {
+		pr_err("%s: Cannot copy over device-tree blob\n",
+		       __func__);
+		return;
+	}
+
+	memcpy(parent->subtree, blob, fdt_totalsize(blob));
+
+	/* Unflatten it */
+	mutex_lock(&of_mutex);
+	cur_node_depth = 1;
+	unflatten_dt_node(parent->subtree, NULL, &start, parent, NULL,
+			  strlen(parent->full_name), false, true);
+	populate_sysfs_for_child_nodes(parent);
+	mutex_unlock(&of_mutex);
+}
+EXPORT_SYMBOL(of_fdt_add_subtree);
+
 /* Everything below here references initial_boot_params directly. */
 int __initdata dt_root_addr_cells;
 int __initdata dt_root_size_cells;
diff --git a/include/linux/of.h b/include/linux/of.h
index ddeaae6..ac50b02 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -60,6 +60,7 @@  struct device_node {
 	struct	device_node *sibling;
 	struct	kobject kobj;
 	unsigned long _flags;
+	void	*subtree;
 	void	*data;
 #if defined(CONFIG_SPARC)
 	const char *path_component_name;
@@ -222,6 +223,7 @@  static inline unsigned long of_read_ulong(const __be32 *cell, int size)
 #define OF_DETACHED	2 /* node has been detached from the device tree */
 #define OF_POPULATED	3 /* device already created for the node */
 #define OF_POPULATED_BUS	4 /* of_platform_populate recursed to children of this node */
+#define OF_DYNAMIC_HYBIRD	5 /* similar to OF_DYNAMIC, but partially */
 
 #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
 #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 587ee50..1fb47d7 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -39,6 +39,7 @@  extern int of_fdt_match(const void *blob, unsigned long node,
 			const char *const *compat);
 extern void of_fdt_unflatten_tree(unsigned long *blob,
 			       struct device_node **mynodes);
+extern void of_fdt_add_subtree(struct device_node *parent, void *blob);
 
 /* TBD: Temporary export of fdt globals - remove when code fully merged */
 extern int __initdata dt_root_addr_cells;