diff mbox series

[v5,08/10] drm/bridge: samsung-dsim: use supporting variable for out_bridge

Message ID 20241231-hotplug-drm-bridge-v5-8-173065a1ece1@bootlin.com (mailing list archive)
State New, archived
Headers show
Series Add support for hot-pluggable DRM bridges | expand

Commit Message

Luca Ceresoli Dec. 31, 2024, 10:40 a.m. UTC
Instead of using dsi->out_bridge during the bridge search process, use a
temporary variable and assign dsi->out_bridge only on successful
completion.

The main goal is to be able to drm_bridge_get() the out_bridge before
setting it in dsi->out_bridge, which is done in a later commit. Setting
dsi->out_bridge as in current code would leave a use-after-free window in
case the bridge is deallocated by some other thread between
'dsi->out_bridge = devm_drm_panel_bridge_add()' and drm_bridge_get().

This change additionally avoids leaving an ERR_PTR value in dsi->out_bridge
on failure. This is not necessarily a problem but it is not clean.

Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>

---

This patch was added in v5.
---
 drivers/gpu/drm/bridge/samsung-dsim.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

Comments

Dmitry Baryshkov Dec. 31, 2024, 2:57 p.m. UTC | #1
On Tue, Dec 31, 2024 at 11:40:02AM +0100, Luca Ceresoli wrote:
> Instead of using dsi->out_bridge during the bridge search process, use a
> temporary variable and assign dsi->out_bridge only on successful
> completion.
> 
> The main goal is to be able to drm_bridge_get() the out_bridge before
> setting it in dsi->out_bridge, which is done in a later commit. Setting
> dsi->out_bridge as in current code would leave a use-after-free window in
> case the bridge is deallocated by some other thread between
> 'dsi->out_bridge = devm_drm_panel_bridge_add()' and drm_bridge_get().

I don't think that's how refcounting should work. Any of the functions
that give you the bridge should also increase refcount, requiring manual
_put() call afterwards. We might need a separate API for that.

> 
> This change additionally avoids leaving an ERR_PTR value in dsi->out_bridge
> on failure. This is not necessarily a problem but it is not clean.
> 
> Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
> 
> ---
> 
> This patch was added in v5.
> ---
>  drivers/gpu/drm/bridge/samsung-dsim.c | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
> index f8b4fb8357659018ec0db65374ee5d05330639ae..c4d1563fd32019efde523dfc0863be044c05a826 100644
> --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> @@ -1705,6 +1705,7 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
>  	struct device *dev = dsi->dev;
>  	struct device_node *np = dev->of_node;
>  	struct device_node *remote;
> +	struct drm_bridge *out_bridge;
>  	struct drm_panel *panel;
>  	int ret;
>  
> @@ -1740,21 +1741,23 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
>  
>  	panel = of_drm_find_panel(remote);
>  	if (!IS_ERR(panel)) {
> -		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
> +		out_bridge = devm_drm_panel_bridge_add(dev, panel);
>  	} else {
> -		dsi->out_bridge = of_drm_find_bridge(remote);
> -		if (!dsi->out_bridge)
> -			dsi->out_bridge = ERR_PTR(-EINVAL);
> +		out_bridge = of_drm_find_bridge(remote);
> +		if (!out_bridge)
> +			out_bridge = ERR_PTR(-EINVAL);
>  	}

While looking at this patch, I think we should migrate the driver to
drm_of_find_panel_or_bridge(). Then your patch might add a function
close to devm_drm_of_get_bridge() or drmm_of_get_bridge(). Otherwise we
end up having too much duplicate boilerplate code.

>  
>  	of_node_put(remote);
>  
> -	if (IS_ERR(dsi->out_bridge)) {
> -		ret = PTR_ERR(dsi->out_bridge);
> +	if (IS_ERR(out_bridge)) {
> +		ret = PTR_ERR(out_bridge);
>  		DRM_DEV_ERROR(dev, "failed to find the bridge: %d\n", ret);
>  		return ret;
>  	}
>  
> +	dsi->out_bridge = out_bridge;
> +
>  	DRM_DEV_INFO(dev, "Attached %s device (lanes:%d bpp:%d mode-flags:0x%lx)\n",
>  		     device->name, device->lanes,
>  		     mipi_dsi_pixel_format_to_bpp(device->format),
> 
> -- 
> 2.34.1
>
Luca Ceresoli Jan. 2, 2025, 12:01 p.m. UTC | #2
Hi Dmitry,

On Tue, 31 Dec 2024 16:57:38 +0200
Dmitry Baryshkov <dmitry.baryshkov@linaro.org> wrote:

> On Tue, Dec 31, 2024 at 11:40:02AM +0100, Luca Ceresoli wrote:
> > Instead of using dsi->out_bridge during the bridge search process, use a
> > temporary variable and assign dsi->out_bridge only on successful
> > completion.
> > 
> > The main goal is to be able to drm_bridge_get() the out_bridge before
> > setting it in dsi->out_bridge, which is done in a later commit. Setting
> > dsi->out_bridge as in current code would leave a use-after-free window in
> > case the bridge is deallocated by some other thread between
> > 'dsi->out_bridge = devm_drm_panel_bridge_add()' and drm_bridge_get().  
> 
> I don't think that's how refcounting should work. Any of the functions
> that give you the bridge should also increase refcount, requiring manual
> _put() call afterwards. We might need a separate API for that.

You're perfectly right.

> > This change additionally avoids leaving an ERR_PTR value in dsi->out_bridge
> > on failure. This is not necessarily a problem but it is not clean.
> > 
> > Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
> > 
> > ---
> > 
> > This patch was added in v5.
> > ---
> >  drivers/gpu/drm/bridge/samsung-dsim.c | 15 +++++++++------
> >  1 file changed, 9 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
> > index f8b4fb8357659018ec0db65374ee5d05330639ae..c4d1563fd32019efde523dfc0863be044c05a826 100644
> > --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> > +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> > @@ -1705,6 +1705,7 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> >  	struct device *dev = dsi->dev;
> >  	struct device_node *np = dev->of_node;
> >  	struct device_node *remote;
> > +	struct drm_bridge *out_bridge;
> >  	struct drm_panel *panel;
> >  	int ret;
> >  
> > @@ -1740,21 +1741,23 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> >  
> >  	panel = of_drm_find_panel(remote);
> >  	if (!IS_ERR(panel)) {
> > -		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > +		out_bridge = devm_drm_panel_bridge_add(dev, panel);
> >  	} else {
> > -		dsi->out_bridge = of_drm_find_bridge(remote);
> > -		if (!dsi->out_bridge)
> > -			dsi->out_bridge = ERR_PTR(-EINVAL);
> > +		out_bridge = of_drm_find_bridge(remote);
> > +		if (!out_bridge)
> > +			out_bridge = ERR_PTR(-EINVAL);
> >  	}  
> 
> While looking at this patch, I think we should migrate the driver to
> drm_of_find_panel_or_bridge().

Indeed, the code here is duplicating drm_of_find_panel_or_bridge(). I'm
going to convert it.

> Then your patch might add a function
> close to devm_drm_of_get_bridge() or drmm_of_get_bridge().

...which would return a bridge pointer, with refcount already
incremented. Sure, except I think it should _not_ be a drmm, as
the bridge might itself disappear while the card keeps existing.

Luca
Dmitry Baryshkov Jan. 3, 2025, 6 a.m. UTC | #3
On Thu, Jan 02, 2025 at 01:01:49PM +0100, Luca Ceresoli wrote:
> Hi Dmitry,
> 
> On Tue, 31 Dec 2024 16:57:38 +0200
> Dmitry Baryshkov <dmitry.baryshkov@linaro.org> wrote:
> 
> > On Tue, Dec 31, 2024 at 11:40:02AM +0100, Luca Ceresoli wrote:
> > > Instead of using dsi->out_bridge during the bridge search process, use a
> > > temporary variable and assign dsi->out_bridge only on successful
> > > completion.
> > > 
> > > The main goal is to be able to drm_bridge_get() the out_bridge before
> > > setting it in dsi->out_bridge, which is done in a later commit. Setting
> > > dsi->out_bridge as in current code would leave a use-after-free window in
> > > case the bridge is deallocated by some other thread between
> > > 'dsi->out_bridge = devm_drm_panel_bridge_add()' and drm_bridge_get().  
> > 
> > I don't think that's how refcounting should work. Any of the functions
> > that give you the bridge should also increase refcount, requiring manual
> > _put() call afterwards. We might need a separate API for that.
> 
> You're perfectly right.
> 
> > > This change additionally avoids leaving an ERR_PTR value in dsi->out_bridge
> > > on failure. This is not necessarily a problem but it is not clean.
> > > 
> > > Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
> > > 
> > > ---
> > > 
> > > This patch was added in v5.
> > > ---
> > >  drivers/gpu/drm/bridge/samsung-dsim.c | 15 +++++++++------
> > >  1 file changed, 9 insertions(+), 6 deletions(-)
> > > 

[...]

> > > @@ -1740,21 +1741,23 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > >  
> > >  	panel = of_drm_find_panel(remote);
> > >  	if (!IS_ERR(panel)) {
> > > -		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > +		out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > >  	} else {
> > > -		dsi->out_bridge = of_drm_find_bridge(remote);
> > > -		if (!dsi->out_bridge)
> > > -			dsi->out_bridge = ERR_PTR(-EINVAL);
> > > +		out_bridge = of_drm_find_bridge(remote);
> > > +		if (!out_bridge)
> > > +			out_bridge = ERR_PTR(-EINVAL);
> > >  	}  
> > 
> > While looking at this patch, I think we should migrate the driver to
> > drm_of_find_panel_or_bridge().
> 
> Indeed, the code here is duplicating drm_of_find_panel_or_bridge(). I'm
> going to convert it.
> 
> > Then your patch might add a function
> > close to devm_drm_of_get_bridge() or drmm_of_get_bridge().
> 
> ...which would return a bridge pointer, with refcount already
> incremented. Sure, except I think it should _not_ be a drmm, as
> the bridge might itself disappear while the card keeps existing.

Feel free to add new one.

> 
> Luca
> 
> -- 
> Luca Ceresoli, Bootlin
> Embedded Linux and Kernel engineering
> https://bootlin.com
Luca Ceresoli Jan. 10, 2025, 10:58 a.m. UTC | #4
Hi Dmitry,

On Thu, 2 Jan 2025 13:01:49 +0100
Luca Ceresoli <luca.ceresoli@bootlin.com> wrote:

> > > diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > index f8b4fb8357659018ec0db65374ee5d05330639ae..c4d1563fd32019efde523dfc0863be044c05a826 100644
> > > --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> > > +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > @@ -1705,6 +1705,7 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > >  	struct device *dev = dsi->dev;
> > >  	struct device_node *np = dev->of_node;
> > >  	struct device_node *remote;
> > > +	struct drm_bridge *out_bridge;
> > >  	struct drm_panel *panel;
> > >  	int ret;
> > >  
> > > @@ -1740,21 +1741,23 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > >  
> > >  	panel = of_drm_find_panel(remote);
> > >  	if (!IS_ERR(panel)) {
> > > -		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > +		out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > >  	} else {
> > > -		dsi->out_bridge = of_drm_find_bridge(remote);
> > > -		if (!dsi->out_bridge)
> > > -			dsi->out_bridge = ERR_PTR(-EINVAL);
> > > +		out_bridge = of_drm_find_bridge(remote);
> > > +		if (!out_bridge)
> > > +			out_bridge = ERR_PTR(-EINVAL);
> > >  	}    
> > 
> > While looking at this patch, I think we should migrate the driver to
> > drm_of_find_panel_or_bridge().  
> 
> Indeed, the code here is duplicating drm_of_find_panel_or_bridge(). I'm
> going to convert it.

Or maybe not. A similar work has been attempted in the past [0] and
then reverted. There are many subtleties one would need to take care of
before getting this right, I don't think opening this other can of
worms in the middle of the bridge refcounting work makes sense.

[0] https://patchwork.freedesktop.org/patch/482751/

Luca
Luca Ceresoli Jan. 16, 2025, 10:32 a.m. UTC | #5
Hello Dmitry, Maxime, All,

On Fri, 10 Jan 2025 11:58:19 +0100
Luca Ceresoli <luca.ceresoli@bootlin.com> wrote:

> Hi Dmitry,
> 
> On Thu, 2 Jan 2025 13:01:49 +0100
> Luca Ceresoli <luca.ceresoli@bootlin.com> wrote:
> 
> > > > diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > index f8b4fb8357659018ec0db65374ee5d05330639ae..c4d1563fd32019efde523dfc0863be044c05a826 100644
> > > > --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > @@ -1705,6 +1705,7 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > > >  	struct device *dev = dsi->dev;
> > > >  	struct device_node *np = dev->of_node;
> > > >  	struct device_node *remote;
> > > > +	struct drm_bridge *out_bridge;
> > > >  	struct drm_panel *panel;
> > > >  	int ret;
> > > >  
> > > > @@ -1740,21 +1741,23 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > > >  
> > > >  	panel = of_drm_find_panel(remote);
> > > >  	if (!IS_ERR(panel)) {
> > > > -		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > > +		out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > >  	} else {
> > > > -		dsi->out_bridge = of_drm_find_bridge(remote);
> > > > -		if (!dsi->out_bridge)
> > > > -			dsi->out_bridge = ERR_PTR(-EINVAL);
> > > > +		out_bridge = of_drm_find_bridge(remote);
> > > > +		if (!out_bridge)
> > > > +			out_bridge = ERR_PTR(-EINVAL);
> > > >  	}      
> > > 
> > > While looking at this patch, I think we should migrate the driver to
> > > drm_of_find_panel_or_bridge().    
> > 
> > Indeed, the code here is duplicating drm_of_find_panel_or_bridge(). I'm
> > going to convert it.  

I've been struggling to find a good way to handle the panel bridge
lifetime, and still haven't found a way that looks totally good.
Here's my analysis and some possible ways forward.

For "normal" bridges there is a device driver that probes, allocates a
struct drm_bridge and registers it via drm_bridge_add() and at that
point the bridge can be found by other drivers, such as the previous
bridge or the encoder. Those "other drivers" will obtain a pointer to
the struct drm_bridge and with refcounting they need to
drm_bridge_put() it.

So there are two clearly separate roles: the provider (bridge driver)
and the consumers (which gets/puts a pointer). So far so good.

And there are panels, which probe similarly as far as I can see.

And then there is the panel bridge. My understanding (which I'd love to
get clarified in case it is not accurate) is that DRM bridges expect to
always interact with "the next bridge", which cannot work for the last
bridge of course, and so the panel bridge wraps the panel pretending it
is a bridge.

This software structure is clearly not accurately modeling the
hardware (panel is not bridge), but it has been serving us so far.
However now I'm definitely hitting some troubles due to how the panel
bridge is created.

First: when the panel probes, no panel bridge is created. So other
bridges cannot find it as they would find other bridges, using
of_drm_find_bridge().

Second: to circumvent this, we have {drmm,devm_drm}_of_get_bridge()
which do (not counting error cases):

 1. call drm_of_find_panel_or_bridge() which returns:
    - a panel pointer, if found
    - otherwise a bridge pointer, if one already exists
 2. if a panel was found, it is assumed that no bridge exists yet (*)
    so one is created:
    2.1) call {drmm,devm_drm}_panel_bridge_add: a new panel bridge is
         allocated and its pointer returned
 3. the bridge obtained at 1 or 2 is returned

So, the pointer returned by {drmm,devm_drm}_of_get_bridge() can be a)
pre-existing or b) a panel bridge allocated automagically if there is a
panel. However the caller has no way to know whether a) or b) happened.
Yet a) and b) have different implications for the panel bridge lifetime
management and require that the returned pointer is disposed of in a
different way.

The fundamental design choice that is problematic with respect to
hotplugging is that the panel bridge (which is a struct drm_bridge
after all) is not created by the provider (the panel driver) but on the
fly by the first consumer that happens to need it. And the consumer is
not aware of this: it obtains a struct drm_bridge pointer and doesn't
know whether it was a) pre-existing or b) created on the fly.

So far this approach has been working because devm and drmm ensure the
panel bridge would be dealloacted at some point. However the devm and drmm
release actions are associated to the consumer struct device (the panel
bridge consumer), so if the panel bridge is removed and the consumer is
not, deallocation won't happen.

For hotplugging we cannot use drmm and devm and instead we use get/put,
to let the "next bridge" disappear with the previous one still present.
So the trivial idea is to add a drm_of_get_bridge(), similar to
{drmm,devm_drm}_of_get_bridge() except it uses plain
drm_panel_bridge_add() instead of devm/drmm variants. But then the
caller (which is the panel consumer) will have to dispose of the struct
drm_bridge pointer by calling:

 - drm_bridge_put() in case a)
 - drm_panel_bridge_remove in case b)

And that's the problem I need to solve.


First and foremost, do you think my analysis is correct?


(*) superficially this looks like another fundamental issue to me, but
    it is not my focus at the moment


Assuming it is, here are some possible ways to make the panel-bridge
work with hotplug.

 1. have drm_of_get_bridge() return an indication on how to dispose of
    the returned pointer
 2. add an ad-hoc remover function alongside drm_of_get_bridge()
 3. let all panel drivers automatically add a panel-bridge
 4. stop pretending there is always a "next bridge" after each bridge

Idea 1:

The new (non drmm/devm) drm_of_get_bridge() would return a flag to
indicate whether case a) or b) happened. Or it could return a function
pointer to be called to dispose of the returned pointer, to be
stored and called by the consumer.

I find this quite ugly and I'd call this a workaround rather than a
solution, but I'm open to discussion.

Idea 2:

I'm proposing to add drm_of_get_bridge(), which as a non-drmm, non-devm
variant to be used with refcounting. So the idea is to add alongside it
a corresponding removal function [drm_of_put_bridge()?]:

  drm_of_put_bridge(struct drm_bridge *bridge)
  {
      if (drm_bridge_is_panel(bridge))
          drm_panel_bridge_remove(bridge);
      drm_bridge_put(bridge);
  }

So the consumer would always have to call this function, which is as
automagic as *drm_of_get_bridge().

My concern is what would happen in case:

 * driver A calls drm_of_get_bridge() and a panel_bridge is created
 * driver B calls drm_of_get_bridge() on the same panel, the existing
   panel_bridge is returned

Both drivers would call drm_of_put_bridge -> drm_panel_bridge_remove,
so removing twice. However I don't think this is possible due to how the
*_drm_of_get_bridge() functions are currently implemented.

Even more, I don't think it is realistic that two different drivers call
*_drm_of_get_bridge() for the same panel. Is this assumption correct?

Idea 3: 

The idea is that if the panel driver framework always creates a panel
bridge, it will never need to be created on the fly automagically by
its consumers, so the whole problem would disappear. It also would be
better modeling the hardware: still wrapping a panel with a drm_bridge
that does not exist in the hardware, but at least having it created by
the provider driver and not by the consumer driver which happens to
look for it.

This looks like a promising and simple idea, so I tried a quick
implementation:

 void drm_panel_init(struct drm_panel *panel, struct device *dev,
                    const struct drm_panel_funcs *funcs, int connector_type)
 {
+       struct drm_bridge *bridge;
+
        INIT_LIST_HEAD(&panel->list);
        INIT_LIST_HEAD(&panel->followers);
        mutex_init(&panel->follower_lock);
        panel->dev = dev;
        panel->funcs = funcs;
        panel->connector_type = connector_type;
+
+       bridge = devm_drm_panel_bridge_add(panel->dev, panel);
+       WARN_ON(!bridge);
 }

This is somewhat working but it requires more work because:

 * as it is, it creates a circular dependency between drm_panel and the
   panel bridge, and modular builds will fail:

     depmod: ERROR: Cycle detected: drm -> drm_kms_helper -> drm

 * The panel bridge implementation should be made private to the panel
   driver only (possibly helping to solve the previous issue?)

 * Many drivers currently call devm_drm_panel_bridge_add() directly,
   they should probably call drm_of_get_bridge instead

 * drm_of_find_panel_or_bridge() should disappear: other drivers would
   just look for a bridge

Opinions about this idea?

Idea 4:

'stop pretending there is always a "next bridge" after each bridge'
looks like a _very_ long term goal, but it would be interesting to
discuss whether this is a correct idea.

If you've been reading thus far, thanks for your patience! I'll be very
glad to hear more opinions on all the above.

Luca
Dmitry Baryshkov Jan. 16, 2025, 10:56 a.m. UTC | #6
On Thu, Jan 16, 2025 at 11:32:36AM +0100, Luca Ceresoli wrote:
> Hello Dmitry, Maxime, All,
> 
> On Fri, 10 Jan 2025 11:58:19 +0100
> Luca Ceresoli <luca.ceresoli@bootlin.com> wrote:
> 
> > Hi Dmitry,
> > 
> > On Thu, 2 Jan 2025 13:01:49 +0100
> > Luca Ceresoli <luca.ceresoli@bootlin.com> wrote:
> > 
> > > > > diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > > index f8b4fb8357659018ec0db65374ee5d05330639ae..c4d1563fd32019efde523dfc0863be044c05a826 100644
> > > > > --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > > +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > > @@ -1705,6 +1705,7 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > > > >  	struct device *dev = dsi->dev;
> > > > >  	struct device_node *np = dev->of_node;
> > > > >  	struct device_node *remote;
> > > > > +	struct drm_bridge *out_bridge;
> > > > >  	struct drm_panel *panel;
> > > > >  	int ret;
> > > > >  
> > > > > @@ -1740,21 +1741,23 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > > > >  
> > > > >  	panel = of_drm_find_panel(remote);
> > > > >  	if (!IS_ERR(panel)) {
> > > > > -		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > > > +		out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > > >  	} else {
> > > > > -		dsi->out_bridge = of_drm_find_bridge(remote);
> > > > > -		if (!dsi->out_bridge)
> > > > > -			dsi->out_bridge = ERR_PTR(-EINVAL);
> > > > > +		out_bridge = of_drm_find_bridge(remote);
> > > > > +		if (!out_bridge)
> > > > > +			out_bridge = ERR_PTR(-EINVAL);
> > > > >  	}      
> > > > 
> > > > While looking at this patch, I think we should migrate the driver to
> > > > drm_of_find_panel_or_bridge().    
> > > 
> > > Indeed, the code here is duplicating drm_of_find_panel_or_bridge(). I'm
> > > going to convert it.  
> 
> I've been struggling to find a good way to handle the panel bridge
> lifetime, and still haven't found a way that looks totally good.
> Here's my analysis and some possible ways forward.
> 
> For "normal" bridges there is a device driver that probes, allocates a
> struct drm_bridge and registers it via drm_bridge_add() and at that
> point the bridge can be found by other drivers, such as the previous
> bridge or the encoder. Those "other drivers" will obtain a pointer to
> the struct drm_bridge and with refcounting they need to
> drm_bridge_put() it.
> 
> So there are two clearly separate roles: the provider (bridge driver)
> and the consumers (which gets/puts a pointer). So far so good.
> 
> And there are panels, which probe similarly as far as I can see.
> 
> And then there is the panel bridge. My understanding (which I'd love to
> get clarified in case it is not accurate) is that DRM bridges expect to
> always interact with "the next bridge", which cannot work for the last
> bridge of course, and so the panel bridge wraps the panel pretending it
> is a bridge.

Well.. There are some drivers that interact with the drm_panel directly.
However I think the tendency was to migrate to always having the bridge
instead. I.e. the only reason to interact with the panel directly seems
to be able to access panel timings instead of modes.

[skipeed 1 & 2]

> 
> Idea 3: 
> 
> The idea is that if the panel driver framework always creates a panel
> bridge, it will never need to be created on the fly automagically by
> its consumers, so the whole problem would disappear. It also would be
> better modeling the hardware: still wrapping a panel with a drm_bridge
> that does not exist in the hardware, but at least having it created by
> the provider driver and not by the consumer driver which happens to
> look for it.

I think this is the best option up to now.

> 
> This looks like a promising and simple idea, so I tried a quick
> implementation:
> 
>  void drm_panel_init(struct drm_panel *panel, struct device *dev,
>                     const struct drm_panel_funcs *funcs, int connector_type)
>  {
> +       struct drm_bridge *bridge;
> +
>         INIT_LIST_HEAD(&panel->list);
>         INIT_LIST_HEAD(&panel->followers);
>         mutex_init(&panel->follower_lock);
>         panel->dev = dev;
>         panel->funcs = funcs;
>         panel->connector_type = connector_type;
> +
> +       bridge = devm_drm_panel_bridge_add(panel->dev, panel);
> +       WARN_ON(!bridge);
>  }
> 
> This is somewhat working but it requires more work because:
> 
>  * as it is, it creates a circular dependency between drm_panel and the
>    panel bridge, and modular builds will fail:
> 
>      depmod: ERROR: Cycle detected: drm -> drm_kms_helper -> drm
> 
>  * The panel bridge implementation should be made private to the panel
>    driver only (possibly helping to solve the previous issue?)

Or merge drm_panel.c into bridge/panel.c. The panel bridge already
exports non-standard API, so it should be fine to extend / change that
API. Likewise we might move drm_panel.c to drm_kms_helper.o, also
resolving the loop. There will be a need for a Kconfig update in both
cases.

>  * Many drivers currently call devm_drm_panel_bridge_add() directly,
>    they should probably call drm_of_get_bridge instead

I'd say, make it a stub that calls drm_of_get_bridge() with a possible
deprecation warning.

> 
>  * drm_of_find_panel_or_bridge() should disappear: other drivers would
>    just look for a bridge

Please keep it for now.

Some of the drivers think that it returns literally panel-or-bridge (but
not both). However if panel bridge is already created, it will return
both. So those drivers need to be updated.

But in a longer term indeed  it should fade away.

> 
> Opinions about this idea?
> 
> Idea 4:
> 
> 'stop pretending there is always a "next bridge" after each bridge'
> looks like a _very_ long term goal, but it would be interesting to
> discuss whether this is a correct idea.
> 
> If you've been reading thus far, thanks for your patience! I'll be very
> glad to hear more opinions on all the above.
> 
> Luca
> 
> -- 
> Luca Ceresoli, Bootlin
> Embedded Linux and Kernel engineering
> https://bootlin.com
Maxime Ripard Jan. 16, 2025, 12:26 p.m. UTC | #7
Hi Luca,

On Thu, Jan 16, 2025 at 11:32:36AM +0100, Luca Ceresoli wrote:
> Hello Dmitry, Maxime, All,
> 
> On Fri, 10 Jan 2025 11:58:19 +0100
> Luca Ceresoli <luca.ceresoli@bootlin.com> wrote:
> 
> > Hi Dmitry,
> > 
> > On Thu, 2 Jan 2025 13:01:49 +0100
> > Luca Ceresoli <luca.ceresoli@bootlin.com> wrote:
> > 
> > > > > diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > > index f8b4fb8357659018ec0db65374ee5d05330639ae..c4d1563fd32019efde523dfc0863be044c05a826 100644
> > > > > --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > > +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > > > @@ -1705,6 +1705,7 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > > > >  	struct device *dev = dsi->dev;
> > > > >  	struct device_node *np = dev->of_node;
> > > > >  	struct device_node *remote;
> > > > > +	struct drm_bridge *out_bridge;
> > > > >  	struct drm_panel *panel;
> > > > >  	int ret;
> > > > >  
> > > > > @@ -1740,21 +1741,23 @@ static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
> > > > >  
> > > > >  	panel = of_drm_find_panel(remote);
> > > > >  	if (!IS_ERR(panel)) {
> > > > > -		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > > > +		out_bridge = devm_drm_panel_bridge_add(dev, panel);
> > > > >  	} else {
> > > > > -		dsi->out_bridge = of_drm_find_bridge(remote);
> > > > > -		if (!dsi->out_bridge)
> > > > > -			dsi->out_bridge = ERR_PTR(-EINVAL);
> > > > > +		out_bridge = of_drm_find_bridge(remote);
> > > > > +		if (!out_bridge)
> > > > > +			out_bridge = ERR_PTR(-EINVAL);
> > > > >  	}      
> > > > 
> > > > While looking at this patch, I think we should migrate the driver to
> > > > drm_of_find_panel_or_bridge().    
> > > 
> > > Indeed, the code here is duplicating drm_of_find_panel_or_bridge(). I'm
> > > going to convert it.  
> 
> I've been struggling to find a good way to handle the panel bridge
> lifetime, and still haven't found a way that looks totally good.
> Here's my analysis and some possible ways forward.
> 
> For "normal" bridges there is a device driver that probes, allocates a
> struct drm_bridge and registers it via drm_bridge_add() and at that
> point the bridge can be found by other drivers, such as the previous
> bridge or the encoder. Those "other drivers" will obtain a pointer to
> the struct drm_bridge and with refcounting they need to
> drm_bridge_put() it.
> 
> So there are two clearly separate roles: the provider (bridge driver)
> and the consumers (which gets/puts a pointer). So far so good.

Yeah, we agree so far :)

> And there are panels, which probe similarly as far as I can see.

Indeed.

> And then there is the panel bridge. My understanding (which I'd love to
> get clarified in case it is not accurate) is that DRM bridges expect to
> always interact with "the next bridge", which cannot work for the last
> bridge of course, and so the panel bridge wraps the panel pretending it
> is a bridge.
> 
> This software structure is clearly not accurately modeling the
> hardware (panel is not bridge),

We don't have a proper definition of what a bridge is, so as far as I'm
concerned, everything is a bridge :)

The name came from "external signal converters", but the API got reused
to support so many hardware components it's not meaningful anymore.

> but it has been serving us so far. However now I'm definitely hitting
> some troubles due to how the panel bridge is created.
> 
> First: when the panel probes, no panel bridge is created. So other
> bridges cannot find it as they would find other bridges, using
> of_drm_find_bridge().
> 
> Second: to circumvent this, we have {drmm,devm_drm}_of_get_bridge()
> which do (not counting error cases):
> 
>  1. call drm_of_find_panel_or_bridge() which returns:
>     - a panel pointer, if found
>     - otherwise a bridge pointer, if one already exists
>  2. if a panel was found, it is assumed that no bridge exists yet (*)
>     so one is created:
>     2.1) call {drmm,devm_drm}_panel_bridge_add: a new panel bridge is
>          allocated and its pointer returned
>  3. the bridge obtained at 1 or 2 is returned
> 
> So, the pointer returned by {drmm,devm_drm}_of_get_bridge() can be a)
> pre-existing or b) a panel bridge allocated automagically if there is a
> panel. However the caller has no way to know whether a) or b) happened.
> Yet a) and b) have different implications for the panel bridge lifetime
> management and require that the returned pointer is disposed of in a
> different way.
> 
> The fundamental design choice that is problematic with respect to
> hotplugging is that the panel bridge (which is a struct drm_bridge
> after all) is not created by the provider (the panel driver) but on the
> fly by the first consumer that happens to need it. And the consumer is
> not aware of this: it obtains a struct drm_bridge pointer and doesn't
> know whether it was a) pre-existing or b) created on the fly.

I'm not entirely sure why it matters? The only thing the consumer should
care about is that the bridge pointer it got is valid between the calls
to drmm_of_get_bridge() and drm_bridge_put(). As long as that statement
is true, why should we care about what or how that bridge was created?

> So far this approach has been working because devm and drmm ensure the
> panel bridge would be dealloacted at some point. However the devm and drmm
> release actions are associated to the consumer struct device (the panel
> bridge consumer), so if the panel bridge is removed and the consumer is
> not, deallocation won't happen.

Oh, right, if one doesn't call drm_bridge_put(), that will result in a
memory leak. The general topic we discuss and try to address here is
memory safety, and a memory leak is considered safe. It's also going to
get allocated only a couple of times anyway, so it's not a *huge*
concern.

And about how to actually fix it, there's two ways to go about it:

  * Either we do a coccinelle script and try to put all those
    drm_bridge_put() everywhere;

  * Or we create a devm/drmm action and drop the reference
    automatically.

The latter is obviously less intrusive, we would need to deprecate
devm_of_get_bridge() for it to be safe, and I'm not entirely sure it
would be enough, but it might just work.

> For hotplugging we cannot use drmm and devm and instead we use get/put,
> to let the "next bridge" disappear with the previous one still present.
> So the trivial idea is to add a drm_of_get_bridge(), similar to
> {drmm,devm_drm}_of_get_bridge() except it uses plain
> drm_panel_bridge_add() instead of devm/drmm variants. But then the
> caller (which is the panel consumer) will have to dispose of the struct
> drm_bridge pointer by calling:
> 
>  - drm_bridge_put() in case a)
>  - drm_panel_bridge_remove in case b)
> 
> And that's the problem I need to solve.

I'm not sure the problem is limited to panel_bridge. Your question is
essentially: how do I make sure a driver-specific init is properly freed
at drm_bridge_put time. This was done so far mostly at bridge remove
time, but we obviously can't do that anymore.

But we'd have the same issue if, say, we needed to remove a workqueue
from a driver.

I think we need a destroy() hook for bridges, just like we have for
connectors for example that would deal with calling
drm_panel_bridge_remove() if necessary, or any other driver-specific
sequence.

> First and foremost, do you think my analysis is correct?
>
> (*) superficially this looks like another fundamental issue to me, but
>     it is not my focus at the moment
>
> Assuming it is, here are some possible ways to make the panel-bridge
> work with hotplug.
> 
>  1. have drm_of_get_bridge() return an indication on how to dispose of
>     the returned pointer
>  2. add an ad-hoc remover function alongside drm_of_get_bridge()
>  3. let all panel drivers automatically add a panel-bridge
>  4. stop pretending there is always a "next bridge" after each bridge
> 
> Idea 1:
> 
> The new (non drmm/devm) drm_of_get_bridge() would return a flag to
> indicate whether case a) or b) happened. Or it could return a function
> pointer to be called to dispose of the returned pointer, to be
> stored and called by the consumer.
> 
> I find this quite ugly and I'd call this a workaround rather than a
> solution, but I'm open to discussion.
> 
> Idea 2:
> 
> I'm proposing to add drm_of_get_bridge(), which as a non-drmm, non-devm
> variant to be used with refcounting. So the idea is to add alongside it
> a corresponding removal function [drm_of_put_bridge()?]:
> 
>   drm_of_put_bridge(struct drm_bridge *bridge)
>   {
>       if (drm_bridge_is_panel(bridge))
>           drm_panel_bridge_remove(bridge);
>       drm_bridge_put(bridge);
>   }
> 
> So the consumer would always have to call this function, which is as
> automagic as *drm_of_get_bridge().
> 
> My concern is what would happen in case:
> 
>  * driver A calls drm_of_get_bridge() and a panel_bridge is created
>  * driver B calls drm_of_get_bridge() on the same panel, the existing
>    panel_bridge is returned
> 
> Both drivers would call drm_of_put_bridge -> drm_panel_bridge_remove,
> so removing twice. However I don't think this is possible due to how the
> *_drm_of_get_bridge() functions are currently implemented.
> 
> Even more, I don't think it is realistic that two different drivers call
> *_drm_of_get_bridge() for the same panel. Is this assumption correct?
> 
> Idea 3: 
> 
> The idea is that if the panel driver framework always creates a panel
> bridge, it will never need to be created on the fly automagically by
> its consumers, so the whole problem would disappear. It also would be
> better modeling the hardware: still wrapping a panel with a drm_bridge
> that does not exist in the hardware, but at least having it created by
> the provider driver and not by the consumer driver which happens to
> look for it.
> 
> This looks like a promising and simple idea, so I tried a quick
> implementation:
> 
>  void drm_panel_init(struct drm_panel *panel, struct device *dev,
>                     const struct drm_panel_funcs *funcs, int connector_type)
>  {
> +       struct drm_bridge *bridge;
> +
>         INIT_LIST_HEAD(&panel->list);
>         INIT_LIST_HEAD(&panel->followers);
>         mutex_init(&panel->follower_lock);
>         panel->dev = dev;
>         panel->funcs = funcs;
>         panel->connector_type = connector_type;
> +
> +       bridge = devm_drm_panel_bridge_add(panel->dev, panel);
> +       WARN_ON(!bridge);
>  }
>
> This is somewhat working but it requires more work because:
> 
>  * as it is, it creates a circular dependency between drm_panel and the
>    panel bridge, and modular builds will fail:
> 
>      depmod: ERROR: Cycle detected: drm -> drm_kms_helper -> drm
> 
>  * The panel bridge implementation should be made private to the panel
>    driver only (possibly helping to solve the previous issue?)
> 
>  * Many drivers currently call devm_drm_panel_bridge_add() directly,
>    they should probably call drm_of_get_bridge instead
> 
>  * drm_of_find_panel_or_bridge() should disappear: other drivers would
>    just look for a bridge
> 
> Opinions about this idea?
> 
> Idea 4:
> 
> 'stop pretending there is always a "next bridge" after each bridge'
> looks like a _very_ long term goal, but it would be interesting to
> discuss whether this is a correct idea.
> 
> If you've been reading thus far, thanks for your patience! I'll be very
> glad to hear more opinions on all the above.

I don't think we need any of them. Adding a mechanism similar to
drm_connector_free() seems to be what you're looking for.

Maxime
Luca Ceresoli Jan. 21, 2025, 11:27 a.m. UTC | #8
Hi Dmitry,

On Thu, 16 Jan 2025 12:56:25 +0200
Dmitry Baryshkov <dmitry.baryshkov@linaro.org> wrote:

[...]

> > Idea 3: 
> > 
> > The idea is that if the panel driver framework always creates a panel
> > bridge, it will never need to be created on the fly automagically by
> > its consumers, so the whole problem would disappear. It also would be
> > better modeling the hardware: still wrapping a panel with a drm_bridge
> > that does not exist in the hardware, but at least having it created by
> > the provider driver and not by the consumer driver which happens to
> > look for it.  
> 
> I think this is the best option up to now.

Thanks for sharing your opinion. However a few points are not clear to
me, see below.

> > This looks like a promising and simple idea, so I tried a quick
> > implementation:
> > 
> >  void drm_panel_init(struct drm_panel *panel, struct device *dev,
> >                     const struct drm_panel_funcs *funcs, int connector_type)
> >  {
> > +       struct drm_bridge *bridge;
> > +
> >         INIT_LIST_HEAD(&panel->list);
> >         INIT_LIST_HEAD(&panel->followers);
> >         mutex_init(&panel->follower_lock);
> >         panel->dev = dev;
> >         panel->funcs = funcs;
> >         panel->connector_type = connector_type;
> > +
> > +       bridge = devm_drm_panel_bridge_add(panel->dev, panel);
> > +       WARN_ON(!bridge);
> >  }
> > 
> > This is somewhat working but it requires more work because:
> > 
> >  * as it is, it creates a circular dependency between drm_panel and the
> >    panel bridge, and modular builds will fail:
> > 
> >      depmod: ERROR: Cycle detected: drm -> drm_kms_helper -> drm
> > 
> >  * The panel bridge implementation should be made private to the panel
> >    driver only (possibly helping to solve the previous issue?)  
> 
> Or merge drm_panel.c into bridge/panel.c.

Not sure here you mean exactly what you wrote, or the other way around.
It looks more correct to me that the panel core (drm_panel.c) starts
exposing a bridge now, and not that the panel bridge which is just one
of many bridge drivers starts handling all the panel-related stuff.

> The panel bridge already
> exports non-standard API, so it should be fine to extend / change that
> API. Likewise we might move drm_panel.c to drm_kms_helper.o, also
> resolving the loop.

Again I'm not following I'm afraid. It would seem more logical to me to
move things from the helper into drm.o as they become more necessary
for common cases, rather than moving things out to a helper module and
then force all panel drivers to depend on the helper module.

Apologies, I'm perhaps not following your reasoning here. :(

> There will be a need for a Kconfig update in both
> cases.
> 
> >  * Many drivers currently call devm_drm_panel_bridge_add() directly,
> >    they should probably call drm_of_get_bridge instead  
> 
> I'd say, make it a stub that calls drm_of_get_bridge() with a possible
> deprecation warning.

I get the idea, but it would need to be implemented differently because
drm_of_get_bridge() calls devm_drm_panel_bridge_add(). They would loop
into each other. I think we might simply add a check at the beginning
of drm_panel_bridge_add_typed(), as in (pseudocode):

drm_panel_bridge_add_typed(struct drm_panel *panel, ...)
{
    if (panel_has_a_panel_bridge(panel))
        return panel->panel_bridge.bridge;

    // existing code
}

But that reopens the same issue: drm_panel_bridge_add_typed() would not
always call drm_bridge_add(), so the caller doesn't know how to dispose
of the returned pointer.

I think idea 3 can only work if the code understands that the panel
bridge is created by the panel and they _never_ have to create it
themselves. So handling the transition for all drivers would be quite
painful.

> >  * drm_of_find_panel_or_bridge() should disappear: other drivers would
> >    just look for a bridge  
> 
> Please keep it for now.
> 
> Some of the drivers think that it returns literally panel-or-bridge (but
> not both). However if panel bridge is already created, it will return
> both. So those drivers need to be updated.

I really believe it never returns both. The *bridge is set only when
ret != 0 here [0], which can happen only if a panel was not found.
Despite the slightly intricate logic of that function, I don't see how
it could return both in its current implementation.

[0]
https://elixir.bootlin.com/linux/v6.13-rc3/source/drivers/gpu/drm/drm_of.c#L273

Luca
Luca Ceresoli Jan. 21, 2025, 11:27 a.m. UTC | #9
Hi Maxime,

On Thu, 16 Jan 2025 13:26:25 +0100
Maxime Ripard <mripard@kernel.org> wrote:

[...]

> > And then there is the panel bridge. My understanding (which I'd love to
> > get clarified in case it is not accurate) is that DRM bridges expect to
> > always interact with "the next bridge", which cannot work for the last
> > bridge of course, and so the panel bridge wraps the panel pretending it
> > is a bridge.
> > 
> > This software structure is clearly not accurately modeling the
> > hardware (panel is not bridge),  
> 
> We don't have a proper definition of what a bridge is, so as far as I'm
> concerned, everything is a bridge :)
> 
> The name came from "external signal converters", but the API got reused
> to support so many hardware components it's not meaningful anymore.

So if I'm getting your point here, drm_bridge is actually the base
class for DRM devices in OOP jargon, or a "DRM subunit" in V4L2 jargon.
OK, that's fine for me (except maybe for a "we should rename" thought).

[...]

> > So far this approach has been working because devm and drmm ensure the
> > panel bridge would be dealloacted at some point. However the devm and drmm
> > release actions are associated to the consumer struct device (the panel
> > bridge consumer), so if the panel bridge is removed and the consumer is
> > not, deallocation won't happen.  
> 
> Oh, right, if one doesn't call drm_bridge_put(), that will result in a
> memory leak. The general topic we discuss and try to address here is
> memory safety, and a memory leak is considered safe. It's also going to
> get allocated only a couple of times anyway, so it's not a *huge*
> concern.
> 
> And about how to actually fix it, there's two ways to go about it:
> 
>   * Either we do a coccinelle script and try to put all those
>     drm_bridge_put() everywhere;
> 
>   * Or we create a devm/drmm action and drop the reference
>     automatically.
> 
> The latter is obviously less intrusive, we would need to deprecate
> devm_of_get_bridge() for it to be safe, and I'm not entirely sure it
> would be enough, but it might just work.
> 
> > For hotplugging we cannot use drmm and devm and instead we use get/put,
> > to let the "next bridge" disappear with the previous one still present.
> > So the trivial idea is to add a drm_of_get_bridge(), similar to
> > {drmm,devm_drm}_of_get_bridge() except it uses plain
> > drm_panel_bridge_add() instead of devm/drmm variants. But then the
> > caller (which is the panel consumer) will have to dispose of the struct
> > drm_bridge pointer by calling:
> > 
> >  - drm_bridge_put() in case a)
> >  - drm_panel_bridge_remove in case b)
> > 
> > And that's the problem I need to solve.  
> 
> I'm not sure the problem is limited to panel_bridge. Your question is
> essentially: how do I make sure a driver-specific init is properly freed
> at drm_bridge_put time. This was done so far mostly at bridge remove
> time, but we obviously can't do that anymore.
> 
> But we'd have the same issue if, say, we needed to remove a workqueue
> from a driver.
> 
> I think we need a destroy() hook for bridges, just like we have for
> connectors for example that would deal with calling
> drm_panel_bridge_remove() if necessary, or any other driver-specific
> sequence.

The .destroy hook looked appealing at first, however as I tried to
apply the idea to bridges I'm not sure it matches. Here's why.

The normal (and sane) flow for a bridge is:

 A) probe
    1. allocate private struct embedding struct drm_bridge
       (I have an _alloc() variant ready for v5 to improve this as you proposed)
    2. get resources, initialize struct fields
    3. drm_bridge_add(): publish bridge into global bridge_list

Now the bridge can be found and pointers taken and used.

And on hardware removal, in reverse order:
 
 B) remove (hardware is hot-unplugged)
    3. unpublish bridge
    2. release resources, cleanup
    1. kfree private struct

Some drivers do real stuff in B2, so it is important that B3 happens
before B2, isn't it? We don't want other drivers to find and use a
bridge that is being dismantled, or afterwards.

B3 should normally happen by removing the bridge from the global
bridge_list, or other bridges might find it. However setting the "gone"
bool and teaching of_drm_find_bridge() & Co to skip bridges with
gone==true would allow to postpone the actual removal, if needed.

With that said, with hotplugging there will be two distinct events:

 * hardware removal
 * last ref is put

The second event could happen way later than the first one. During the
time frame between the two events we need the bridge to be unpublished
and the bridge resources to be already released, as the hardware is
gone. We cannot do this at the last put, it's too late.

So I think the only sane sequence is:

 * on hardware removal:
     B3) unpublish bridge (drm_bridge_remove() or just set gone flag)
     B2) free resources, deinit whatever needed
 * when last ref is put
     B1) kfree (likely via devm)

So, back to the .destroy hook, it would fit at B2, and not at the last
put.

However in that place it seems unnecessary. The actions "on hardware
removal" (B3, B2) are done by the driver remove function, so they are
already driver specific without any additional hook. I'm however fine
to add the hook on hardware removal in case there's a good reason I
missed.

Do you think my reasoning is correct so far?

If you don't, can you clarify at which events (hardware removal VS last
put) the various actions (drm_bridge_remove, set gone flag, calling
.destroy, free resources and deinint, kfree) should be done?


(Side note: I've been pondering on why the .destroy hook works for
connectors and would not for bridges. I think it's due to the global
bridge_list, or because of the different lifetime management based on
drmm for connectors, or both.)


It may look as if my discussion is about bridges in general and not
about the panel bridge. However before delving into how to dispose of
a panel bridge we need to sort out how to dispose of a bridge in the
first place.

Luca
Dmitry Baryshkov Jan. 21, 2025, 11:57 a.m. UTC | #10
On Tue, Jan 21, 2025 at 12:27:18PM +0100, Luca Ceresoli wrote:
> Hi Dmitry,
> 
> On Thu, 16 Jan 2025 12:56:25 +0200
> Dmitry Baryshkov <dmitry.baryshkov@linaro.org> wrote:
> 
> [...]
> 
> > > Idea 3: 
> > > 
> > > The idea is that if the panel driver framework always creates a panel
> > > bridge, it will never need to be created on the fly automagically by
> > > its consumers, so the whole problem would disappear. It also would be
> > > better modeling the hardware: still wrapping a panel with a drm_bridge
> > > that does not exist in the hardware, but at least having it created by
> > > the provider driver and not by the consumer driver which happens to
> > > look for it.  
> > 
> > I think this is the best option up to now.
> 
> Thanks for sharing your opinion. However a few points are not clear to
> me, see below.
> 
> > > This looks like a promising and simple idea, so I tried a quick
> > > implementation:
> > > 
> > >  void drm_panel_init(struct drm_panel *panel, struct device *dev,
> > >                     const struct drm_panel_funcs *funcs, int connector_type)
> > >  {
> > > +       struct drm_bridge *bridge;
> > > +
> > >         INIT_LIST_HEAD(&panel->list);
> > >         INIT_LIST_HEAD(&panel->followers);
> > >         mutex_init(&panel->follower_lock);
> > >         panel->dev = dev;
> > >         panel->funcs = funcs;
> > >         panel->connector_type = connector_type;
> > > +
> > > +       bridge = devm_drm_panel_bridge_add(panel->dev, panel);
> > > +       WARN_ON(!bridge);
> > >  }
> > > 
> > > This is somewhat working but it requires more work because:
> > > 
> > >  * as it is, it creates a circular dependency between drm_panel and the
> > >    panel bridge, and modular builds will fail:
> > > 
> > >      depmod: ERROR: Cycle detected: drm -> drm_kms_helper -> drm
> > > 
> > >  * The panel bridge implementation should be made private to the panel
> > >    driver only (possibly helping to solve the previous issue?)  
> > 
> > Or merge drm_panel.c into bridge/panel.c.
> 
> Not sure here you mean exactly what you wrote, or the other way around.
> It looks more correct to me that the panel core (drm_panel.c) starts
> exposing a bridge now, and not that the panel bridge which is just one
> of many bridge drivers starts handling all the panel-related stuff.

No, I actually meant what I wrote: merge drm_panel.c into
bridge/panel.c. Indeed we have some drivers that use panel directly.
However drm_bridge is a more generic interface. So, yes, I propose to
have a bridge driver which incorporates panel support.

> 
> > The panel bridge already
> > exports non-standard API, so it should be fine to extend / change that
> > API. Likewise we might move drm_panel.c to drm_kms_helper.o, also
> > resolving the loop.
> 
> Again I'm not following I'm afraid. It would seem more logical to me to
> move things from the helper into drm.o as they become more necessary
> for common cases, rather than moving things out to a helper module and
> then force all panel drivers to depend on the helper module.

There are a lot of DRM drivers which do not use panels (nor bridges).
Merging that code into drm.ko means that everybody ends up having that
piece of code in memory. Moving drm_panel.o out of drm.ko and
bridge/panel.o out of drm_kms_helper.ko would make the kernel slightly
smaller for those desktop-only users.

> 
> Apologies, I'm perhaps not following your reasoning here. :(
> 
> > There will be a need for a Kconfig update in both
> > cases.
> > 
> > >  * Many drivers currently call devm_drm_panel_bridge_add() directly,
> > >    they should probably call drm_of_get_bridge instead  
> > 
> > I'd say, make it a stub that calls drm_of_get_bridge() with a possible
> > deprecation warning.
> 
> I get the idea, but it would need to be implemented differently because
> drm_of_get_bridge() calls devm_drm_panel_bridge_add(). They would loop

If the drm_bridge is added during drm_panel_add(), then there is no need
to call devm_drm_panel_bridge_add() from drm_of_get_bridge().

> into each other. I think we might simply add a check at the beginning
> of drm_panel_bridge_add_typed(), as in (pseudocode):
> 
> drm_panel_bridge_add_typed(struct drm_panel *panel, ...)
> {
>     if (panel_has_a_panel_bridge(panel))
>         return panel->panel_bridge.bridge;
> 
>     // existing code
> }
> 
> But that reopens the same issue: drm_panel_bridge_add_typed() would not
> always call drm_bridge_add(), so the caller doesn't know how to dispose
> of the returned pointer.
> 
> I think idea 3 can only work if the code understands that the panel
> bridge is created by the panel and they _never_ have to create it
> themselves. So handling the transition for all drivers would be quite
> painful.

That's why I suggested just stubbing existing functions just to lookup
and return a bridge: drivers can transition one-by-one. The bridge will
be already there, created by the panel support code.

> 
> > >  * drm_of_find_panel_or_bridge() should disappear: other drivers would
> > >    just look for a bridge  
> > 
> > Please keep it for now.
> > 
> > Some of the drivers think that it returns literally panel-or-bridge (but
> > not both). However if panel bridge is already created, it will return
> > both. So those drivers need to be updated.
> 
> I really believe it never returns both. The *bridge is set only when
> ret != 0 here [0], which can happen only if a panel was not found.
> Despite the slightly intricate logic of that function, I don't see how
> it could return both in its current implementation.

Indeed. You are right, I skipped the if(ret) condition. More boilerplate
code in the drivers :-(

> 
> [0]
> https://elixir.bootlin.com/linux/v6.13-rc3/source/drivers/gpu/drm/drm_of.c#L273
> 
> Luca
> 
> -- 
> Luca Ceresoli, Bootlin
> Embedded Linux and Kernel engineering
> https://bootlin.com
Maxime Ripard Jan. 28, 2025, 3:09 p.m. UTC | #11
On Tue, Jan 21, 2025 at 12:27:29PM +0100, Luca Ceresoli wrote:
> Hi Maxime,
> 
> On Thu, 16 Jan 2025 13:26:25 +0100
> Maxime Ripard <mripard@kernel.org> wrote:
> 
> [...]
> 
> > > And then there is the panel bridge. My understanding (which I'd love to
> > > get clarified in case it is not accurate) is that DRM bridges expect to
> > > always interact with "the next bridge", which cannot work for the last
> > > bridge of course, and so the panel bridge wraps the panel pretending it
> > > is a bridge.
> > > 
> > > This software structure is clearly not accurately modeling the
> > > hardware (panel is not bridge),  
> > 
> > We don't have a proper definition of what a bridge is, so as far as I'm
> > concerned, everything is a bridge :)
> > 
> > The name came from "external signal converters", but the API got reused
> > to support so many hardware components it's not meaningful anymore.
> 
> So if I'm getting your point here, drm_bridge is actually the base
> class for DRM devices in OOP jargon, or a "DRM subunit" in V4L2 jargon.
> OK, that's fine for me (except maybe for a "we should rename" thought).

To be clear, I'm not sure it's worth renaming drm_bridge to something
else, and I certainly don't consider it is a prerequisite to this
series.

> > > So far this approach has been working because devm and drmm ensure the
> > > panel bridge would be dealloacted at some point. However the devm and drmm
> > > release actions are associated to the consumer struct device (the panel
> > > bridge consumer), so if the panel bridge is removed and the consumer is
> > > not, deallocation won't happen.  
> > 
> > Oh, right, if one doesn't call drm_bridge_put(), that will result in a
> > memory leak. The general topic we discuss and try to address here is
> > memory safety, and a memory leak is considered safe. It's also going to
> > get allocated only a couple of times anyway, so it's not a *huge*
> > concern.
> > 
> > And about how to actually fix it, there's two ways to go about it:
> > 
> >   * Either we do a coccinelle script and try to put all those
> >     drm_bridge_put() everywhere;
> > 
> >   * Or we create a devm/drmm action and drop the reference
> >     automatically.
> > 
> > The latter is obviously less intrusive, we would need to deprecate
> > devm_of_get_bridge() for it to be safe, and I'm not entirely sure it
> > would be enough, but it might just work.
> > 
> > > For hotplugging we cannot use drmm and devm and instead we use get/put,
> > > to let the "next bridge" disappear with the previous one still present.
> > > So the trivial idea is to add a drm_of_get_bridge(), similar to
> > > {drmm,devm_drm}_of_get_bridge() except it uses plain
> > > drm_panel_bridge_add() instead of devm/drmm variants. But then the
> > > caller (which is the panel consumer) will have to dispose of the struct
> > > drm_bridge pointer by calling:
> > > 
> > >  - drm_bridge_put() in case a)
> > >  - drm_panel_bridge_remove in case b)
> > > 
> > > And that's the problem I need to solve.  
> > 
> > I'm not sure the problem is limited to panel_bridge. Your question is
> > essentially: how do I make sure a driver-specific init is properly freed
> > at drm_bridge_put time. This was done so far mostly at bridge remove
> > time, but we obviously can't do that anymore.
> > 
> > But we'd have the same issue if, say, we needed to remove a workqueue
> > from a driver.
> > 
> > I think we need a destroy() hook for bridges, just like we have for
> > connectors for example that would deal with calling
> > drm_panel_bridge_remove() if necessary, or any other driver-specific
> > sequence.
> 
> The .destroy hook looked appealing at first, however as I tried to
> apply the idea to bridges I'm not sure it matches. Here's why.
> 
> The normal (and sane) flow for a bridge is:
> 
>  A) probe
>     1. allocate private struct embedding struct drm_bridge
>        (I have an _alloc() variant ready for v5 to improve this as you proposed)
>     2. get resources, initialize struct fields
>     3. drm_bridge_add(): publish bridge into global bridge_list
> 
> Now the bridge can be found and pointers taken and used.

We agree so far.

> And on hardware removal, in reverse order:
>  
>  B) remove (hardware is hot-unplugged)
>     3. unpublish bridge
>     2. release resources, cleanup
>     1. kfree private struct

I think the sequence would rather be something like:

B') remove
  3. unpublish bridge
  2. release device resources
  1. release reference

C') last put
  2. release KMS resources
  1. kfree private struct

> Some drivers do real stuff in B2, so it is important that B3 happens
> before B2, isn't it? We don't want other drivers to find and use a
> bridge that is being dismantled, or afterwards.

Yeah, B3/B'3 should definitely happen first.

> B3 should normally happen by removing the bridge from the global
> bridge_list, or other bridges might find it. However setting the "gone"
> bool and teaching of_drm_find_bridge() & Co to skip bridges with
> gone==true would allow to postpone the actual removal, if needed.
> 
> With that said, with hotplugging there will be two distinct events:
> 
>  * hardware removal
>  * last ref is put
> 
> The second event could happen way later than the first one. During the
> time frame between the two events we need the bridge to be unpublished
> and the bridge resources to be already released, as the hardware is
> gone. We cannot do this at the last put, it's too late.
> 
> So I think the only sane sequence is:
> 
>  * on hardware removal:
>      B3) unpublish bridge (drm_bridge_remove() or just set gone flag)
>      B2) free resources, deinit whatever needed
>  * when last ref is put
>      B1) kfree (likely via devm)

No, devm will have destroyed it in B'2. We need to destroy it in the
cleanup hook of kref_put

> So, back to the .destroy hook, it would fit at B2, and not at the last
> put.

destroy would be called at C'2 time

> However in that place it seems unnecessary. The actions "on hardware
> removal" (B3, B2) are done by the driver remove function, so they are
> already driver specific without any additional hook. I'm however fine
> to add the hook on hardware removal in case there's a good reason I
> missed.
> 
> Do you think my reasoning is correct so far?
> 
> If you don't, can you clarify at which events (hardware removal VS last
> put) the various actions (drm_bridge_remove, set gone flag, calling
> .destroy, free resources and deinint, kfree) should be done?

I believe I did already, the gone flag should be set before B'2

Maxime
> 
> 
> (Side note: I've been pondering on why the .destroy hook works for
> connectors and would not for bridges. I think it's due to the global
> bridge_list, or because of the different lifetime management based on
> drmm for connectors, or both.)
> 
> 
> It may look as if my discussion is about bridges in general and not
> about the panel bridge. However before delving into how to dispose of
> a panel bridge we need to sort out how to dispose of a bridge in the
> first place.
> 
> Luca
> 
> -- 
> Luca Ceresoli, Bootlin
> Embedded Linux and Kernel engineering
> https://bootlin.com
>
Maxime Ripard Jan. 28, 2025, 3:52 p.m. UTC | #12
On Tue, Jan 21, 2025 at 01:57:47PM +0200, Dmitry Baryshkov wrote:
> On Tue, Jan 21, 2025 at 12:27:18PM +0100, Luca Ceresoli wrote:
> > Hi Dmitry,
> > 
> > On Thu, 16 Jan 2025 12:56:25 +0200
> > Dmitry Baryshkov <dmitry.baryshkov@linaro.org> wrote:
> > 
> > [...]
> > 
> > > > Idea 3: 
> > > > 
> > > > The idea is that if the panel driver framework always creates a panel
> > > > bridge, it will never need to be created on the fly automagically by
> > > > its consumers, so the whole problem would disappear. It also would be
> > > > better modeling the hardware: still wrapping a panel with a drm_bridge
> > > > that does not exist in the hardware, but at least having it created by
> > > > the provider driver and not by the consumer driver which happens to
> > > > look for it.  
> > > 
> > > I think this is the best option up to now.
> > 
> > Thanks for sharing your opinion. However a few points are not clear to
> > me, see below.
> > 
> > > > This looks like a promising and simple idea, so I tried a quick
> > > > implementation:
> > > > 
> > > >  void drm_panel_init(struct drm_panel *panel, struct device *dev,
> > > >                     const struct drm_panel_funcs *funcs, int connector_type)
> > > >  {
> > > > +       struct drm_bridge *bridge;
> > > > +
> > > >         INIT_LIST_HEAD(&panel->list);
> > > >         INIT_LIST_HEAD(&panel->followers);
> > > >         mutex_init(&panel->follower_lock);
> > > >         panel->dev = dev;
> > > >         panel->funcs = funcs;
> > > >         panel->connector_type = connector_type;
> > > > +
> > > > +       bridge = devm_drm_panel_bridge_add(panel->dev, panel);
> > > > +       WARN_ON(!bridge);
> > > >  }
> > > > 
> > > > This is somewhat working but it requires more work because:
> > > > 
> > > >  * as it is, it creates a circular dependency between drm_panel and the
> > > >    panel bridge, and modular builds will fail:
> > > > 
> > > >      depmod: ERROR: Cycle detected: drm -> drm_kms_helper -> drm
> > > > 
> > > >  * The panel bridge implementation should be made private to the panel
> > > >    driver only (possibly helping to solve the previous issue?)  
> > > 
> > > Or merge drm_panel.c into bridge/panel.c.
> > 
> > Not sure here you mean exactly what you wrote, or the other way around.
> > It looks more correct to me that the panel core (drm_panel.c) starts
> > exposing a bridge now, and not that the panel bridge which is just one
> > of many bridge drivers starts handling all the panel-related stuff.
> 
> No, I actually meant what I wrote: merge drm_panel.c into
> bridge/panel.c. Indeed we have some drivers that use panel directly.
> However drm_bridge is a more generic interface. So, yes, I propose to
> have a bridge driver which incorporates panel support.

It's a legitimate subject to discuss, but I'm not sure it's worth
focusing on this at this point though.

We should probably split it out into smaller chunks. Figuring out the
drivers lifetime, and reference counting API is hard enough as it is,
throwing panels into the mix at this point just adds more complexity.

Maxime
Luca Ceresoli Jan. 29, 2025, 11:51 a.m. UTC | #13
Hello Maxime,

thanks for the continued feedback.

On Tue, 28 Jan 2025 16:09:53 +0100
Maxime Ripard <mripard@kernel.org> wrote:

> To be clear, I'm not sure it's worth renaming drm_bridge to something
> else, and I certainly don't consider it is a prerequisite to this
> series.

Sure, I agree.

> > > > For hotplugging we cannot use drmm and devm and instead we use get/put,
> > > > to let the "next bridge" disappear with the previous one still present.
> > > > So the trivial idea is to add a drm_of_get_bridge(), similar to
> > > > {drmm,devm_drm}_of_get_bridge() except it uses plain
> > > > drm_panel_bridge_add() instead of devm/drmm variants. But then the
> > > > caller (which is the panel consumer) will have to dispose of the struct
> > > > drm_bridge pointer by calling:
> > > > 
> > > >  - drm_bridge_put() in case a)
> > > >  - drm_panel_bridge_remove in case b)
> > > > 
> > > > And that's the problem I need to solve.    
> > > 
> > > I'm not sure the problem is limited to panel_bridge. Your question is
> > > essentially: how do I make sure a driver-specific init is properly freed
> > > at drm_bridge_put time. This was done so far mostly at bridge remove
> > > time, but we obviously can't do that anymore.
> > > 
> > > But we'd have the same issue if, say, we needed to remove a workqueue
> > > from a driver.
> > > 
> > > I think we need a destroy() hook for bridges, just like we have for
> > > connectors for example that would deal with calling
> > > drm_panel_bridge_remove() if necessary, or any other driver-specific
> > > sequence.  
> > 
> > The .destroy hook looked appealing at first, however as I tried to
> > apply the idea to bridges I'm not sure it matches. Here's why.
> > 
> > The normal (and sane) flow for a bridge is:
> > 
> >  A) probe
> >     1. allocate private struct embedding struct drm_bridge
> >        (I have an _alloc() variant ready for v5 to improve this as you proposed)
> >     2. get resources, initialize struct fields
> >     3. drm_bridge_add(): publish bridge into global bridge_list
> > 
> > Now the bridge can be found and pointers taken and used.  
> 
> We agree so far.

Good :-)

> > And on hardware removal, in reverse order:
> >  
> >  B) remove (hardware is hot-unplugged)
> >     3. unpublish bridge
> >     2. release resources, cleanup
> >     1. kfree private struct  
> 
> I think the sequence would rather be something like:
> 
> B') remove
>   3. unpublish bridge
>   2. release device resources
>   1. release reference
> 
> C') last put
>   2. release KMS resources
>   1. kfree private struct

Just to ensure we are on the same page: patch 3 is already implementing
this model except for C'2.

Well, in reality it even implements a .destroy callback at C'2, even
though it was not meant for the usage you have in mind and it's
scheduled for removal in v6 -- even though as I said I'm OK in
re-adding it if it is useful.

Mainly I'm not sure I understand for which ultimate goal you propose to
postpone releasing KMS resources to C'.

Is it (1) because we _want_ to postpone releasing KMS resources? In this
case I don't know the use case, so if you have a practical example it
would probably help a lot.

Moreover, for the panel bridge specifically, it would mean postponing
the destruction of the struct panel_bridge, which however has a pointer
to the panel. But the panel is probably hot-unplugged at the same time
as the previous removable bridge(s), we'd have a time window between B'
and C' where there is a pointer to a freed struct panel. We'd need to
ensure that pointer is cleared at B'2, even though it is a "KMS
resource" and not a "device resource".

Or is it (2) because there are cases where we don't know how else we
could release the KMS resources? AFAIK all bridge drivers are able to
release everything in their remove function (B'2) with the exception of
the panel bridge, so this sounds like a workaround for just one user
that apparently we all agree should be improved on its own anyway.

Note I'm not strongly against (2), if it simplifies the path towards
dynamic bridge lifetime by postponing the panel bridge rework. I just
need to understand the plan.

Another question is what is a device resource and what is a KMS
resource. What's the boolean expression to classify a
resource in one or the other family? For example, in your example
quoted above ("But we'd have the same issue if, say, we needed to
remove a workqueue from a driver"), is the workqueue a KMS resource?

I need to understand your idea if I want to implement it.

> > Some drivers do real stuff in B2, so it is important that B3 happens
> > before B2, isn't it? We don't want other drivers to find and use a
> > bridge that is being dismantled, or afterwards.  
> 
> Yeah, B3/B'3 should definitely happen first.
> 
> > B3 should normally happen by removing the bridge from the global
> > bridge_list, or other bridges might find it. However setting the "gone"
> > bool and teaching of_drm_find_bridge() & Co to skip bridges with
> > gone==true would allow to postpone the actual removal, if needed.
> > 
> > With that said, with hotplugging there will be two distinct events:
> > 
> >  * hardware removal
> >  * last ref is put
> > 
> > The second event could happen way later than the first one. During the
> > time frame between the two events we need the bridge to be unpublished
> > and the bridge resources to be already released, as the hardware is
> > gone. We cannot do this at the last put, it's too late.
> > 
> > So I think the only sane sequence is:
> > 
> >  * on hardware removal:
> >      B3) unpublish bridge (drm_bridge_remove() or just set gone flag)
> >      B2) free resources, deinit whatever needed
> >  * when last ref is put
> >      B1) kfree (likely via devm)  
> 
> No, devm will have destroyed it in B'2. We need to destroy it in the
> cleanup hook of kref_put

devm will have destroyed what? Sorry I'm not following.

If you mean "it" == "the private struct", then no, this is not the
case. drm_bridge_init in patch 3 does not kfree the private struct but
instead registers a devm action to call drm_bridge_put. Then, at the
last put, drm_bridge_free() will actually kfree the private struct.

In this v5, kree()ing the private struct at the last put is done via
a callback. In my work towards v6 the principle is the same but I have
reworked it all, implementing a devm_drm_bridge_alloc() macro as you
suggested (BTW that was a great improvement, thanks) and removing the
.destroy callback as it was not needed.

In case it helps, here's a preview of my v6, with some added comments to
support this discussion:

/* Internal function (for refcounted bridges) */
void __drm_bridge_free(struct kref *kref)
{
        struct drm_bridge *bridge = container_of(kref, struct drm_bridge, refcount);
        void *container = ((void*)bridge) - bridge->container_offset;

        DRM_DEBUG("bridge=%p, container=%p FREE\n", bridge, container);

        kfree(container);
}
EXPORT_SYMBOL(__drm_bridge_free);

static inline void drm_bridge_put(struct drm_bridge *bridge)
{
        if (!drm_bridge_is_refcounted(bridge))
                return;

        DRM_DEBUG("bridge=%p PUT\n", bridge);

        kref_put(&bridge->refcount, __drm_bridge_free);
}

static void drm_bridge_put_void(void *data)
{
        struct drm_bridge *bridge = (struct drm_bridge *)data;

        drm_bridge_put(bridge);
}

// fold this into __devm_drm_bridge_alloc() or keep for consistency
// with drm_encoder.c?
static int __devm_drm_bridge_init(struct device *dev, struct drm_bridge *bridge,
                                  size_t offset, const struct drm_bridge_funcs *funcs)
{
        int err;

        bridge->container_offset = offset;
        kref_init(&bridge->refcount);
        bridge->is_refcounted = 1;

        err = devm_add_action_or_reset(dev, drm_bridge_put_void, bridge); // <== devm just puts one ref, does not kfree
        if (err)
                return err;

        bridge->funcs = funcs;

        return 0;
}

void *__devm_drm_bridge_alloc(struct device *dev, size_t size, size_t offset,
                              const struct drm_bridge_funcs *funcs)
{
        void *container;
        struct drm_bridge *bridge;
        int ret;

        if (!funcs) {
                dev_warn(dev, "Missing funcs pointer\n");
                return ERR_PTR(-EINVAL);
        }

        container = kzalloc(size, GFP_KERNEL);     // <== NOT allocating with devm
        if (!container)
                return ERR_PTR(-ENOMEM);

        bridge = container + offset;

        ret = __devm_drm_bridge_init(dev, bridge, offset, funcs);
        if (ret)
                return ERR_PTR(ret);

        DRM_DEBUG("bridge=%p, container=%p, funcs=%ps ALLOC\n", bridge, container, funcs);

        return container;
}
EXPORT_SYMBOL(__devm_drm_bridge_alloc);

#define devm_drm_bridge_alloc(dev, type, member, funcs) \
        ((type *)__devm_drm_bridge_alloc(dev, sizeof(type), \
                                         offsetof(type, member), funcs))

Luca
Maxime Ripard Feb. 4, 2025, 3:44 p.m. UTC | #14
Hi Luca,

On Wed, Jan 29, 2025 at 12:51:46PM +0100, Luca Ceresoli wrote:
> > > > > For hotplugging we cannot use drmm and devm and instead we use get/put,
> > > > > to let the "next bridge" disappear with the previous one still present.
> > > > > So the trivial idea is to add a drm_of_get_bridge(), similar to
> > > > > {drmm,devm_drm}_of_get_bridge() except it uses plain
> > > > > drm_panel_bridge_add() instead of devm/drmm variants. But then the
> > > > > caller (which is the panel consumer) will have to dispose of the struct
> > > > > drm_bridge pointer by calling:
> > > > > 
> > > > >  - drm_bridge_put() in case a)
> > > > >  - drm_panel_bridge_remove in case b)
> > > > > 
> > > > > And that's the problem I need to solve.    
> > > > 
> > > > I'm not sure the problem is limited to panel_bridge. Your question is
> > > > essentially: how do I make sure a driver-specific init is properly freed
> > > > at drm_bridge_put time. This was done so far mostly at bridge remove
> > > > time, but we obviously can't do that anymore.
> > > > 
> > > > But we'd have the same issue if, say, we needed to remove a workqueue
> > > > from a driver.
> > > > 
> > > > I think we need a destroy() hook for bridges, just like we have for
> > > > connectors for example that would deal with calling
> > > > drm_panel_bridge_remove() if necessary, or any other driver-specific
> > > > sequence.  
> > > 
> > > The .destroy hook looked appealing at first, however as I tried to
> > > apply the idea to bridges I'm not sure it matches. Here's why.
> > > 
> > > The normal (and sane) flow for a bridge is:
> > > 
> > >  A) probe
> > >     1. allocate private struct embedding struct drm_bridge
> > >        (I have an _alloc() variant ready for v5 to improve this as you proposed)
> > >     2. get resources, initialize struct fields
> > >     3. drm_bridge_add(): publish bridge into global bridge_list
> > > 
> > > Now the bridge can be found and pointers taken and used.  
> > 
> > We agree so far.
> 
> Good :-)
> 
> > > And on hardware removal, in reverse order:
> > >  
> > >  B) remove (hardware is hot-unplugged)
> > >     3. unpublish bridge
> > >     2. release resources, cleanup
> > >     1. kfree private struct  
> > 
> > I think the sequence would rather be something like:
> > 
> > B') remove
> >   3. unpublish bridge
> >   2. release device resources
> >   1. release reference
> > 
> > C') last put
> >   2. release KMS resources
> >   1. kfree private struct
> 
> Just to ensure we are on the same page: patch 3 is already implementing
> this model except for C'2.
> 
> Well, in reality it even implements a .destroy callback at C'2, even
> though it was not meant for the usage you have in mind and it's
> scheduled for removal in v6 -- even though as I said I'm OK in
> re-adding it if it is useful.
> 
> Mainly I'm not sure I understand for which ultimate goal you propose to
> postpone releasing KMS resources to C'.
> 
> Is it (1) because we _want_ to postpone releasing KMS resources? In this
> case I don't know the use case, so if you have a practical example it
> would probably help a lot.

It's not that we want it, it's that it's already happening :)

The main DRM device is only torn down not when its devices goes away,
but when the last application closes its fd to the device file. Thus,
there's a significant window (possibly infinitely long) between the
device being removed and the DRM device being freed, during which the
application will still be able to issue ioctl that might reach the
driver.

Thus, we have two kind of resources: the ones tied to the device
(clocks, register mappings, etc.) that will go away when the device is
removed, and the ones tied to the DRM device (connectors, bridges, etc.)
that need to stick until the DRM device is free'd. It's the difference
between devm and drmm actions.

> Moreover, for the panel bridge specifically, it would mean postponing
> the destruction of the struct panel_bridge, which however has a pointer
> to the panel. But the panel is probably hot-unplugged at the same time
> as the previous removable bridge(s), we'd have a time window between B'
> and C' where there is a pointer to a freed struct panel. We'd need to
> ensure that pointer is cleared at B'2, even though it is a "KMS
> resource" and not a "device resource".

You're correct, but we have to start somewhere. Fixing the issue for
bridges only will already fix it for all setups using only bridges, even
if the ones using panels are still broken.

I'm also mentoring someone at the moment to fix this for panels, so it's
only temporary.

> Or is it (2) because there are cases where we don't know how else we
> could release the KMS resources? AFAIK all bridge drivers are able to
> release everything in their remove function (B'2) with the exception of
> the panel bridge, so this sounds like a workaround for just one user
> that apparently we all agree should be improved on its own anyway.
> 
> Note I'm not strongly against (2), if it simplifies the path towards
> dynamic bridge lifetime by postponing the panel bridge rework. I just
> need to understand the plan.
> 
> Another question is what is a device resource and what is a KMS
> resource. What's the boolean expression to classify a
> resource in one or the other family? For example, in your example
> quoted above ("But we'd have the same issue if, say, we needed to
> remove a workqueue from a driver"), is the workqueue a KMS resource?

It depends on what the workqueue is doing. If it's to handle atomic
commits like the writeback code, then it's KMS facing. If it's to handle
interrupts, it's device facing.

It's hard to come up with a boolean classification, but it's basically
"can any ioctl code path end up using that resource?".

> I need to understand your idea if I want to implement it.
> 
> > > Some drivers do real stuff in B2, so it is important that B3 happens
> > > before B2, isn't it? We don't want other drivers to find and use a
> > > bridge that is being dismantled, or afterwards.  
> > 
> > Yeah, B3/B'3 should definitely happen first.
> > 
> > > B3 should normally happen by removing the bridge from the global
> > > bridge_list, or other bridges might find it. However setting the "gone"
> > > bool and teaching of_drm_find_bridge() & Co to skip bridges with
> > > gone==true would allow to postpone the actual removal, if needed.
> > > 
> > > With that said, with hotplugging there will be two distinct events:
> > > 
> > >  * hardware removal
> > >  * last ref is put
> > > 
> > > The second event could happen way later than the first one. During the
> > > time frame between the two events we need the bridge to be unpublished
> > > and the bridge resources to be already released, as the hardware is
> > > gone. We cannot do this at the last put, it's too late.
> > > 
> > > So I think the only sane sequence is:
> > > 
> > >  * on hardware removal:
> > >      B3) unpublish bridge (drm_bridge_remove() or just set gone flag)
> > >      B2) free resources, deinit whatever needed
> > >  * when last ref is put
> > >      B1) kfree (likely via devm)  
> > 
> > No, devm will have destroyed it in B'2. We need to destroy it in the
> > cleanup hook of kref_put
> 
> devm will have destroyed what? Sorry I'm not following.
>
> If you mean "it" == "the private struct", then no, this is not the
> case. drm_bridge_init in patch 3 does not kfree the private struct but
> instead registers a devm action to call drm_bridge_put. Then, at the
> last put, drm_bridge_free() will actually kfree the private struct.
> 
> In this v5, kree()ing the private struct at the last put is done via
> a callback. In my work towards v6 the principle is the same but I have
> reworked it all, implementing a devm_drm_bridge_alloc() macro as you
> suggested (BTW that was a great improvement, thanks) and removing the
> .destroy callback as it was not needed.
> 
> In case it helps, here's a preview of my v6, with some added comments to
> support this discussion:
> 
> /* Internal function (for refcounted bridges) */
> void __drm_bridge_free(struct kref *kref)
> {
>         struct drm_bridge *bridge = container_of(kref, struct drm_bridge, refcount);
>         void *container = ((void*)bridge) - bridge->container_offset;
> 
>         DRM_DEBUG("bridge=%p, container=%p FREE\n", bridge, container);
> 
>         kfree(container);
> }
> EXPORT_SYMBOL(__drm_bridge_free);
> 
> static inline void drm_bridge_put(struct drm_bridge *bridge)
> {
>         if (!drm_bridge_is_refcounted(bridge))
>                 return;
> 
>         DRM_DEBUG("bridge=%p PUT\n", bridge);
> 
>         kref_put(&bridge->refcount, __drm_bridge_free);
> }
> 
> static void drm_bridge_put_void(void *data)
> {
>         struct drm_bridge *bridge = (struct drm_bridge *)data;
> 
>         drm_bridge_put(bridge);
> }
> 
> // fold this into __devm_drm_bridge_alloc() or keep for consistency
> // with drm_encoder.c?
> static int __devm_drm_bridge_init(struct device *dev, struct drm_bridge *bridge,
>                                   size_t offset, const struct drm_bridge_funcs *funcs)
> {
>         int err;
> 
>         bridge->container_offset = offset;
>         kref_init(&bridge->refcount);
>         bridge->is_refcounted = 1;
> 
>         err = devm_add_action_or_reset(dev, drm_bridge_put_void, bridge); // <== devm just puts one ref, does not kfree
>         if (err)
>                 return err;
> 
>         bridge->funcs = funcs;
> 
>         return 0;
> }
> 
> void *__devm_drm_bridge_alloc(struct device *dev, size_t size, size_t offset,
>                               const struct drm_bridge_funcs *funcs)
> {
>         void *container;
>         struct drm_bridge *bridge;
>         int ret;
> 
>         if (!funcs) {
>                 dev_warn(dev, "Missing funcs pointer\n");
>                 return ERR_PTR(-EINVAL);
>         }
> 
>         container = kzalloc(size, GFP_KERNEL);     // <== NOT allocating with devm
>         if (!container)
>                 return ERR_PTR(-ENOMEM);
> 
>         bridge = container + offset;
> 
>         ret = __devm_drm_bridge_init(dev, bridge, offset, funcs);
>         if (ret)
>                 return ERR_PTR(ret);
> 
>         DRM_DEBUG("bridge=%p, container=%p, funcs=%ps ALLOC\n", bridge, container, funcs);
> 
>         return container;
> }
> EXPORT_SYMBOL(__devm_drm_bridge_alloc);

Awesome, I guess we were actually understanding each other the whole
time then :)

I'm still kind of sure we'll require a destroy callback to call in
__drm_bridge_free, but if it works, I guess it's good enough for now.

Maxime
diff mbox series

Patch

diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
index f8b4fb8357659018ec0db65374ee5d05330639ae..c4d1563fd32019efde523dfc0863be044c05a826 100644
--- a/drivers/gpu/drm/bridge/samsung-dsim.c
+++ b/drivers/gpu/drm/bridge/samsung-dsim.c
@@ -1705,6 +1705,7 @@  static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
 	struct device *dev = dsi->dev;
 	struct device_node *np = dev->of_node;
 	struct device_node *remote;
+	struct drm_bridge *out_bridge;
 	struct drm_panel *panel;
 	int ret;
 
@@ -1740,21 +1741,23 @@  static int samsung_dsim_host_attach(struct mipi_dsi_host *host,
 
 	panel = of_drm_find_panel(remote);
 	if (!IS_ERR(panel)) {
-		dsi->out_bridge = devm_drm_panel_bridge_add(dev, panel);
+		out_bridge = devm_drm_panel_bridge_add(dev, panel);
 	} else {
-		dsi->out_bridge = of_drm_find_bridge(remote);
-		if (!dsi->out_bridge)
-			dsi->out_bridge = ERR_PTR(-EINVAL);
+		out_bridge = of_drm_find_bridge(remote);
+		if (!out_bridge)
+			out_bridge = ERR_PTR(-EINVAL);
 	}
 
 	of_node_put(remote);
 
-	if (IS_ERR(dsi->out_bridge)) {
-		ret = PTR_ERR(dsi->out_bridge);
+	if (IS_ERR(out_bridge)) {
+		ret = PTR_ERR(out_bridge);
 		DRM_DEV_ERROR(dev, "failed to find the bridge: %d\n", ret);
 		return ret;
 	}
 
+	dsi->out_bridge = out_bridge;
+
 	DRM_DEV_INFO(dev, "Attached %s device (lanes:%d bpp:%d mode-flags:0x%lx)\n",
 		     device->name, device->lanes,
 		     mipi_dsi_pixel_format_to_bpp(device->format),