diff mbox series

[v1] clk: Convert managed get functions to devm_add_action API

Message ID 3d8a58bf-0814-1ec1-038a-10a20b9646ad@free.fr (mailing list archive)
State New, archived
Headers show
Series [v1] clk: Convert managed get functions to devm_add_action API | expand

Commit Message

Marc Gonzalez Nov. 26, 2019, 4:13 p.m. UTC
Date: Tue, 26 Nov 2019 13:56:53 +0100

Using devm_add_action_or_reset() produces simpler code and smaller
object size:

1 file changed, 16 insertions(+), 46 deletions(-)

    text	   data	    bss	    dec	    hex	filename
-   1797	     80	      0	   1877	    755	drivers/clk/clk-devres.o
+   1499	     56	      0	   1555	    613	drivers/clk/clk-devres.o

Signed-off-by: Marc Gonzalez <marc.w.gonzalez@free.fr>
---
 drivers/clk/clk-devres.c | 62 +++++++++++-----------------------------
 1 file changed, 16 insertions(+), 46 deletions(-)

Comments

Bjorn Andersson Nov. 28, 2019, 6:56 p.m. UTC | #1
On Tue 26 Nov 08:13 PST 2019, Marc Gonzalez wrote:

> Date: Tue, 26 Nov 2019 13:56:53 +0100
> 
> Using devm_add_action_or_reset() produces simpler code and smaller
> object size:
> 
> 1 file changed, 16 insertions(+), 46 deletions(-)
> 
>     text	   data	    bss	    dec	    hex	filename
> -   1797	     80	      0	   1877	    755	drivers/clk/clk-devres.o
> +   1499	     56	      0	   1555	    613	drivers/clk/clk-devres.o
> 
> Signed-off-by: Marc Gonzalez <marc.w.gonzalez@free.fr>

Looks neat

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>

Regards,
Bjorn

> ---
>  drivers/clk/clk-devres.c | 62 +++++++++++-----------------------------
>  1 file changed, 16 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
> index be160764911b..04379c1f203e 100644
> --- a/drivers/clk/clk-devres.c
> +++ b/drivers/clk/clk-devres.c
> @@ -4,31 +4,29 @@
>  #include <linux/export.h>
>  #include <linux/gfp.h>
>  
> -static void devm_clk_release(struct device *dev, void *res)
> +static void __clk_put(void *clk)
>  {
> -	clk_put(*(struct clk **)res);
> +	clk_put(clk);
>  }
>  
>  struct clk *devm_clk_get(struct device *dev, const char *id)
>  {
> -	struct clk **ptr, *clk;
> +	struct clk *clk = clk_get(dev, id);
>  
> -	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
> -	if (!ptr)
> -		return ERR_PTR(-ENOMEM);
> -
> -	clk = clk_get(dev, id);
> -	if (!IS_ERR(clk)) {
> -		*ptr = clk;
> -		devres_add(dev, ptr);
> -	} else {
> -		devres_free(ptr);
> -	}
> +	if (!IS_ERR(clk))
> +		if (devm_add_action_or_reset(dev, __clk_put, clk))
> +			clk = ERR_PTR(-ENOMEM);
>  
>  	return clk;
>  }
>  EXPORT_SYMBOL(devm_clk_get);
>  
> +void devm_clk_put(struct device *dev, struct clk *clk)
> +{
> +	devm_release_action(dev, __clk_put, clk);
> +}
> +EXPORT_SYMBOL(devm_clk_put);
> +
>  struct clk *devm_clk_get_optional(struct device *dev, const char *id)
>  {
>  	struct clk *clk = devm_clk_get(dev, id);
> @@ -116,42 +114,14 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
>  }
>  EXPORT_SYMBOL_GPL(devm_clk_bulk_get_all);
>  
> -static int devm_clk_match(struct device *dev, void *res, void *data)
> -{
> -	struct clk **c = res;
> -	if (!c || !*c) {
> -		WARN_ON(!c || !*c);
> -		return 0;
> -	}
> -	return *c == data;
> -}
> -
> -void devm_clk_put(struct device *dev, struct clk *clk)
> -{
> -	int ret;
> -
> -	ret = devres_release(dev, devm_clk_release, devm_clk_match, clk);
> -
> -	WARN_ON(ret);
> -}
> -EXPORT_SYMBOL(devm_clk_put);
> -
>  struct clk *devm_get_clk_from_child(struct device *dev,
>  				    struct device_node *np, const char *con_id)
>  {
> -	struct clk **ptr, *clk;
> -
> -	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
> -	if (!ptr)
> -		return ERR_PTR(-ENOMEM);
> +	struct clk *clk = of_clk_get_by_name(np, con_id);
>  
> -	clk = of_clk_get_by_name(np, con_id);
> -	if (!IS_ERR(clk)) {
> -		*ptr = clk;
> -		devres_add(dev, ptr);
> -	} else {
> -		devres_free(ptr);
> -	}
> +	if (!IS_ERR(clk))
> +		if (devm_add_action_or_reset(dev, __clk_put, clk))
> +			clk = ERR_PTR(-ENOMEM);
>  
>  	return clk;
>  }
> -- 
> 2.17.1
Dmitry Torokhov Dec. 2, 2019, 1:42 a.m. UTC | #2
On Thu, Nov 28, 2019 at 10:56:30AM -0800, Bjorn Andersson wrote:
> On Tue 26 Nov 08:13 PST 2019, Marc Gonzalez wrote:
> 
> > Date: Tue, 26 Nov 2019 13:56:53 +0100
> > 
> > Using devm_add_action_or_reset() produces simpler code and smaller
> > object size:
> > 
> > 1 file changed, 16 insertions(+), 46 deletions(-)
> > 
> >     text	   data	    bss	    dec	    hex	filename
> > -   1797	     80	      0	   1877	    755	drivers/clk/clk-devres.o
> > +   1499	     56	      0	   1555	    613	drivers/clk/clk-devres.o
> > 
> > Signed-off-by: Marc Gonzalez <marc.w.gonzalez@free.fr>
> 
> Looks neat
> 
> Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>

This however increases the runtime costs as each custom action cost us
an extra pointer. Given that in a system we likely have many clocks
managed by devres, I am not sure that this code savings is actually
gives us overall win. It might still, I just want to understand how we
are allocating/packing devres structures.

Thanks.
Marc Gonzalez Dec. 2, 2019, 9:25 a.m. UTC | #3
On 02/12/2019 02:42, Dmitry Torokhov wrote:

> On Thu, Nov 28, 2019 at 10:56:30AM -0800, Bjorn Andersson wrote:
> 
>> On Tue 26 Nov 08:13 PST 2019, Marc Gonzalez wrote:
>>
>>> Date: Tue, 26 Nov 2019 13:56:53 +0100
>>>
>>> Using devm_add_action_or_reset() produces simpler code and smaller
>>> object size:
>>>
>>> 1 file changed, 16 insertions(+), 46 deletions(-)
>>>
>>>     text	   data	    bss	    dec	    hex	filename
>>> -   1797	     80	      0	   1877	    755	drivers/clk/clk-devres.o
>>> +   1499	     56	      0	   1555	    613	drivers/clk/clk-devres.o
>>>
>>> Signed-off-by: Marc Gonzalez <marc.w.gonzalez@free.fr>
>>
>> Looks neat
>>
>> Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
> 
> This however increases the runtime costs as each custom action cost us
> an extra pointer. Given that in a system we likely have many clocks
> managed by devres, I am not sure that this code savings is actually
> gives us overall win. It might still, I just want to understand how we
> are allocating/packing devres structures.

I'm not 100% sure what you are saying.

Are you arguing that the proposed patch increases the run-time cost of
devm_clk_put() so much that the listed improvements (simpler source code,
smaller object size) are not worth it?

AFAIU, the release action is only called
- explicitly, when devm_clk_put() is called
- implicitly, when the device is removed

How often are clocks removed?

In hot code-path (called hundreds of times per second) it makes sense to
write more complex code, to shave a few cycles every iteration. But in
cold code-path, I think it's better to write short/simple code.

Regards.
Robin Murphy Dec. 2, 2019, 1:51 p.m. UTC | #4
On 02/12/2019 9:25 am, Marc Gonzalez wrote:
> On 02/12/2019 02:42, Dmitry Torokhov wrote:
> 
>> On Thu, Nov 28, 2019 at 10:56:30AM -0800, Bjorn Andersson wrote:
>>
>>> On Tue 26 Nov 08:13 PST 2019, Marc Gonzalez wrote:
>>>
>>>> Date: Tue, 26 Nov 2019 13:56:53 +0100
>>>>
>>>> Using devm_add_action_or_reset() produces simpler code and smaller
>>>> object size:
>>>>
>>>> 1 file changed, 16 insertions(+), 46 deletions(-)
>>>>
>>>>      text	   data	    bss	    dec	    hex	filename
>>>> -   1797	     80	      0	   1877	    755	drivers/clk/clk-devres.o
>>>> +   1499	     56	      0	   1555	    613	drivers/clk/clk-devres.o
>>>>
>>>> Signed-off-by: Marc Gonzalez <marc.w.gonzalez@free.fr>
>>>
>>> Looks neat
>>>
>>> Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
>>
>> This however increases the runtime costs as each custom action cost us
>> an extra pointer. Given that in a system we likely have many clocks
>> managed by devres, I am not sure that this code savings is actually
>> gives us overall win. It might still, I just want to understand how we
>> are allocating/packing devres structures.
> 
> I'm not 100% sure what you are saying.

You reduce the text size by a constant amount, at the cost of allocating 
twice as much runtime data per clock (struct action_devres  vs. void*). 
Assuming 64-bit pointers, that means that in principle your ~320-byte 
saving would be cancelled out at ~40 managed clocks. However, that's 
also assuming that the minimum allocation granularity is no larger than 
a single pointer, which generally isn't true, so in reality it depends 
on whether the difference in data pushes the total struct devres 
allocation over the next ARCH_KMALLOC_MINALIGN boundary - if it doesn't, 
the difference comes entirely for free; if it does, the memory cost 
tradeoff gets even worse.

Robin.

> Are you arguing that the proposed patch increases the run-time cost of
> devm_clk_put() so much that the listed improvements (simpler source code,
> smaller object size) are not worth it?
> 
> AFAIU, the release action is only called
> - explicitly, when devm_clk_put() is called
> - implicitly, when the device is removed
> 
> How often are clocks removed?
> 
> In hot code-path (called hundreds of times per second) it makes sense to
> write more complex code, to shave a few cycles every iteration. But in
> cold code-path, I think it's better to write short/simple code.
> 
> Regards.
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Marc Gonzalez Dec. 11, 2019, 4:17 p.m. UTC | #5
On 02/12/2019 14:51, Robin Murphy wrote:

> On 02/12/2019 9:25 am, Marc Gonzalez wrote:
>
>> On 02/12/2019 02:42, Dmitry Torokhov wrote:
>>
>>> On Thu, Nov 28, 2019 at 10:56:30AM -0800, Bjorn Andersson wrote:
>>>
>>>> On Tue 26 Nov 08:13 PST 2019, Marc Gonzalez wrote:
>>>>
>>>>> Date: Tue, 26 Nov 2019 13:56:53 +0100
>>>>>
>>>>> Using devm_add_action_or_reset() produces simpler code and smaller
>>>>> object size:
>>>>>
>>>>> 1 file changed, 16 insertions(+), 46 deletions(-)
>>>>>
>>>>>      text	   data	    bss	    dec	    hex	filename
>>>>> -   1797	     80	      0	   1877	    755	drivers/clk/clk-devres.o
>>>>> +   1499	     56	      0	   1555	    613	drivers/clk/clk-devres.o
>>>>>
>>>>> Signed-off-by: Marc Gonzalez <marc.w.gonzalez@free.fr>
>>>>
>>>> Looks neat
>>>>
>>>> Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
>>>
>>> This however increases the runtime costs as each custom action cost us
>>> an extra pointer. Given that in a system we likely have many clocks
>>> managed by devres, I am not sure that this code savings is actually
>>> gives us overall win. It might still, I just want to understand how we
>>> are allocating/packing devres structures.
>>
>> I'm not 100% sure what you are saying.
> 
> You reduce the text size by a constant amount, at the cost of allocating 
> twice as much runtime data per clock (struct action_devres  vs. void*). 
> Assuming 64-bit pointers, that means that in principle your ~320-byte 
> saving would be cancelled out at ~40 managed clocks. However, that's 
> also assuming that the minimum allocation granularity is no larger than 
> a single pointer, which generally isn't true, so in reality it depends 
> on whether the difference in data pushes the total struct devres 
> allocation over the next ARCH_KMALLOC_MINALIGN boundary - if it doesn't, 
> the difference comes entirely for free; if it does, the memory cost 
> tradeoff gets even worse.

Aaah... memory overhead. Thanks for pointing it out.

BEFORE

devm_clk_get()
  -> devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
     allocates space for a struct devres + a pointer

struct devres {
	struct devres_node		node;
	/*
	 * Some archs want to perform DMA into kmalloc caches
	 * and need a guaranteed alignment larger than
	 * the alignment of a 64-bit integer.
	 * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same
	 * buffer alignment as if it was allocated by plain kmalloc().
	 */
	u8 __aligned(ARCH_KMALLOC_MINALIGN) data[];
};

Not sure what it means for a flexible array member to be X-aligned...

(Since the field's address depends on the start address, which is only
determined at run-time...)

For example, on arm64, ARCH_KMALLOC_MINALIGN appears to be 128 (sometimes).

/*
 * Memory returned by kmalloc() may be used for DMA, so we must make
 * sure that all such allocations are cache aligned. Otherwise,
 * unrelated code may cause parts of the buffer to be read into the
 * cache before the transfer is done, causing old data to be seen by
 * the CPU.
 */
#define ARCH_DMA_MINALIGN	(128)


Unless the strict alignment is also imposed on kmalloc?

So basically, a struct devres starts on a multiple-of-128 address,
first the devres_node member, then padding to the next 128, then the
data member?


/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
 */
#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
#else
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif


A devres_node boils down to 2 object pointers + 1 function pointer.

Are there architectures supported by Linux where a function pointer
is not the same size as an object pointer? (ia64 maybe?)



OK, I will give this patch some more thought.

But I need to ask: what is the rationale for the devm_add_action API?

Regards.
Dmitry Torokhov Dec. 11, 2019, 10:28 p.m. UTC | #6
On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
> But I need to ask: what is the rationale for the devm_add_action API?

For one-off and maybe complex unwind actions in drivers that wish to use
devm API (as mixing devm and manual release is verboten). Also is often
used when some core subsystem does not provide enough devm APIs.

Thanks.
Marc Gonzalez Dec. 12, 2019, 1:53 p.m. UTC | #7
On 11/12/2019 23:28, Dmitry Torokhov wrote:

> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
>
>> What is the rationale for the devm_add_action API?
> 
> For one-off and maybe complex unwind actions in drivers that wish to use
> devm API (as mixing devm and manual release is verboten). Also is often
> used when some core subsystem does not provide enough devm APIs.

Thanks for the insight, Dmitry. Thanks to Robin too.

This is what I understand so far:

devm_add_action() is nice because it hides/factorizes the complexity
of the devres API, but it incurs a small storage overhead of one
pointer per call, which makes it unfit for frequently used actions,
such as clk_get.

Is that correct?

My question is: why not design the API without the small overhead?

Proof of concept below:


diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index 0bbb328bd17f..76392dd6273b 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -685,6 +685,20 @@ int devres_release_group(struct device *dev, void *id)
 }
 EXPORT_SYMBOL_GPL(devres_release_group);
 
+void *devm_add(struct device *dev, dr_release_t func, void *arg, size_t size)
+{
+	void *data = devres_alloc(func, size, GFP_KERNEL);
+
+	if (data) {
+		memcpy(data, arg, size);
+		devres_add(dev, data);
+	} else
+		func(dev, arg);
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(devm_add);
+
 /*
  * Custom devres actions allow inserting a simple function call
  * into the teadown sequence.
diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index be160764911b..8db671823126 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -4,6 +4,11 @@
 #include <linux/export.h>
 #include <linux/gfp.h>
 
+static void __clk_put(struct device *dev, void *data)
+{
+	clk_put(*(struct clk **)data);
+}
+
 static void devm_clk_release(struct device *dev, void *res)
 {
 	clk_put(*(struct clk **)res);
@@ -11,19 +16,11 @@ static void devm_clk_release(struct device *dev, void *res)
 
 struct clk *devm_clk_get(struct device *dev, const char *id)
 {
-	struct clk **ptr, *clk;
-
-	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
-	if (!ptr)
-		return ERR_PTR(-ENOMEM);
+	struct clk *clk = clk_get(dev, id);
 
-	clk = clk_get(dev, id);
-	if (!IS_ERR(clk)) {
-		*ptr = clk;
-		devres_add(dev, ptr);
-	} else {
-		devres_free(ptr);
-	}
+	if (!IS_ERR(clk))
+		if (!devm_add(dev, __clk_put, &clk, sizeof(clk)))
+			clk = ERR_PTR(-ENOMEM);
 
 	return clk;
 }
diff --git a/include/linux/device.h b/include/linux/device.h
index e226030c1df3..5acb61ec39ab 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -970,6 +970,7 @@ void __iomem *devm_of_iomap(struct device *dev,
 			    resource_size_t *size);
 
 /* allows to add/remove a custom action to devres stack */
+void *devm_add(struct device *dev, dr_release_t func, void *arg, size_t size);
 int devm_add_action(struct device *dev, void (*action)(void *), void *data);
 void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
 void devm_release_action(struct device *dev, void (*action)(void *), void *data);
Russell King (Oracle) Dec. 12, 2019, 2:17 p.m. UTC | #8
On Thu, Dec 12, 2019 at 02:53:40PM +0100, Marc Gonzalez wrote:
> On 11/12/2019 23:28, Dmitry Torokhov wrote:
> 
> > On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
> >
> >> What is the rationale for the devm_add_action API?
> > 
> > For one-off and maybe complex unwind actions in drivers that wish to use
> > devm API (as mixing devm and manual release is verboten). Also is often
> > used when some core subsystem does not provide enough devm APIs.
> 
> Thanks for the insight, Dmitry. Thanks to Robin too.
> 
> This is what I understand so far:
> 
> devm_add_action() is nice because it hides/factorizes the complexity
> of the devres API, but it incurs a small storage overhead of one
> pointer per call, which makes it unfit for frequently used actions,
> such as clk_get.
> 
> Is that correct?
> 
> My question is: why not design the API without the small overhead?
> 
> Proof of concept below:
> 
> 
> diff --git a/drivers/base/devres.c b/drivers/base/devres.c
> index 0bbb328bd17f..76392dd6273b 100644
> --- a/drivers/base/devres.c
> +++ b/drivers/base/devres.c
> @@ -685,6 +685,20 @@ int devres_release_group(struct device *dev, void *id)
>  }
>  EXPORT_SYMBOL_GPL(devres_release_group);
>  
> +void *devm_add(struct device *dev, dr_release_t func, void *arg, size_t size)
> +{
> +	void *data = devres_alloc(func, size, GFP_KERNEL);
> +
> +	if (data) {
> +		memcpy(data, arg, size);
> +		devres_add(dev, data);
> +	} else
> +		func(dev, arg);
> +
> +	return data;
> +}
> +EXPORT_SYMBOL_GPL(devm_add);
> +
>  /*
>   * Custom devres actions allow inserting a simple function call
>   * into the teadown sequence.
> diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
> index be160764911b..8db671823126 100644
> --- a/drivers/clk/clk-devres.c
> +++ b/drivers/clk/clk-devres.c
> @@ -4,6 +4,11 @@
>  #include <linux/export.h>
>  #include <linux/gfp.h>
>  
> +static void __clk_put(struct device *dev, void *data)
> +{
> +	clk_put(*(struct clk **)data);
> +}
> +
>  static void devm_clk_release(struct device *dev, void *res)
>  {
>  	clk_put(*(struct clk **)res);
> @@ -11,19 +16,11 @@ static void devm_clk_release(struct device *dev, void *res)
>  
>  struct clk *devm_clk_get(struct device *dev, const char *id)
>  {
> -	struct clk **ptr, *clk;
> -
> -	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
> -	if (!ptr)
> -		return ERR_PTR(-ENOMEM);
> +	struct clk *clk = clk_get(dev, id);
>  
> -	clk = clk_get(dev, id);
> -	if (!IS_ERR(clk)) {
> -		*ptr = clk;
> -		devres_add(dev, ptr);
> -	} else {
> -		devres_free(ptr);
> -	}
> +	if (!IS_ERR(clk))
> +		if (!devm_add(dev, __clk_put, &clk, sizeof(clk)))
> +			clk = ERR_PTR(-ENOMEM);

You leak clk here.
Marc Gonzalez Dec. 12, 2019, 2:41 p.m. UTC | #9
On 12/12/2019 15:17, Russell King - ARM Linux admin wrote:

> On Thu, Dec 12, 2019 at 02:53:40PM +0100, Marc Gonzalez wrote:
>
>> On 11/12/2019 23:28, Dmitry Torokhov wrote:
>>
>>> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
>>>
>>>> What is the rationale for the devm_add_action API?
>>>
>>> For one-off and maybe complex unwind actions in drivers that wish to use
>>> devm API (as mixing devm and manual release is verboten). Also is often
>>> used when some core subsystem does not provide enough devm APIs.
>>
>> Thanks for the insight, Dmitry. Thanks to Robin too.
>>
>> This is what I understand so far:
>>
>> devm_add_action() is nice because it hides/factorizes the complexity
>> of the devres API, but it incurs a small storage overhead of one
>> pointer per call, which makes it unfit for frequently used actions,
>> such as clk_get.
>>
>> Is that correct?
>>
>> My question is: why not design the API without the small overhead?
>>
>> Proof of concept below:
>>
>>
>> diff --git a/drivers/base/devres.c b/drivers/base/devres.c
>> index 0bbb328bd17f..76392dd6273b 100644
>> --- a/drivers/base/devres.c
>> +++ b/drivers/base/devres.c
>> @@ -685,6 +685,20 @@ int devres_release_group(struct device *dev, void *id)
>>  }
>>  EXPORT_SYMBOL_GPL(devres_release_group);
>>  
>> +void *devm_add(struct device *dev, dr_release_t func, void *arg, size_t size)
>> +{
>> +	void *data = devres_alloc(func, size, GFP_KERNEL);
>> +
>> +	if (data) {
>> +		memcpy(data, arg, size);
>> +		devres_add(dev, data);
>> +	} else
>> +		func(dev, arg);
>> +
>> +	return data;
>> +}
>> +EXPORT_SYMBOL_GPL(devm_add);
>> +
>>  /*
>>   * Custom devres actions allow inserting a simple function call
>>   * into the teadown sequence.
>> diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
>> index be160764911b..8db671823126 100644
>> --- a/drivers/clk/clk-devres.c
>> +++ b/drivers/clk/clk-devres.c
>> @@ -4,6 +4,11 @@
>>  #include <linux/export.h>
>>  #include <linux/gfp.h>
>>  
>> +static void __clk_put(struct device *dev, void *data)
>> +{
>> +	clk_put(*(struct clk **)data);
>> +}
>> +
>>  static void devm_clk_release(struct device *dev, void *res)
>>  {
>>  	clk_put(*(struct clk **)res);
>> @@ -11,19 +16,11 @@ static void devm_clk_release(struct device *dev, void *res)
>>  
>>  struct clk *devm_clk_get(struct device *dev, const char *id)
>>  {
>> -	struct clk **ptr, *clk;
>> -
>> -	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
>> -	if (!ptr)
>> -		return ERR_PTR(-ENOMEM);
>> +	struct clk *clk = clk_get(dev, id);
>>  
>> -	clk = clk_get(dev, id);
>> -	if (!IS_ERR(clk)) {
>> -		*ptr = clk;
>> -		devres_add(dev, ptr);
>> -	} else {
>> -		devres_free(ptr);
>> -	}
>> +	if (!IS_ERR(clk))
>> +		if (!devm_add(dev, __clk_put, &clk, sizeof(clk)))
>> +			clk = ERR_PTR(-ENOMEM);
> 
> You leak clk here.

I don't think so ;-)

If devm_add() returns NULL, then we have called __clk_put(dev, &clk);

Regards.
Russell King (Oracle) Dec. 12, 2019, 2:46 p.m. UTC | #10
On Thu, Dec 12, 2019 at 03:41:20PM +0100, Marc Gonzalez wrote:
> On 12/12/2019 15:17, Russell King - ARM Linux admin wrote:
> 
> > On Thu, Dec 12, 2019 at 02:53:40PM +0100, Marc Gonzalez wrote:
> >
> >> On 11/12/2019 23:28, Dmitry Torokhov wrote:
> >>
> >>> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
> >>>
> >>>> What is the rationale for the devm_add_action API?
> >>>
> >>> For one-off and maybe complex unwind actions in drivers that wish to use
> >>> devm API (as mixing devm and manual release is verboten). Also is often
> >>> used when some core subsystem does not provide enough devm APIs.
> >>
> >> Thanks for the insight, Dmitry. Thanks to Robin too.
> >>
> >> This is what I understand so far:
> >>
> >> devm_add_action() is nice because it hides/factorizes the complexity
> >> of the devres API, but it incurs a small storage overhead of one
> >> pointer per call, which makes it unfit for frequently used actions,
> >> such as clk_get.
> >>
> >> Is that correct?
> >>
> >> My question is: why not design the API without the small overhead?
> >>
> >> Proof of concept below:
> >>
> >>
> >> diff --git a/drivers/base/devres.c b/drivers/base/devres.c
> >> index 0bbb328bd17f..76392dd6273b 100644
> >> --- a/drivers/base/devres.c
> >> +++ b/drivers/base/devres.c
> >> @@ -685,6 +685,20 @@ int devres_release_group(struct device *dev, void *id)
> >>  }
> >>  EXPORT_SYMBOL_GPL(devres_release_group);
> >>  
> >> +void *devm_add(struct device *dev, dr_release_t func, void *arg, size_t size)
> >> +{
> >> +	void *data = devres_alloc(func, size, GFP_KERNEL);
> >> +
> >> +	if (data) {
> >> +		memcpy(data, arg, size);
> >> +		devres_add(dev, data);
> >> +	} else
> >> +		func(dev, arg);
> >> +
> >> +	return data;
> >> +}
> >> +EXPORT_SYMBOL_GPL(devm_add);
> >> +
> >>  /*
> >>   * Custom devres actions allow inserting a simple function call
> >>   * into the teadown sequence.
> >> diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
> >> index be160764911b..8db671823126 100644
> >> --- a/drivers/clk/clk-devres.c
> >> +++ b/drivers/clk/clk-devres.c
> >> @@ -4,6 +4,11 @@
> >>  #include <linux/export.h>
> >>  #include <linux/gfp.h>
> >>  
> >> +static void __clk_put(struct device *dev, void *data)
> >> +{
> >> +	clk_put(*(struct clk **)data);
> >> +}
> >> +
> >>  static void devm_clk_release(struct device *dev, void *res)
> >>  {
> >>  	clk_put(*(struct clk **)res);
> >> @@ -11,19 +16,11 @@ static void devm_clk_release(struct device *dev, void *res)
> >>  
> >>  struct clk *devm_clk_get(struct device *dev, const char *id)
> >>  {
> >> -	struct clk **ptr, *clk;
> >> -
> >> -	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
> >> -	if (!ptr)
> >> -		return ERR_PTR(-ENOMEM);
> >> +	struct clk *clk = clk_get(dev, id);
> >>  
> >> -	clk = clk_get(dev, id);
> >> -	if (!IS_ERR(clk)) {
> >> -		*ptr = clk;
> >> -		devres_add(dev, ptr);
> >> -	} else {
> >> -		devres_free(ptr);
> >> -	}
> >> +	if (!IS_ERR(clk))
> >> +		if (!devm_add(dev, __clk_put, &clk, sizeof(clk)))
> >> +			clk = ERR_PTR(-ENOMEM);
> > 
> > You leak clk here.
> 
> I don't think so ;-)
> 
> If devm_add() returns NULL, then we have called __clk_put(dev, &clk);

Okay.

However, please don't call this __clk_put().  git grep __clk_put will
tell you why.  Thanks.
Robin Murphy Dec. 12, 2019, 2:47 p.m. UTC | #11
On 12/12/2019 1:53 pm, Marc Gonzalez wrote:
> On 11/12/2019 23:28, Dmitry Torokhov wrote:
> 
>> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
>>
>>> What is the rationale for the devm_add_action API?
>>
>> For one-off and maybe complex unwind actions in drivers that wish to use
>> devm API (as mixing devm and manual release is verboten). Also is often
>> used when some core subsystem does not provide enough devm APIs.
> 
> Thanks for the insight, Dmitry. Thanks to Robin too.
> 
> This is what I understand so far:
> 
> devm_add_action() is nice because it hides/factorizes the complexity
> of the devres API, but it incurs a small storage overhead of one
> pointer per call, which makes it unfit for frequently used actions,
> such as clk_get.
> 
> Is that correct?
> 
> My question is: why not design the API without the small overhead?

Probably because on most architectures, ARCH_KMALLOC_MINALIGN is at 
least as big as two pointers anyway, so this "overhead" should mostly be 
free in practice. Plus the devres API is almost entirely about being 
able to write simple robust code, rather than absolute efficiency - I 
mean, struct devres itself is already 5 pointers large at the absolute 
minimum ;)

In summary: the email client in which I'm writing this is currently 
using 2.3GB of my workstation's 64GB of RAM; welcome to 21st century 
software... :P

Robin.

> Proof of concept below:
> 
> 
> diff --git a/drivers/base/devres.c b/drivers/base/devres.c
> index 0bbb328bd17f..76392dd6273b 100644
> --- a/drivers/base/devres.c
> +++ b/drivers/base/devres.c
> @@ -685,6 +685,20 @@ int devres_release_group(struct device *dev, void *id)
>   }
>   EXPORT_SYMBOL_GPL(devres_release_group);
>   
> +void *devm_add(struct device *dev, dr_release_t func, void *arg, size_t size)
> +{
> +	void *data = devres_alloc(func, size, GFP_KERNEL);
> +
> +	if (data) {
> +		memcpy(data, arg, size);
> +		devres_add(dev, data);
> +	} else
> +		func(dev, arg);
> +
> +	return data;
> +}
> +EXPORT_SYMBOL_GPL(devm_add);
> +
>   /*
>    * Custom devres actions allow inserting a simple function call
>    * into the teadown sequence.
> diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
> index be160764911b..8db671823126 100644
> --- a/drivers/clk/clk-devres.c
> +++ b/drivers/clk/clk-devres.c
> @@ -4,6 +4,11 @@
>   #include <linux/export.h>
>   #include <linux/gfp.h>
>   
> +static void __clk_put(struct device *dev, void *data)
> +{
> +	clk_put(*(struct clk **)data);
> +}
> +
>   static void devm_clk_release(struct device *dev, void *res)
>   {
>   	clk_put(*(struct clk **)res);
> @@ -11,19 +16,11 @@ static void devm_clk_release(struct device *dev, void *res)
>   
>   struct clk *devm_clk_get(struct device *dev, const char *id)
>   {
> -	struct clk **ptr, *clk;
> -
> -	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
> -	if (!ptr)
> -		return ERR_PTR(-ENOMEM);
> +	struct clk *clk = clk_get(dev, id);
>   
> -	clk = clk_get(dev, id);
> -	if (!IS_ERR(clk)) {
> -		*ptr = clk;
> -		devres_add(dev, ptr);
> -	} else {
> -		devres_free(ptr);
> -	}
> +	if (!IS_ERR(clk))
> +		if (!devm_add(dev, __clk_put, &clk, sizeof(clk)))
> +			clk = ERR_PTR(-ENOMEM);
>   
>   	return clk;
>   }
> diff --git a/include/linux/device.h b/include/linux/device.h
> index e226030c1df3..5acb61ec39ab 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -970,6 +970,7 @@ void __iomem *devm_of_iomap(struct device *dev,
>   			    resource_size_t *size);
>   
>   /* allows to add/remove a custom action to devres stack */
> +void *devm_add(struct device *dev, dr_release_t func, void *arg, size_t size);
>   int devm_add_action(struct device *dev, void (*action)(void *), void *data);
>   void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
>   void devm_release_action(struct device *dev, void (*action)(void *), void *data);
>
Marc Gonzalez Dec. 12, 2019, 3:51 p.m. UTC | #12
On 12/12/2019 15:46, Russell King - ARM Linux admin wrote:

> However, please don't call this __clk_put().
> git grep __clk_put will tell you why.  Thanks.

$ git grep __clk_put
drivers/clk/clk-devres.c:static void __clk_put(struct device *dev, void *data)
drivers/clk/clk-devres.c:               if (!devm_add(dev, __clk_put, &clk, sizeof(clk)))
drivers/clk/clk.c:void __clk_put(struct clk *clk)
drivers/clk/clk.h:void __clk_put(struct clk *clk);
drivers/clk/clk.h:static inline void __clk_put(struct clk *clk) { }
drivers/clk/clkdev.c:   __clk_put(clk);

I see. I will s/__clk_put/my_clk_put/ in my proposal.

Out of curiosity...

$ git grep __clk_put v2.6.29-rc1
v2.6.29-rc1:arch/arm/common/clkdev.c:   __clk_put(clk);
v2.6.29-rc1:arch/arm/mach-ep93xx/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)
v2.6.29-rc1:arch/arm/mach-integrator/include/mach/clkdev.h:static inline void __clk_put(struct clk *clk)
v2.6.29-rc1:arch/arm/mach-pxa/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)
v2.6.29-rc1:arch/arm/mach-realview/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)
v2.6.29-rc1:arch/arm/mach-versatile/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)

Genesis seems to be 0318e693d3a56

The clkdev API expected platforms to export a __clk_put method?

Regards.
Russell King (Oracle) Dec. 12, 2019, 4:13 p.m. UTC | #13
On Thu, Dec 12, 2019 at 04:51:25PM +0100, Marc Gonzalez wrote:
> On 12/12/2019 15:46, Russell King - ARM Linux admin wrote:
> 
> > However, please don't call this __clk_put().
> > git grep __clk_put will tell you why.  Thanks.
> 
> $ git grep __clk_put
> drivers/clk/clk-devres.c:static void __clk_put(struct device *dev, void *data)
> drivers/clk/clk-devres.c:               if (!devm_add(dev, __clk_put, &clk, sizeof(clk)))
> drivers/clk/clk.c:void __clk_put(struct clk *clk)
> drivers/clk/clk.h:void __clk_put(struct clk *clk);
> drivers/clk/clk.h:static inline void __clk_put(struct clk *clk) { }
> drivers/clk/clkdev.c:   __clk_put(clk);
> 
> I see. I will s/__clk_put/my_clk_put/ in my proposal.
> 
> Out of curiosity...
> 
> $ git grep __clk_put v2.6.29-rc1
> v2.6.29-rc1:arch/arm/common/clkdev.c:   __clk_put(clk);
> v2.6.29-rc1:arch/arm/mach-ep93xx/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)
> v2.6.29-rc1:arch/arm/mach-integrator/include/mach/clkdev.h:static inline void __clk_put(struct clk *clk)
> v2.6.29-rc1:arch/arm/mach-pxa/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)
> v2.6.29-rc1:arch/arm/mach-realview/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)
> v2.6.29-rc1:arch/arm/mach-versatile/include/mach/clkdev.h:#define __clk_put(clk) do { } while (0)
> 
> Genesis seems to be 0318e693d3a56
> 
> The clkdev API expected platforms to export a __clk_put method?

Along with __clk_get(), these were the interfaces from the cross-
platform clkdev code to the clk API implementation specific code.
__clk_get() no longer exists as its uses were eliminated, but
__clk_put() remains.

It's quite logical if you read the patch which your above commit ID
references.
Marc Gonzalez Dec. 12, 2019, 4:59 p.m. UTC | #14
On 12/12/2019 15:47, Robin Murphy wrote:

> On 12/12/2019 1:53 pm, Marc Gonzalez wrote:
>
>> On 11/12/2019 23:28, Dmitry Torokhov wrote:
>>
>>> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
>>>
>>>> What is the rationale for the devm_add_action API?
>>>
>>> For one-off and maybe complex unwind actions in drivers that wish to use
>>> devm API (as mixing devm and manual release is verboten). Also is often
>>> used when some core subsystem does not provide enough devm APIs.
>>
>> Thanks for the insight, Dmitry. Thanks to Robin too.
>>
>> This is what I understand so far:
>>
>> devm_add_action() is nice because it hides/factorizes the complexity
>> of the devres API, but it incurs a small storage overhead of one
>> pointer per call, which makes it unfit for frequently used actions,
>> such as clk_get.
>>
>> Is that correct?
>>
>> My question is: why not design the API without the small overhead?
> 
> Probably because on most architectures, ARCH_KMALLOC_MINALIGN is at 
> least as big as two pointers anyway, so this "overhead" should mostly be 
> free in practice. Plus the devres API is almost entirely about being 
> able to write simple robust code, rather than absolute efficiency - I 
> mean, struct devres itself is already 5 pointers large at the absolute 
> minimum ;)

(3 pointers: 1 list_head + 1 function pointer)

I'm confused. The first patch was criticized for potentially adding
an extra pointer for every devm_clk_get (e.g. 800 bytes on a 64-bit
platform with 100 clocks).

Let's see. On arm64, ARCH_KMALLOC_MINALIGN is 128.

So basically, a struct devres looks like this on arm64:

	list_head.next
	list_head.prev
	dr_release_t
		.
		.
		.
	104 bytes of padding
		.
		.
		.
	data (flexible array)
		.
		.
		.
	padding up to 256 bytes


Basically, on arm64, every struct devres occupies 256 bytes, most of it
(typically 104 + 112 = 216) wasted as padding.

Hmmm, given how many devm stuff goes on in a modern platform, there
might be large savings to be had...

Assuming 10,000 calls to devres_alloc_node(), we would be wasting ~2 MB
of RAM. Not sure it's worth trying to save that?

$ git grep '#define ARCH_DMA_MINALIGN'
arch/arc/include/asm/cache.h:#define ARCH_DMA_MINALIGN  SMP_CACHE_BYTES
arch/arm/include/asm/cache.h:#define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
arch/arm64/include/asm/cache.h:#define ARCH_DMA_MINALIGN        (128)
arch/c6x/include/asm/cache.h:#define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
arch/csky/include/asm/cache.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
arch/hexagon/include/asm/cache.h:#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES
arch/m68k/include/asm/cache.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
arch/microblaze/include/asm/page.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
arch/mips/include/asm/mach-generic/kmalloc.h:#define ARCH_DMA_MINALIGN  128
arch/mips/include/asm/mach-ip32/kmalloc.h:#define ARCH_DMA_MINALIGN     32
arch/mips/include/asm/mach-ip32/kmalloc.h:#define ARCH_DMA_MINALIGN     128
arch/mips/include/asm/mach-tx49xx/kmalloc.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
arch/nds32/include/asm/cache.h:#define ARCH_DMA_MINALIGN   L1_CACHE_BYTES
arch/nios2/include/asm/cache.h:#define ARCH_DMA_MINALIGN        L1_CACHE_BYTES
arch/parisc/include/asm/cache.h:#define ARCH_DMA_MINALIGN       L1_CACHE_BYTES
arch/powerpc/include/asm/page_32.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
arch/sh/include/asm/page.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
arch/unicore32/include/asm/cache.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
arch/xtensa/include/asm/cache.h:#define ARCH_DMA_MINALIGN       L1_CACHE_BYTES

Hmmm, how does arch/x86 do it?

Regards.
Russell King (Oracle) Dec. 12, 2019, 5:05 p.m. UTC | #15
On Thu, Dec 12, 2019 at 05:59:04PM +0100, Marc Gonzalez wrote:
> On 12/12/2019 15:47, Robin Murphy wrote:
> 
> > On 12/12/2019 1:53 pm, Marc Gonzalez wrote:
> >
> >> On 11/12/2019 23:28, Dmitry Torokhov wrote:
> >>
> >>> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
> >>>
> >>>> What is the rationale for the devm_add_action API?
> >>>
> >>> For one-off and maybe complex unwind actions in drivers that wish to use
> >>> devm API (as mixing devm and manual release is verboten). Also is often
> >>> used when some core subsystem does not provide enough devm APIs.
> >>
> >> Thanks for the insight, Dmitry. Thanks to Robin too.
> >>
> >> This is what I understand so far:
> >>
> >> devm_add_action() is nice because it hides/factorizes the complexity
> >> of the devres API, but it incurs a small storage overhead of one
> >> pointer per call, which makes it unfit for frequently used actions,
> >> such as clk_get.
> >>
> >> Is that correct?
> >>
> >> My question is: why not design the API without the small overhead?
> > 
> > Probably because on most architectures, ARCH_KMALLOC_MINALIGN is at 
> > least as big as two pointers anyway, so this "overhead" should mostly be 
> > free in practice. Plus the devres API is almost entirely about being 
> > able to write simple robust code, rather than absolute efficiency - I 
> > mean, struct devres itself is already 5 pointers large at the absolute 
> > minimum ;)
> 
> (3 pointers: 1 list_head + 1 function pointer)
> 
> I'm confused. The first patch was criticized for potentially adding
> an extra pointer for every devm_clk_get (e.g. 800 bytes on a 64-bit
> platform with 100 clocks).
> 
> Let's see. On arm64, ARCH_KMALLOC_MINALIGN is 128.
> 
> So basically, a struct devres looks like this on arm64:
> 
> 	list_head.next
> 	list_head.prev
> 	dr_release_t
> 		.
> 		.
> 		.
> 	104 bytes of padding
> 		.
> 		.
> 		.
> 	data (flexible array)
> 		.
> 		.
> 		.
> 	padding up to 256 bytes
> 
> 
> Basically, on arm64, every struct devres occupies 256 bytes, most of it
> (typically 104 + 112 = 216) wasted as padding.
> 
> Hmmm, given how many devm stuff goes on in a modern platform, there
> might be large savings to be had...
> 
> Assuming 10,000 calls to devres_alloc_node(), we would be wasting ~2 MB
> of RAM. Not sure it's worth trying to save that?
> 
> $ git grep '#define ARCH_DMA_MINALIGN'
> arch/arc/include/asm/cache.h:#define ARCH_DMA_MINALIGN  SMP_CACHE_BYTES
> arch/arm/include/asm/cache.h:#define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
> arch/arm64/include/asm/cache.h:#define ARCH_DMA_MINALIGN        (128)
> arch/c6x/include/asm/cache.h:#define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
> arch/csky/include/asm/cache.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
> arch/hexagon/include/asm/cache.h:#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES
> arch/m68k/include/asm/cache.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
> arch/microblaze/include/asm/page.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/mips/include/asm/mach-generic/kmalloc.h:#define ARCH_DMA_MINALIGN  128
> arch/mips/include/asm/mach-ip32/kmalloc.h:#define ARCH_DMA_MINALIGN     32
> arch/mips/include/asm/mach-ip32/kmalloc.h:#define ARCH_DMA_MINALIGN     128
> arch/mips/include/asm/mach-tx49xx/kmalloc.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
> arch/nds32/include/asm/cache.h:#define ARCH_DMA_MINALIGN   L1_CACHE_BYTES
> arch/nios2/include/asm/cache.h:#define ARCH_DMA_MINALIGN        L1_CACHE_BYTES
> arch/parisc/include/asm/cache.h:#define ARCH_DMA_MINALIGN       L1_CACHE_BYTES
> arch/powerpc/include/asm/page_32.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/sh/include/asm/page.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/unicore32/include/asm/cache.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/xtensa/include/asm/cache.h:#define ARCH_DMA_MINALIGN       L1_CACHE_BYTES
> 
> Hmmm, how does arch/x86 do it?

As I understand it, x86 tends to be fully coherent, so has no there
is not much requirement for DMA to be aligned to cachelines.
Robin Murphy Dec. 12, 2019, 6:15 p.m. UTC | #16
On 12/12/2019 4:59 pm, Marc Gonzalez wrote:
> On 12/12/2019 15:47, Robin Murphy wrote:
> 
>> On 12/12/2019 1:53 pm, Marc Gonzalez wrote:
>>
>>> On 11/12/2019 23:28, Dmitry Torokhov wrote:
>>>
>>>> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
>>>>
>>>>> What is the rationale for the devm_add_action API?
>>>>
>>>> For one-off and maybe complex unwind actions in drivers that wish to use
>>>> devm API (as mixing devm and manual release is verboten). Also is often
>>>> used when some core subsystem does not provide enough devm APIs.
>>>
>>> Thanks for the insight, Dmitry. Thanks to Robin too.
>>>
>>> This is what I understand so far:
>>>
>>> devm_add_action() is nice because it hides/factorizes the complexity
>>> of the devres API, but it incurs a small storage overhead of one
>>> pointer per call, which makes it unfit for frequently used actions,
>>> such as clk_get.
>>>
>>> Is that correct?
>>>
>>> My question is: why not design the API without the small overhead?
>>
>> Probably because on most architectures, ARCH_KMALLOC_MINALIGN is at
>> least as big as two pointers anyway, so this "overhead" should mostly be
>> free in practice. Plus the devres API is almost entirely about being
>> able to write simple robust code, rather than absolute efficiency - I
>> mean, struct devres itself is already 5 pointers large at the absolute
>> minimum ;)
> 
> (3 pointers: 1 list_head + 1 function pointer)

Ah yes, I failed to mentally preprocess the debug config :)

> I'm confused. The first patch was criticized for potentially adding
> an extra pointer for every devm_clk_get (e.g. 800 bytes on a 64-bit
> platform with 100 clocks).

I'm not sure it was a criticism so much as an observation of an aspect 
that deserved consideration (certainly it was on my part, and I read 
Dmitry's "It might still, ..." as implying the same). I'd say by this 
point it has been thoroughly considered, and personally I'm now happy 
with the conclusion that the kind of embedded platforms that will have 
many dozens of clocks are also the kind that will tend to have enough 
padding to make it moot, and thus the code simplification probably is 
worthwhile overall.

Robin.

> Let's see. On arm64, ARCH_KMALLOC_MINALIGN is 128.
> 
> So basically, a struct devres looks like this on arm64:
> 
> 	list_head.next
> 	list_head.prev
> 	dr_release_t
> 		.
> 		.
> 		.
> 	104 bytes of padding
> 		.
> 		.
> 		.
> 	data (flexible array)
> 		.
> 		.
> 		.
> 	padding up to 256 bytes
> 
> 
> Basically, on arm64, every struct devres occupies 256 bytes, most of it
> (typically 104 + 112 = 216) wasted as padding.
> 
> Hmmm, given how many devm stuff goes on in a modern platform, there
> might be large savings to be had...
> 
> Assuming 10,000 calls to devres_alloc_node(), we would be wasting ~2 MB
> of RAM. Not sure it's worth trying to save that?
> 
> $ git grep '#define ARCH_DMA_MINALIGN'
> arch/arc/include/asm/cache.h:#define ARCH_DMA_MINALIGN  SMP_CACHE_BYTES
> arch/arm/include/asm/cache.h:#define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
> arch/arm64/include/asm/cache.h:#define ARCH_DMA_MINALIGN        (128)
> arch/c6x/include/asm/cache.h:#define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
> arch/csky/include/asm/cache.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
> arch/hexagon/include/asm/cache.h:#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES
> arch/m68k/include/asm/cache.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
> arch/microblaze/include/asm/page.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/mips/include/asm/mach-generic/kmalloc.h:#define ARCH_DMA_MINALIGN  128
> arch/mips/include/asm/mach-ip32/kmalloc.h:#define ARCH_DMA_MINALIGN     32
> arch/mips/include/asm/mach-ip32/kmalloc.h:#define ARCH_DMA_MINALIGN     128
> arch/mips/include/asm/mach-tx49xx/kmalloc.h:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
> arch/nds32/include/asm/cache.h:#define ARCH_DMA_MINALIGN   L1_CACHE_BYTES
> arch/nios2/include/asm/cache.h:#define ARCH_DMA_MINALIGN        L1_CACHE_BYTES
> arch/parisc/include/asm/cache.h:#define ARCH_DMA_MINALIGN       L1_CACHE_BYTES
> arch/powerpc/include/asm/page_32.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/sh/include/asm/page.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/unicore32/include/asm/cache.h:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
> arch/xtensa/include/asm/cache.h:#define ARCH_DMA_MINALIGN       L1_CACHE_BYTES
> 
> Hmmm, how does arch/x86 do it?
> 
> Regards.
>
Dmitry Torokhov Dec. 12, 2019, 7:10 p.m. UTC | #17
On Thu, Dec 12, 2019 at 06:15:16PM +0000, Robin Murphy wrote:
> On 12/12/2019 4:59 pm, Marc Gonzalez wrote:
> > On 12/12/2019 15:47, Robin Murphy wrote:
> > 
> > > On 12/12/2019 1:53 pm, Marc Gonzalez wrote:
> > > 
> > > > On 11/12/2019 23:28, Dmitry Torokhov wrote:
> > > > 
> > > > > On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
> > > > > 
> > > > > > What is the rationale for the devm_add_action API?
> > > > > 
> > > > > For one-off and maybe complex unwind actions in drivers that wish to use
> > > > > devm API (as mixing devm and manual release is verboten). Also is often
> > > > > used when some core subsystem does not provide enough devm APIs.
> > > > 
> > > > Thanks for the insight, Dmitry. Thanks to Robin too.
> > > > 
> > > > This is what I understand so far:
> > > > 
> > > > devm_add_action() is nice because it hides/factorizes the complexity
> > > > of the devres API, but it incurs a small storage overhead of one
> > > > pointer per call, which makes it unfit for frequently used actions,
> > > > such as clk_get.
> > > > 
> > > > Is that correct?
> > > > 
> > > > My question is: why not design the API without the small overhead?
> > > 
> > > Probably because on most architectures, ARCH_KMALLOC_MINALIGN is at
> > > least as big as two pointers anyway, so this "overhead" should mostly be
> > > free in practice. Plus the devres API is almost entirely about being
> > > able to write simple robust code, rather than absolute efficiency - I
> > > mean, struct devres itself is already 5 pointers large at the absolute
> > > minimum ;)
> > 
> > (3 pointers: 1 list_head + 1 function pointer)
> 
> Ah yes, I failed to mentally preprocess the debug config :)
> 
> > I'm confused. The first patch was criticized for potentially adding
> > an extra pointer for every devm_clk_get (e.g. 800 bytes on a 64-bit
> > platform with 100 clocks).
> 
> I'm not sure it was a criticism so much as an observation of an aspect that
> deserved consideration (certainly it was on my part, and I read Dmitry's "It
> might still, ..." as implying the same). I'd say by this point it has been
> thoroughly considered, and personally I'm now happy with the conclusion that
> the kind of embedded platforms that will have many dozens of clocks are also
> the kind that will tend to have enough padding to make it moot, and thus the
> code simplification probably is worthwhile overall.

I wonder if we could actually avoid allocating the data with
ARCH_KMALLOC_MINALIGN in all the cases. It is definitely needed for the
devm_k*alloc() group of functions as they are direct replacement for
k*alloc() APIs that give users aligned memory, but for other data
structures (clocks, regulators, etc, etc) it is not required.

Thanks.
Robin Murphy Dec. 12, 2019, 9:08 p.m. UTC | #18
On 2019-12-12 7:10 pm, Dmitry Torokhov wrote:
> On Thu, Dec 12, 2019 at 06:15:16PM +0000, Robin Murphy wrote:
>> On 12/12/2019 4:59 pm, Marc Gonzalez wrote:
>>> On 12/12/2019 15:47, Robin Murphy wrote:
>>>
>>>> On 12/12/2019 1:53 pm, Marc Gonzalez wrote:
>>>>
>>>>> On 11/12/2019 23:28, Dmitry Torokhov wrote:
>>>>>
>>>>>> On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
>>>>>>
>>>>>>> What is the rationale for the devm_add_action API?
>>>>>>
>>>>>> For one-off and maybe complex unwind actions in drivers that wish to use
>>>>>> devm API (as mixing devm and manual release is verboten). Also is often
>>>>>> used when some core subsystem does not provide enough devm APIs.
>>>>>
>>>>> Thanks for the insight, Dmitry. Thanks to Robin too.
>>>>>
>>>>> This is what I understand so far:
>>>>>
>>>>> devm_add_action() is nice because it hides/factorizes the complexity
>>>>> of the devres API, but it incurs a small storage overhead of one
>>>>> pointer per call, which makes it unfit for frequently used actions,
>>>>> such as clk_get.
>>>>>
>>>>> Is that correct?
>>>>>
>>>>> My question is: why not design the API without the small overhead?
>>>>
>>>> Probably because on most architectures, ARCH_KMALLOC_MINALIGN is at
>>>> least as big as two pointers anyway, so this "overhead" should mostly be
>>>> free in practice. Plus the devres API is almost entirely about being
>>>> able to write simple robust code, rather than absolute efficiency - I
>>>> mean, struct devres itself is already 5 pointers large at the absolute
>>>> minimum ;)
>>>
>>> (3 pointers: 1 list_head + 1 function pointer)
>>
>> Ah yes, I failed to mentally preprocess the debug config :)
>>
>>> I'm confused. The first patch was criticized for potentially adding
>>> an extra pointer for every devm_clk_get (e.g. 800 bytes on a 64-bit
>>> platform with 100 clocks).
>>
>> I'm not sure it was a criticism so much as an observation of an aspect that
>> deserved consideration (certainly it was on my part, and I read Dmitry's "It
>> might still, ..." as implying the same). I'd say by this point it has been
>> thoroughly considered, and personally I'm now happy with the conclusion that
>> the kind of embedded platforms that will have many dozens of clocks are also
>> the kind that will tend to have enough padding to make it moot, and thus the
>> code simplification probably is worthwhile overall.
> 
> I wonder if we could actually avoid allocating the data with
> ARCH_KMALLOC_MINALIGN in all the cases. It is definitely needed for the
> devm_k*alloc() group of functions as they are direct replacement for
> k*alloc() APIs that give users aligned memory, but for other data
> structures (clocks, regulators, etc, etc) it is not required.

That's a very good point - perhaps something like this (only done properly)?

Robin.

diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index 0bbb328bd17f..2382f963abbe 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -26,14 +26,7 @@ struct devres_node {

  struct devres {
         struct devres_node              node;
-       /*
-        * Some archs want to perform DMA into kmalloc caches
-        * and need a guaranteed alignment larger than
-        * the alignment of a 64-bit integer.
-        * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same
-        * buffer alignment as if it was allocated by plain kmalloc().
-        */
-       u8 __aligned(ARCH_KMALLOC_MINALIGN) data[];
+       u8                              data[];
  };

  struct devres_group {
@@ -810,6 +803,17 @@ static int devm_kmalloc_match(struct device *dev, 
void *res, void *data)
  void * devm_kmalloc(struct device *dev, size_t size, gfp_t gfp)
  {
         struct devres *dr;
+       size_t align;
+
+       /*
+        * Some archs want to perform DMA into kmalloc caches
+        * and need a guaranteed alignment larger than
+        * the alignment of a 64-bit integer.
+        * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same
+        * buffer alignment as if it was allocated by plain kmalloc().
+        */
+       align = (ARCH_KMALLOC_MINALIGN - sizeof(*dr)) % 
ARCH_KMALLOC_MINALIGN;
+       size += align;

         /* use raw alloc_dr for kmalloc caller tracing */
         dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev));
@@ -822,7 +826,7 @@ void * devm_kmalloc(struct device *dev, size_t size, 
gfp_t gfp)
          */
         set_node_dbginfo(&dr->node, "devm_kzalloc_release", size);
         devres_add(dev, dr->data);
-       return dr->data;
+       return dr->data + align;
  }
  EXPORT_SYMBOL_GPL(devm_kmalloc);
Dmitry Torokhov Dec. 13, 2019, 12:16 a.m. UTC | #19
On Thu, Dec 12, 2019 at 09:08:04PM +0000, Robin Murphy wrote:
> On 2019-12-12 7:10 pm, Dmitry Torokhov wrote:
> > On Thu, Dec 12, 2019 at 06:15:16PM +0000, Robin Murphy wrote:
> > > On 12/12/2019 4:59 pm, Marc Gonzalez wrote:
> > > > On 12/12/2019 15:47, Robin Murphy wrote:
> > > > 
> > > > > On 12/12/2019 1:53 pm, Marc Gonzalez wrote:
> > > > > 
> > > > > > On 11/12/2019 23:28, Dmitry Torokhov wrote:
> > > > > > 
> > > > > > > On Wed, Dec 11, 2019 at 05:17:28PM +0100, Marc Gonzalez wrote:
> > > > > > > 
> > > > > > > > What is the rationale for the devm_add_action API?
> > > > > > > 
> > > > > > > For one-off and maybe complex unwind actions in drivers that wish to use
> > > > > > > devm API (as mixing devm and manual release is verboten). Also is often
> > > > > > > used when some core subsystem does not provide enough devm APIs.
> > > > > > 
> > > > > > Thanks for the insight, Dmitry. Thanks to Robin too.
> > > > > > 
> > > > > > This is what I understand so far:
> > > > > > 
> > > > > > devm_add_action() is nice because it hides/factorizes the complexity
> > > > > > of the devres API, but it incurs a small storage overhead of one
> > > > > > pointer per call, which makes it unfit for frequently used actions,
> > > > > > such as clk_get.
> > > > > > 
> > > > > > Is that correct?
> > > > > > 
> > > > > > My question is: why not design the API without the small overhead?
> > > > > 
> > > > > Probably because on most architectures, ARCH_KMALLOC_MINALIGN is at
> > > > > least as big as two pointers anyway, so this "overhead" should mostly be
> > > > > free in practice. Plus the devres API is almost entirely about being
> > > > > able to write simple robust code, rather than absolute efficiency - I
> > > > > mean, struct devres itself is already 5 pointers large at the absolute
> > > > > minimum ;)
> > > > 
> > > > (3 pointers: 1 list_head + 1 function pointer)
> > > 
> > > Ah yes, I failed to mentally preprocess the debug config :)
> > > 
> > > > I'm confused. The first patch was criticized for potentially adding
> > > > an extra pointer for every devm_clk_get (e.g. 800 bytes on a 64-bit
> > > > platform with 100 clocks).
> > > 
> > > I'm not sure it was a criticism so much as an observation of an aspect that
> > > deserved consideration (certainly it was on my part, and I read Dmitry's "It
> > > might still, ..." as implying the same). I'd say by this point it has been
> > > thoroughly considered, and personally I'm now happy with the conclusion that
> > > the kind of embedded platforms that will have many dozens of clocks are also
> > > the kind that will tend to have enough padding to make it moot, and thus the
> > > code simplification probably is worthwhile overall.
> > 
> > I wonder if we could actually avoid allocating the data with
> > ARCH_KMALLOC_MINALIGN in all the cases. It is definitely needed for the
> > devm_k*alloc() group of functions as they are direct replacement for
> > k*alloc() APIs that give users aligned memory, but for other data
> > structures (clocks, regulators, etc, etc) it is not required.
> 
> That's a very good point - perhaps something like this (only done properly)?

Yes, but it has to be done carefully.

> 
> Robin.
> 
> diff --git a/drivers/base/devres.c b/drivers/base/devres.c
> index 0bbb328bd17f..2382f963abbe 100644
> --- a/drivers/base/devres.c
> +++ b/drivers/base/devres.c
> @@ -26,14 +26,7 @@ struct devres_node {
> 
>  struct devres {
>         struct devres_node              node;
> -       /*
> -        * Some archs want to perform DMA into kmalloc caches
> -        * and need a guaranteed alignment larger than
> -        * the alignment of a 64-bit integer.
> -        * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same
> -        * buffer alignment as if it was allocated by plain kmalloc().
> -        */
> -       u8 __aligned(ARCH_KMALLOC_MINALIGN) data[];
> +       u8                              data[];
>  };
> 
>  struct devres_group {
> @@ -810,6 +803,17 @@ static int devm_kmalloc_match(struct device *dev, void
> *res, void *data)
>  void * devm_kmalloc(struct device *dev, size_t size, gfp_t gfp)
>  {
>         struct devres *dr;
> +       size_t align;
> +
> +       /*
> +        * Some archs want to perform DMA into kmalloc caches
> +        * and need a guaranteed alignment larger than
> +        * the alignment of a 64-bit integer.
> +        * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same
> +        * buffer alignment as if it was allocated by plain kmalloc().
> +        */
> +       align = (ARCH_KMALLOC_MINALIGN - sizeof(*dr)) %
> ARCH_KMALLOC_MINALIGN;
> +       size += align;
> 
>         /* use raw alloc_dr for kmalloc caller tracing */
>         dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev));
> @@ -822,7 +826,7 @@ void * devm_kmalloc(struct device *dev, size_t size,
> gfp_t gfp)
>          */
>         set_node_dbginfo(&dr->node, "devm_kzalloc_release", size);
>         devres_add(dev, dr->data);

I think it has to be "devres_add(dev, dr->data + align);" here, as match
function checks the pointer passed to devm_kfree() with one stored in
devres structure.

> -       return dr->data;
> +       return dr->data + align;
>  }
>  EXPORT_SYMBOL_GPL(devm_kmalloc);

Thanks.
diff mbox series

Patch

diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index be160764911b..04379c1f203e 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -4,31 +4,29 @@ 
 #include <linux/export.h>
 #include <linux/gfp.h>
 
-static void devm_clk_release(struct device *dev, void *res)
+static void __clk_put(void *clk)
 {
-	clk_put(*(struct clk **)res);
+	clk_put(clk);
 }
 
 struct clk *devm_clk_get(struct device *dev, const char *id)
 {
-	struct clk **ptr, *clk;
+	struct clk *clk = clk_get(dev, id);
 
-	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
-	if (!ptr)
-		return ERR_PTR(-ENOMEM);
-
-	clk = clk_get(dev, id);
-	if (!IS_ERR(clk)) {
-		*ptr = clk;
-		devres_add(dev, ptr);
-	} else {
-		devres_free(ptr);
-	}
+	if (!IS_ERR(clk))
+		if (devm_add_action_or_reset(dev, __clk_put, clk))
+			clk = ERR_PTR(-ENOMEM);
 
 	return clk;
 }
 EXPORT_SYMBOL(devm_clk_get);
 
+void devm_clk_put(struct device *dev, struct clk *clk)
+{
+	devm_release_action(dev, __clk_put, clk);
+}
+EXPORT_SYMBOL(devm_clk_put);
+
 struct clk *devm_clk_get_optional(struct device *dev, const char *id)
 {
 	struct clk *clk = devm_clk_get(dev, id);
@@ -116,42 +114,14 @@  int __must_check devm_clk_bulk_get_all(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_clk_bulk_get_all);
 
-static int devm_clk_match(struct device *dev, void *res, void *data)
-{
-	struct clk **c = res;
-	if (!c || !*c) {
-		WARN_ON(!c || !*c);
-		return 0;
-	}
-	return *c == data;
-}
-
-void devm_clk_put(struct device *dev, struct clk *clk)
-{
-	int ret;
-
-	ret = devres_release(dev, devm_clk_release, devm_clk_match, clk);
-
-	WARN_ON(ret);
-}
-EXPORT_SYMBOL(devm_clk_put);
-
 struct clk *devm_get_clk_from_child(struct device *dev,
 				    struct device_node *np, const char *con_id)
 {
-	struct clk **ptr, *clk;
-
-	ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
-	if (!ptr)
-		return ERR_PTR(-ENOMEM);
+	struct clk *clk = of_clk_get_by_name(np, con_id);
 
-	clk = of_clk_get_by_name(np, con_id);
-	if (!IS_ERR(clk)) {
-		*ptr = clk;
-		devres_add(dev, ptr);
-	} else {
-		devres_free(ptr);
-	}
+	if (!IS_ERR(clk))
+		if (devm_add_action_or_reset(dev, __clk_put, clk))
+			clk = ERR_PTR(-ENOMEM);
 
 	return clk;
 }