diff mbox series

[v2,1/2] dmaengine: qcom: gpi: Add GPI immediate DMA support

Message ID 20241128133351.24593-2-quic_jseerapu@quicinc.com (mailing list archive)
State Changes Requested
Headers show
Series Add GPI immediate DMA support for SPI | expand

Commit Message

Jyothi Kumar Seerapu Nov. 28, 2024, 1:33 p.m. UTC
The DMA TRE (Transfer Ring Element) buffer contains the DMA
buffer address. Accessing data from this address can cause
significant delays in SPI transfers, which can be mitigated to
some extent by utilizing immediate DMA support.

QCOM GPI DMA hardware supports an immediate DMA feature for data
up to 8 bytes, storing the data directly in the DMA TRE buffer
instead of the DMA buffer address. This enhancement enables faster
SPI data transfers.

This optimization reduces the average transfer time from 25 us to
16 us for a single SPI transfer of 8 bytes length, with a clock
frequency of 50 MHz.

Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu@quicinc.com>
---
v1 -> v2:
   - Separated the patches to dmaengine and spi subsystems
   - Removed the changes which are not required for this feature from
     qcom-gpi-dma.h file.
   - Removed the type conversions used in gpi_create_spi_tre.

 drivers/dma/qcom/gpi.c           | 32 +++++++++++++++++++++++++++-----
 include/linux/dma/qcom-gpi-dma.h |  6 ++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

Comments

Dmitry Baryshkov Nov. 28, 2024, 2:07 p.m. UTC | #1
On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
> The DMA TRE(Transfer ring element) buffer contains the DMA
> buffer address. Accessing data from this address can cause
> significant delays in SPI transfers, which can be mitigated to
> some extent by utilizing immediate DMA support.
> 
> QCOM GPI DMA hardware supports an immediate DMA feature for data
> up to 8 bytes, storing the data directly in the DMA TRE buffer
> instead of the DMA buffer address. This enhancement enables faster
> SPI data transfers.
> 
> This optimization reduces the average transfer time from 25 us to
> 16 us for a single SPI transfer of 8 bytes length, with a clock
> frequency of 50 MHz.
> 
> Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu@quicinc.com>
> ---
> v1 -> v2:
>    - Separated the patches to dmaengine and spi subsystems
>    - Removed the changes which are not required for this feature from
>      qcom-gpi-dma.h file.
>    - Removed the type conversions used in gpi_create_spi_tre.
> 
>  drivers/dma/qcom/gpi.c           | 32 +++++++++++++++++++++++++++-----
>  include/linux/dma/qcom-gpi-dma.h |  6 ++++++
>  2 files changed, 33 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
> index 52a7c8f2498f..4c5df696ddd8 100644
> --- a/drivers/dma/qcom/gpi.c
> +++ b/drivers/dma/qcom/gpi.c
> @@ -27,6 +27,7 @@
>  #define TRE_FLAGS_IEOT		BIT(9)
>  #define TRE_FLAGS_BEI		BIT(10)
>  #define TRE_FLAGS_LINK		BIT(11)
> +#define TRE_FLAGS_IMMEDIATE_DMA	BIT(16)
>  #define TRE_FLAGS_TYPE		GENMASK(23, 16)
>  
>  /* SPI CONFIG0 WD0 */
> @@ -64,6 +65,7 @@
>  
>  /* DMA TRE */
>  #define TRE_DMA_LEN		GENMASK(23, 0)
> +#define TRE_DMA_IMMEDIATE_LEN	GENMASK(3, 0)
>  
>  /* Register offsets from gpi-top */
>  #define GPII_n_CH_k_CNTXT_0_OFFS(n, k)	(0x20000 + (0x4000 * (n)) + (0x80 * (k)))
> @@ -1711,6 +1713,8 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>  	dma_addr_t address;
>  	struct gpi_tre *tre;
>  	unsigned int i;
> +	u8 *buf;
> +	int len = 0;
>  
>  	/* first create config tre if applicable */
>  	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>  	tre_idx++;
>  
>  	address = sg_dma_address(sgl);
> -	tre->dword[0] = lower_32_bits(address);
> -	tre->dword[1] = upper_32_bits(address);
> +	len = sg_dma_len(sgl);
>  
> -	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
> +	/* Support Immediate dma for write transfers for data length up to 8 bytes */
> +	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {

Please defer applying the patch until the discussion on v1 comes to
conclusion.

> +		buf = sg_virt(sgl);
>  
> -	tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
> -	if (direction == DMA_MEM_TO_DEV)
> +		/* memcpy may not always be length of 8, hence pre-fill both dword's with 0 */
> +		tre->dword[0] = 0;
> +		tre->dword[1] = 0;
> +		memcpy(&tre->dword[0], buf, len);
> +
> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_IMMEDIATE_LEN);
> +
> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>  		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
> +		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IMMEDIATE_DMA);
> +	} else {
> +		tre->dword[0] = lower_32_bits(address);
> +		tre->dword[1] = upper_32_bits(address);
> +
> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_LEN);
> +
> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
> +		if (direction == DMA_MEM_TO_DEV)
> +			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
> +	}
>  
>  	for (i = 0; i < tre_idx; i++)
>  		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
> diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
> index 6680dd1a43c6..84598848d53a 100644
> --- a/include/linux/dma/qcom-gpi-dma.h
> +++ b/include/linux/dma/qcom-gpi-dma.h
> @@ -15,6 +15,10 @@ enum spi_transfer_cmd {
>  	SPI_DUPLEX,
>  };
>  
> +#define QCOM_GPI_IMMEDIATE_DMA		BIT(1)
> +
> +#define QCOM_GPI_IMMEDIATE_DMA_LEN	8
> +
>  /**
>   * struct gpi_spi_config - spi config for peripheral
>   *
> @@ -30,6 +34,7 @@ enum spi_transfer_cmd {
>   * @cs: chip select toggle
>   * @set_config: set peripheral config
>   * @rx_len: receive length for buffer
> + * @flags: true for immediate dma support
>   */
>  struct gpi_spi_config {
>  	u8 set_config;
> @@ -44,6 +49,7 @@ struct gpi_spi_config {
>  	u32 clk_src;
>  	enum spi_transfer_cmd cmd;
>  	u32 rx_len;
> +	u8 flags;
>  };
>  
>  enum i2c_op {
> -- 
> 2.17.1
>
Bjorn Andersson Nov. 28, 2024, 3:23 p.m. UTC | #2
On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
> The DMA TRE(Transfer ring element) buffer contains the DMA
> buffer address. Accessing data from this address can cause
> significant delays in SPI transfers, which can be mitigated to
> some extent by utilizing immediate DMA support.
> 
> QCOM GPI DMA hardware supports an immediate DMA feature for data
> up to 8 bytes, storing the data directly in the DMA TRE buffer
> instead of the DMA buffer address. This enhancement enables faster
> SPI data transfers.
> 
> This optimization reduces the average transfer time from 25 us to
> 16 us for a single SPI transfer of 8 bytes length, with a clock
> frequency of 50 MHz.
> 
> Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu@quicinc.com>
> ---
> v1 -> v2:
>    - Separated the patches to dmaengine and spi subsystems
>    - Removed the changes which are not required for this feature from
>      qcom-gpi-dma.h file.
>    - Removed the type conversions used in gpi_create_spi_tre.
> 
>  drivers/dma/qcom/gpi.c           | 32 +++++++++++++++++++++++++++-----
>  include/linux/dma/qcom-gpi-dma.h |  6 ++++++
>  2 files changed, 33 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
> index 52a7c8f2498f..4c5df696ddd8 100644
> --- a/drivers/dma/qcom/gpi.c
> +++ b/drivers/dma/qcom/gpi.c
> @@ -27,6 +27,7 @@
>  #define TRE_FLAGS_IEOT		BIT(9)
>  #define TRE_FLAGS_BEI		BIT(10)
>  #define TRE_FLAGS_LINK		BIT(11)
> +#define TRE_FLAGS_IMMEDIATE_DMA	BIT(16)
>  #define TRE_FLAGS_TYPE		GENMASK(23, 16)
>  
>  /* SPI CONFIG0 WD0 */
> @@ -64,6 +65,7 @@
>  
>  /* DMA TRE */
>  #define TRE_DMA_LEN		GENMASK(23, 0)
> +#define TRE_DMA_IMMEDIATE_LEN	GENMASK(3, 0)
>  
>  /* Register offsets from gpi-top */
>  #define GPII_n_CH_k_CNTXT_0_OFFS(n, k)	(0x20000 + (0x4000 * (n)) + (0x80 * (k)))
> @@ -1711,6 +1713,8 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>  	dma_addr_t address;
>  	struct gpi_tre *tre;
>  	unsigned int i;
> +	u8 *buf;
> +	int len = 0;

First use of "len" is an assignment, so you shouldn't zero-initialize it
here.

>  
>  	/* first create config tre if applicable */
>  	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>  	tre_idx++;
>  
>  	address = sg_dma_address(sgl);
> -	tre->dword[0] = lower_32_bits(address);
> -	tre->dword[1] = upper_32_bits(address);
> +	len = sg_dma_len(sgl);
>  
> -	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
> +	/* Support Immediate dma for write transfers for data length up to 8 bytes */

And what happens if the developer writing the SPI driver forgets to read
this comment and sets QCOM_GPI_IMMEDIATE_DMA for a 9 byte transfer?

> +	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {

Why is this flag introduced?

If I understand the next patch, all DMA_MEM_TO_DEV transfers of <=
QCOM_GPI_IMMEDIATE_DMA_LEN can use the immediate mode, so why not move
the condition here?

Also ordering[1].

	if (direction == DMA_MEM_TO_DEV && len <= 2 * sizeof(tre->dword[0]))


[1] Compare "all transfers of length 8 or less, which are mem to device"
vs "all transfers which are mem to device, with a length of 8 or less".
The bigger "selection criteria" is the direction, then that's fine tuned
by the length query.

> +		buf = sg_virt(sgl);

It's a question of style, but I think you could just put the sg_virt()
directly in the memcpy() call and avoid the extra variable.

>  
> -	tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
> -	if (direction == DMA_MEM_TO_DEV)
> +		/* memcpy may not always be length of 8, hence pre-fill both dword's with 0 */

The implementation of memcpy() is always more than 8 bytes, it's buf
that might be less than 8 bytes ;)

Also you're not "pre-filling", you're "zero-initializing", or just
"initialize".


That said, does it matter? Will the QUP read beyond the
TRE_DMA_IMMEDIATE_LEN bytes? If so, please put _that_ in the comment
("QUP reads beyond the provided len, so additional content needs to be
cleared", or similar)

> +		tre->dword[0] = 0;
> +		tre->dword[1] = 0;
> +		memcpy(&tre->dword[0], buf, len);
> +
> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_IMMEDIATE_LEN);

Does the format of tre->dword[2] really change when
TRE_FLAGS_IMMEDIATE_DMA is set, or is TRE_DMA_IMMEDIATE_LEN just a
mask to highlight that len can't be more than 4 bits?

It seems like you could drop TRE_DMA_IMMEDIATE_LEN and just use
TRE_DMA_LEN here? (But it should match what the hardware programming
guide states)


Perhaps you could reduce the scope of this if/else then as well, as the
assignment of dword[2] and dword[3] is mostly the same with and
without immediate mode (just the one bit to enable it)

> +
> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>  		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
> +		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IMMEDIATE_DMA);
> +	} else {
> +		tre->dword[0] = lower_32_bits(address);
> +		tre->dword[1] = upper_32_bits(address);
> +
> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_LEN);
> +
> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
> +		if (direction == DMA_MEM_TO_DEV)
> +			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
> +	}
>  
>  	for (i = 0; i < tre_idx; i++)
>  		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
> diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
> index 6680dd1a43c6..84598848d53a 100644
> --- a/include/linux/dma/qcom-gpi-dma.h
> +++ b/include/linux/dma/qcom-gpi-dma.h
> @@ -15,6 +15,10 @@ enum spi_transfer_cmd {
>  	SPI_DUPLEX,
>  };
>  
> +#define QCOM_GPI_IMMEDIATE_DMA		BIT(1)
> +
> +#define QCOM_GPI_IMMEDIATE_DMA_LEN	8
> +
>  /**
>   * struct gpi_spi_config - spi config for peripheral
>   *
> @@ -30,6 +34,7 @@ enum spi_transfer_cmd {
>   * @cs: chip select toggle
>   * @set_config: set peripheral config
>   * @rx_len: receive length for buffer
> + * @flags: true for immediate dma support

Per above I think you can remove this flag, but "true for immediate DMA
support" doesn't match what you have written in the code. (Also in
general u8 shouldn't be "true")

Regards,
Bjorn

>   */
>  struct gpi_spi_config {
>  	u8 set_config;
> @@ -44,6 +49,7 @@ struct gpi_spi_config {
>  	u32 clk_src;
>  	enum spi_transfer_cmd cmd;
>  	u32 rx_len;
> +	u8 flags;
>  };
>  
>  enum i2c_op {
> -- 
> 2.17.1
> 
>
Jyothi Kumar Seerapu Nov. 29, 2024, 11:14 a.m. UTC | #3
On 11/28/2024 7:37 PM, Dmitry Baryshkov wrote:
> On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
>> The DMA TRE(Transfer ring element) buffer contains the DMA
>> buffer address. Accessing data from this address can cause
>> significant delays in SPI transfers, which can be mitigated to
>> some extent by utilizing immediate DMA support.
>>
>> QCOM GPI DMA hardware supports an immediate DMA feature for data
>> up to 8 bytes, storing the data directly in the DMA TRE buffer
>> instead of the DMA buffer address. This enhancement enables faster
>> SPI data transfers.
>>
>> This optimization reduces the average transfer time from 25 us to
>> 16 us for a single SPI transfer of 8 bytes length, with a clock
>> frequency of 50 MHz.
>>
>> Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu@quicinc.com>
>> ---
>> v1 -> v2:
>>     - Separated the patches to dmaengine and spi subsystems
>>     - Removed the changes which are not required for this feature from
>>       qcom-gpi-dma.h file.
>>     - Removed the type conversions used in gpi_create_spi_tre.
>>
>>   drivers/dma/qcom/gpi.c           | 32 +++++++++++++++++++++++++++-----
>>   include/linux/dma/qcom-gpi-dma.h |  6 ++++++
>>   2 files changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
>> index 52a7c8f2498f..4c5df696ddd8 100644
>> --- a/drivers/dma/qcom/gpi.c
>> +++ b/drivers/dma/qcom/gpi.c
>> @@ -27,6 +27,7 @@
>>   #define TRE_FLAGS_IEOT		BIT(9)
>>   #define TRE_FLAGS_BEI		BIT(10)
>>   #define TRE_FLAGS_LINK		BIT(11)
>> +#define TRE_FLAGS_IMMEDIATE_DMA	BIT(16)
>>   #define TRE_FLAGS_TYPE		GENMASK(23, 16)
>>   
>>   /* SPI CONFIG0 WD0 */
>> @@ -64,6 +65,7 @@
>>   
>>   /* DMA TRE */
>>   #define TRE_DMA_LEN		GENMASK(23, 0)
>> +#define TRE_DMA_IMMEDIATE_LEN	GENMASK(3, 0)
>>   
>>   /* Register offsets from gpi-top */
>>   #define GPII_n_CH_k_CNTXT_0_OFFS(n, k)	(0x20000 + (0x4000 * (n)) + (0x80 * (k)))
>> @@ -1711,6 +1713,8 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>>   	dma_addr_t address;
>>   	struct gpi_tre *tre;
>>   	unsigned int i;
>> +	u8 *buf;
>> +	int len = 0;
>>   
>>   	/* first create config tre if applicable */
>>   	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
>> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>>   	tre_idx++;
>>   
>>   	address = sg_dma_address(sgl);
>> -	tre->dword[0] = lower_32_bits(address);
>> -	tre->dword[1] = upper_32_bits(address);
>> +	len = sg_dma_len(sgl);
>>   
>> -	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
>> +	/* Support Immediate dma for write transfers for data length up to 8 bytes */
>> +	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {
> 
> Please defer applying the patch until the discussion on v1 comes to
> conclusion.
Sure.

> 
>> +		buf = sg_virt(sgl);
>>   
>> -	tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>> -	if (direction == DMA_MEM_TO_DEV)
>> +		/* memcpy may not always be length of 8, hence pre-fill both dword's with 0 */
>> +		tre->dword[0] = 0;
>> +		tre->dword[1] = 0;
>> +		memcpy(&tre->dword[0], buf, len);
>> +
>> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_IMMEDIATE_LEN);
>> +
>> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>>   		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>> +		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IMMEDIATE_DMA);
>> +	} else {
>> +		tre->dword[0] = lower_32_bits(address);
>> +		tre->dword[1] = upper_32_bits(address);
>> +
>> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_LEN);
>> +
>> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>> +		if (direction == DMA_MEM_TO_DEV)
>> +			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>> +	}
>>   
>>   	for (i = 0; i < tre_idx; i++)
>>   		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
>> diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
>> index 6680dd1a43c6..84598848d53a 100644
>> --- a/include/linux/dma/qcom-gpi-dma.h
>> +++ b/include/linux/dma/qcom-gpi-dma.h
>> @@ -15,6 +15,10 @@ enum spi_transfer_cmd {
>>   	SPI_DUPLEX,
>>   };
>>   
>> +#define QCOM_GPI_IMMEDIATE_DMA		BIT(1)
>> +
>> +#define QCOM_GPI_IMMEDIATE_DMA_LEN	8
>> +
>>   /**
>>    * struct gpi_spi_config - spi config for peripheral
>>    *
>> @@ -30,6 +34,7 @@ enum spi_transfer_cmd {
>>    * @cs: chip select toggle
>>    * @set_config: set peripheral config
>>    * @rx_len: receive length for buffer
>> + * @flags: true for immediate dma support
>>    */
>>   struct gpi_spi_config {
>>   	u8 set_config;
>> @@ -44,6 +49,7 @@ struct gpi_spi_config {
>>   	u32 clk_src;
>>   	enum spi_transfer_cmd cmd;
>>   	u32 rx_len;
>> +	u8 flags;
>>   };
>>   
>>   enum i2c_op {
>> -- 
>> 2.17.1
>>
>
Jyothi Kumar Seerapu Nov. 29, 2024, 11:32 a.m. UTC | #4
On 11/28/2024 8:53 PM, Bjorn Andersson wrote:
> On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
>> The DMA TRE(Transfer ring element) buffer contains the DMA
>> buffer address. Accessing data from this address can cause
>> significant delays in SPI transfers, which can be mitigated to
>> some extent by utilizing immediate DMA support.
>>
>> QCOM GPI DMA hardware supports an immediate DMA feature for data
>> up to 8 bytes, storing the data directly in the DMA TRE buffer
>> instead of the DMA buffer address. This enhancement enables faster
>> SPI data transfers.
>>
>> This optimization reduces the average transfer time from 25 us to
>> 16 us for a single SPI transfer of 8 bytes length, with a clock
>> frequency of 50 MHz.
>>
>> Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu@quicinc.com>
>> ---
>> v1 -> v2:
>>     - Separated the patches to dmaengine and spi subsystems
>>     - Removed the changes which are not required for this feature from
>>       qcom-gpi-dma.h file.
>>     - Removed the type conversions used in gpi_create_spi_tre.
>>
>>   drivers/dma/qcom/gpi.c           | 32 +++++++++++++++++++++++++++-----
>>   include/linux/dma/qcom-gpi-dma.h |  6 ++++++
>>   2 files changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
>> index 52a7c8f2498f..4c5df696ddd8 100644
>> --- a/drivers/dma/qcom/gpi.c
>> +++ b/drivers/dma/qcom/gpi.c
>> @@ -27,6 +27,7 @@
>>   #define TRE_FLAGS_IEOT		BIT(9)
>>   #define TRE_FLAGS_BEI		BIT(10)
>>   #define TRE_FLAGS_LINK		BIT(11)
>> +#define TRE_FLAGS_IMMEDIATE_DMA	BIT(16)
>>   #define TRE_FLAGS_TYPE		GENMASK(23, 16)
>>   
>>   /* SPI CONFIG0 WD0 */
>> @@ -64,6 +65,7 @@
>>   
>>   /* DMA TRE */
>>   #define TRE_DMA_LEN		GENMASK(23, 0)
>> +#define TRE_DMA_IMMEDIATE_LEN	GENMASK(3, 0)
>>   
>>   /* Register offsets from gpi-top */
>>   #define GPII_n_CH_k_CNTXT_0_OFFS(n, k)	(0x20000 + (0x4000 * (n)) + (0x80 * (k)))
>> @@ -1711,6 +1713,8 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>>   	dma_addr_t address;
>>   	struct gpi_tre *tre;
>>   	unsigned int i;
>> +	u8 *buf;
>> +	int len = 0;
> 
> First use of "len" is an assignment, so you shouldn't zero-initialize it
> here.
Sure, will do it in V3.
> 
>>   
>>   	/* first create config tre if applicable */
>>   	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
>> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>>   	tre_idx++;
>>   
>>   	address = sg_dma_address(sgl);
>> -	tre->dword[0] = lower_32_bits(address);
>> -	tre->dword[1] = upper_32_bits(address);
>> +	len = sg_dma_len(sgl);
>>   
>> -	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
>> +	/* Support Immediate dma for write transfers for data length up to 8 bytes */
> 
> And what happens if the developer writing the SPI driver forgets to read
> this comment and sets QCOM_GPI_IMMEDIATE_DMA for a 9 byte transfer?
In the V2 patch, QCOM_GPI_IMMEDIATE_DMA is set based only on
QCOM_GPI_IMMEDIATE_DMA_LEN.

As per the hardware programming guide, immediate DMA is supported for up
to 8 bytes only.
I need to check what the behavior is if we try to handle 9 bytes using
the immediate DMA feature.

> 
>> +	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {
> 
> Why is this flag introduced?
> 
> If I understand the next patch, all DMA_MEM_TO_DEV transfers of <=
> QCOM_GPI_IMMEDIATE_DMA_LEN can use the immediate mode, so why not move
> the condition here?
> 
> Also ordering[1].
> 
> 	if (direction == DMA_MEM_TO_DEV && len <= 2 * sizeof(tre->dword[0]))
> 
> 
Sure, thanks for the suggestion.
So, instead of using "QCOM_GPI_IMMEDIATE_DMA_LEN", I need to use
"2 * sizeof(tre->dword[0])" for the 8-byte length check.

> [1] Compare "all transfers of length 8 or less, which are mem to device"
> vs "all transfers which are mem to device, with a length of 8 or less".
> The bigger "selection criteria" is the direction, then that's fine tuned
> by the length query.
> 
>> +		buf = sg_virt(sgl);
> 
> It's a question of style, but I think you could just put the sg_virt()
> directly in the memcpy() call and avoid the extra variable.

Okay, i will directly put sg_virt() in memcpy().
> 
>>   
>> -	tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>> -	if (direction == DMA_MEM_TO_DEV)
>> +		/* memcpy may not always be length of 8, hence pre-fill both dword's with 0 */
> 
> The implementation of memcpy() is always more than 8 bytes, it's buf
> that might be less than 8 bytes ;)
> 
> Also you're not "pre-filling", you're "zero-initializing", or just
> "initialize".
Okay, i will update it in V3.
> 
> 
> That said, does it matter? Will the QUP read beyond the
> TRE_DMA_IMMEDIATE_LEN bytes? If so, please put _that_ in the comment
> ("QUP reads beyond the provided len, so additional content needs to be
> cleared", or similar)
Okay, i will update it in V3.

> 
>> +		tre->dword[0] = 0;
>> +		tre->dword[1] = 0;
>> +		memcpy(&tre->dword[0], buf, len);
>> +
>> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_IMMEDIATE_LEN);
> 
> Does the format of tre->dword[2] really change when
> TRE_FLAGS_IMMEDIATE_DMA is set, or is TRE_DMA_IMMEDIATE_LEN just a
> mask to highlight that len can't be more than 4 bits?
> 
> It seems like you could drop TRE_DMA_IMMEDIATE_LEN and just use
> TRE_DMA_LEN here? (But it should match what the hardware programming
> guide states)
As per the hardware programming guide, for immediate DMA the length field
in dword2 is only 4 bits wide, so TRE_DMA_IMMEDIATE_LEN needs to be used.
> 
> 
> Perhaps you could reduce the scope of this if/else then as well, as the
> assignment of of dword[2] and dword[3] is mostly the same with and
> without immediate mode (just the one bit to enable it)
sure, will check it and update in V3.
> 
>> +
>> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>>   		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>> +		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IMMEDIATE_DMA);
>> +	} else {
>> +		tre->dword[0] = lower_32_bits(address);
>> +		tre->dword[1] = upper_32_bits(address);
>> +
>> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_LEN);
>> +
>> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>> +		if (direction == DMA_MEM_TO_DEV)
>> +			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>> +	}
>>   
>>   	for (i = 0; i < tre_idx; i++)
>>   		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
>> diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
>> index 6680dd1a43c6..84598848d53a 100644
>> --- a/include/linux/dma/qcom-gpi-dma.h
>> +++ b/include/linux/dma/qcom-gpi-dma.h
>> @@ -15,6 +15,10 @@ enum spi_transfer_cmd {
>>   	SPI_DUPLEX,
>>   };
>>   
>> +#define QCOM_GPI_IMMEDIATE_DMA		BIT(1)
>> +
>> +#define QCOM_GPI_IMMEDIATE_DMA_LEN	8
>> +
>>   /**
>>    * struct gpi_spi_config - spi config for peripheral
>>    *
>> @@ -30,6 +34,7 @@ enum spi_transfer_cmd {
>>    * @cs: chip select toggle
>>    * @set_config: set peripheral config
>>    * @rx_len: receive length for buffer
>> + * @flags: true for immediate dma support
> 
> Per above I think you can remove this flag, but "true for immediate DMA
> support" doesn't match what you have written in the code. (Also in
> general u8 shouldn't be "true")

Sure, will update in V3.

Thanks for providing the review comments.
> 
> Regards,
> Bjorn
> 
>>    */
>>   struct gpi_spi_config {
>>   	u8 set_config;
>> @@ -44,6 +49,7 @@ struct gpi_spi_config {
>>   	u32 clk_src;
>>   	enum spi_transfer_cmd cmd;
>>   	u32 rx_len;
>> +	u8 flags;
>>   };
>>   
>>   enum i2c_op {
>> -- 
>> 2.17.1
>>
>>
Bjorn Andersson Nov. 30, 2024, 4:05 a.m. UTC | #5
On Fri, Nov 29, 2024 at 05:02:22PM +0530, Jyothi Kumar Seerapu wrote:
> On 11/28/2024 8:53 PM, Bjorn Andersson wrote:
> > On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
> > > diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
[..]
> > 
> > >   	/* first create config tre if applicable */
> > >   	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
> > > @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
> > >   	tre_idx++;
> > >   	address = sg_dma_address(sgl);
> > > -	tre->dword[0] = lower_32_bits(address);
> > > -	tre->dword[1] = upper_32_bits(address);
> > > +	len = sg_dma_len(sgl);
> > > -	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
> > > +	/* Support Immediate dma for write transfers for data length up to 8 bytes */
> > 
> > And what happens if the developer writing the SPI driver forgets to read
> > this comment and sets QCOM_GPI_IMMEDIATE_DMA for a 9 byte transfer?
> In V2 patch, QCOM_GPI_IMMEDIATE_DMA is set based on
> QCOM_GPI_IMMEDIATE_DMA_LEN only.
> 

I assume you mean "patch 2/2". So, what happens if someone refactors the
SPI driver in the future, will they read this comment?

> As per Hardware programming guide, immediate dma support is for up to 8
> bytes only.
> Need to check what is the behavior if we want to handle 9 bytes using
> immediate dma feature support.
> 

I'm saying that you have a comment here which says that the caller must
not pass len > 8. Write that comment in code to avoid mistakes - either
now or in the future.

> > 
> > > +	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {
> > 
> > Why is this flag introduced?
> > 
> > If I understand the next patch, all DMA_MEM_TO_DEV transfers of <=
> > QCOM_GPI_IMMEDIATE_DMA_LEN can use the immediate mode, so why not move
> > the condition here?
> > 
> > Also ordering[1].
> > 
> > 	if (direction == DMA_MEM_TO_DEV && len <= 2 * sizeof(tre->dword[0]))
> > 
> > 
> Sure, thanks for the suggestion.
> so, instead using "QCOM_GPI_IMMEDIATE_DMA_LEN" need to use " 2 *
> sizeof(tre->dword[0])" for 8 bytes length check.
> 

Either one works, but I'm guessing that while 8 is the right number the
reason for 8 is that the data is passed in 2 * dword.


The important thing is that you're encoding the length check here, so
that the client can't by mistake trigger immediate mode with > 8 bytes.
As a side effect, you no longer need the QCOM_GPI_IMMEDIATE_DMA flag and
should be able to drop patch 2.

> > [1] Compare "all transfers of length 8 or less, which are mem to device"
> > vs "all transfers which are mem to device, with a length of 8 or less".
> > The bigger "selection criteria" is the direction, then that's fine tuned
> > by the length query.
> > 
> > > +		buf = sg_virt(sgl);
> > 
> > It's a question of style, but I think you could just put the sg_virt()
> > directly in the memcpy() call and avoid the extra variable.
> 
> Okay, i will directly put sg_virt() in memcpy().

Try it out, pick the option that look the best.

Regards,
Bjorn
Jyothi Kumar Seerapu Dec. 2, 2024, 5:31 a.m. UTC | #6
On 11/30/2024 9:35 AM, Bjorn Andersson wrote:
> On Fri, Nov 29, 2024 at 05:02:22PM +0530, Jyothi Kumar Seerapu wrote:
>> On 11/28/2024 8:53 PM, Bjorn Andersson wrote:
>>> On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
>>>> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
> [..]
>>>
>>>>    	/* first create config tre if applicable */
>>>>    	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
>>>> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>>>>    	tre_idx++;
>>>>    	address = sg_dma_address(sgl);
>>>> -	tre->dword[0] = lower_32_bits(address);
>>>> -	tre->dword[1] = upper_32_bits(address);
>>>> +	len = sg_dma_len(sgl);
>>>> -	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
>>>> +	/* Support Immediate dma for write transfers for data length up to 8 bytes */
>>>
>>> And what happens if the developer writing the SPI driver forgets to read
>>> this comment and sets QCOM_GPI_IMMEDIATE_DMA for a 9 byte transfer?
>> In V2 patch, QCOM_GPI_IMMEDIATE_DMA is set based on
>> QCOM_GPI_IMMEDIATE_DMA_LEN only.
>>
> 
> I assume you mean "patch 2/2". So, what happens if someone refactors the
> SPI driver in the future, will they read this comment?
> 
>> As per Hardware programming guide, immediate dma support is for up to 8
>> bytes only.
>> Need to check what is the behavior if we want to handle 9 bytes using
>> immediate dma feature support.
>>
> 
> I'm saying that you have a comment here which says that the caller must
> not pass len > 8. Write that comment in code to avoid mistakes - either
> now or in the future.

Sure, i will update the comment in V3.
> 
>>>
>>>> +	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {
>>>
>>> Why is this flag introduced?
>>>
>>> If I understand the next patch, all DMA_MEM_TO_DEV transfers of <=
>>> QCOM_GPI_IMMEDIATE_DMA_LEN can use the immediate mode, so why not move
>>> the condition here?
>>>
>>> Also ordering[1].
>>>
>>> 	if (direction == DMA_MEM_TO_DEV && len <= 2 * sizeof(tre->dword[0]))
>>>
>>>
>> Sure, thanks for the suggestion.
>> so, instead using "QCOM_GPI_IMMEDIATE_DMA_LEN" need to use " 2 *
>> sizeof(tre->dword[0])" for 8 bytes length check.
>>
> 
> Either one works, but I'm guessing that while 8 is the right number the
> reason for 8 is that the data is passed in 2 * dword.
Okay, I will use "2 * sizeof(tre->dword[0])", which evaluates to 8.
> 
> 
> The important thing is that you're encoding the length check here, so
> that the client can't by mistake trigger immediate mode with > 8 bytes.
> As a side effect, you no longer need the QCOM_GPI_IMMEDIATE_DMA flag and
> should be able to drop patch 2.

Sure thanks, will update the changes in V3.
> 
>>> [1] Compare "all transfers of length 8 or less, which are mem to device"
>>> vs "all transfers which are mem to device, with a length of 8 or less".
>>> The bigger "selection criteria" is the direction, then that's fine tuned
>>> by the length query.
>>>
>>>> +		buf = sg_virt(sgl);
>>>
>>> It's a question of style, but I think you could just put the sg_virt()
>>> directly in the memcpy() call and avoid the extra variable.
>>
>> Okay, i will directly put sg_virt() in memcpy().
> 
> Try it out, pick the option that look the best.
Yes, will do it in V3.

> 
> Regards,
> Bjorn
>
Jyothi Kumar Seerapu Dec. 3, 2024, 1:31 p.m. UTC | #7
On 12/2/2024 11:01 AM, Jyothi Kumar Seerapu wrote:
> 
> 
> On 11/30/2024 9:35 AM, Bjorn Andersson wrote:
>> On Fri, Nov 29, 2024 at 05:02:22PM +0530, Jyothi Kumar Seerapu wrote:
>>> On 11/28/2024 8:53 PM, Bjorn Andersson wrote:
>>>> On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
>>>>> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
>> [..]
>>>>
>>>>>        /* first create config tre if applicable */
>>>>>        if (direction == DMA_MEM_TO_DEV && spi->set_config) {
>>>>> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan 
>>>>> *chan, struct gpi_desc *desc,
>>>>>        tre_idx++;
>>>>>        address = sg_dma_address(sgl);
>>>>> -    tre->dword[0] = lower_32_bits(address);
>>>>> -    tre->dword[1] = upper_32_bits(address);
>>>>> +    len = sg_dma_len(sgl);
>>>>> -    tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
>>>>> +    /* Support Immediate dma for write transfers for data length 
>>>>> up to 8 bytes */
>>>>
>>>> And what happens if the developer writing the SPI driver forgets to 
>>>> read
>>>> this comment and sets QCOM_GPI_IMMEDIATE_DMA for a 9 byte transfer?
>>> In V2 patch, QCOM_GPI_IMMEDIATE_DMA is set based on
>>> QCOM_GPI_IMMEDIATE_DMA_LEN only.
>>>
>>
>> I assume you mean "patch 2/2". So, what happens if someone refactors the
>> SPI driver in the future, will they read this comment?
>>
>>> As per Hardware programming guide, immediate dma support is for up to 8
>>> bytes only.
>>> Need to check what is the behavior if we want to handle 9 bytes using
>>> immediate dma feature support.
>>>
>>
>> I'm saying that you have a comment here which says that the caller must
>> not pass len > 8. Write that comment in code to avoid mistakes - either
>> now or in the future.
> 
> Sure, i will update the comment in V3.

If the GPI driver itself decides, based on the length check and the
direction, whether to process a transfer using immediate DMA or the
existing GPI DMA path (the else path), then I think the existing comment
is fine, because the protocol driver (SPI) has no role in selecting
whether a transfer is handled by immediate DMA or by the existing DMA method.
Please let me know if it still needs to be updated.

if (direction == DMA_MEM_TO_DEV && len <= 2 * sizeof(tre->dword[0]))
I will make this change in the V3 patch.

>>
>>>>
>>>>> +    if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == 
>>>>> DMA_MEM_TO_DEV) {
>>>>
>>>> Why is this flag introduced?
>>>>
>>>> If I understand the next patch, all DMA_MEM_TO_DEV transfers of <=
>>>> QCOM_GPI_IMMEDIATE_DMA_LEN can use the immediate mode, so why not move
>>>> the condition here?
>>>>
>>>> Also ordering[1].
>>>>
>>>>     if (direction == DMA_MEM_TO_DEV && len <= 2 * 
>>>> sizeof(tre->dword[0]))
>>>>
>>>>
>>> Sure, thanks for the suggestion.
>>> so, instead using "QCOM_GPI_IMMEDIATE_DMA_LEN" need to use " 2 *
>>> sizeof(tre->dword[0])" for 8 bytes length check.
>>>
>>
>> Either one works, but I'm guessing that while 8 is the right number the
>> reason for 8 is that the data is passed in 2 * dword.
> Okay, i will use "2 * sizeof(tre->dword[0]" which gives 8 only.
>>
>>
>> The important thing is that you're encoding the length check here, so
>> that the client can't by mistake trigger immediate mode with > 8 bytes.
>> As a side effect, you no longer need the QCOM_GPI_IMMEDIATE_DMA flag and
>> should be able to drop patch 2.
> 
> Sure thanks, will update the changes in V3.
>>
>>>> [1] Compare "all transfers of length 8 or less, which are mem to 
>>>> device"
>>>> vs "all transfers which are mem to device, with a length of 8 or less".
>>>> The bigger "selection criteria" is the direction, then that's fine 
>>>> tuned
>>>> by the length query.
>>>>
>>>>> +        buf = sg_virt(sgl);
>>>>
>>>> It's a question of style, but I think you could just put the sg_virt()
>>>> directly in the memcpy() call and avoid the extra variable.
>>>
>>> Okay, i will directly put sg_virt() in memcpy().
>>
>> Try it out, pick the option that look the best.
> Yes, will do it in V3.
> 
>>
>> Regards,
>> Bjorn
>>
>
Jyothi Kumar Seerapu Dec. 3, 2024, 2:14 p.m. UTC | #8
On 11/29/2024 5:02 PM, Jyothi Kumar Seerapu wrote:
> 
> 
> On 11/28/2024 8:53 PM, Bjorn Andersson wrote:
>> On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
>>> The DMA TRE(Transfer ring element) buffer contains the DMA
>>> buffer address. Accessing data from this address can cause
>>> significant delays in SPI transfers, which can be mitigated to
>>> some extent by utilizing immediate DMA support.
>>>
>>> QCOM GPI DMA hardware supports an immediate DMA feature for data
>>> up to 8 bytes, storing the data directly in the DMA TRE buffer
>>> instead of the DMA buffer address. This enhancement enables faster
>>> SPI data transfers.
>>>
>>> This optimization reduces the average transfer time from 25 us to
>>> 16 us for a single SPI transfer of 8 bytes length, with a clock
>>> frequency of 50 MHz.
>>>
>>> Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu@quicinc.com>
>>> ---
>>> v1 -> v2:
>>>     - Separated the patches to dmaengine and spi subsystems
>>>     - Removed the changes which are not required for this feature from
>>>       qcom-gpi-dma.h file.
>>>     - Removed the type conversions used in gpi_create_spi_tre.
>>>
>>>   drivers/dma/qcom/gpi.c           | 32 +++++++++++++++++++++++++++-----
>>>   include/linux/dma/qcom-gpi-dma.h |  6 ++++++
>>>   2 files changed, 33 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
>>> index 52a7c8f2498f..4c5df696ddd8 100644
>>> --- a/drivers/dma/qcom/gpi.c
>>> +++ b/drivers/dma/qcom/gpi.c
>>> @@ -27,6 +27,7 @@
>>>   #define TRE_FLAGS_IEOT        BIT(9)
>>>   #define TRE_FLAGS_BEI        BIT(10)
>>>   #define TRE_FLAGS_LINK        BIT(11)
>>> +#define TRE_FLAGS_IMMEDIATE_DMA    BIT(16)
>>>   #define TRE_FLAGS_TYPE        GENMASK(23, 16)
>>>   /* SPI CONFIG0 WD0 */
>>> @@ -64,6 +65,7 @@
>>>   /* DMA TRE */
>>>   #define TRE_DMA_LEN        GENMASK(23, 0)
>>> +#define TRE_DMA_IMMEDIATE_LEN    GENMASK(3, 0)
>>>   /* Register offsets from gpi-top */
>>>   #define GPII_n_CH_k_CNTXT_0_OFFS(n, k)    (0x20000 + (0x4000 * (n)) 
>>> + (0x80 * (k)))
>>> @@ -1711,6 +1713,8 @@ static int gpi_create_spi_tre(struct gchan 
>>> *chan, struct gpi_desc *desc,
>>>       dma_addr_t address;
>>>       struct gpi_tre *tre;
>>>       unsigned int i;
>>> +    u8 *buf;
>>> +    int len = 0;
>>
>> First use of "len" is an assignment, so you shouldn't zero-initialize it
>> here.
> Sure, will do it in V3.
>>
>>>       /* first create config tre if applicable */
>>>       if (direction == DMA_MEM_TO_DEV && spi->set_config) {
>>> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan 
>>> *chan, struct gpi_desc *desc,
>>>       tre_idx++;
>>>       address = sg_dma_address(sgl);
>>> -    tre->dword[0] = lower_32_bits(address);
>>> -    tre->dword[1] = upper_32_bits(address);
>>> +    len = sg_dma_len(sgl);
>>> -    tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
>>> +    /* Support Immediate dma for write transfers for data length up 
>>> to 8 bytes */
>>
>> And what happens if the developer writing the SPI driver forgets to read
>> this comment and sets QCOM_GPI_IMMEDIATE_DMA for a 9 byte transfer?
> In V2 patch, QCOM_GPI_IMMEDIATE_DMA is set based on 
> QCOM_GPI_IMMEDIATE_DMA_LEN only.
> 
> As per Hardware programming guide, immediate dma support is for up to 8 
> bytes only.
> Need to check what is the behavior if we want to handle 9 bytes using 
> immediate dma feature support.
> 
>>
>>> +    if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == 
>>> DMA_MEM_TO_DEV) {
>>
>> Why is this flag introduced?
>>
>> If I understand the next patch, all DMA_MEM_TO_DEV transfers of <=
>> QCOM_GPI_IMMEDIATE_DMA_LEN can use the immediate mode, so why not move
>> the condition here?
>>
>> Also ordering[1].
>>
>>     if (direction == DMA_MEM_TO_DEV && len <= 2 * sizeof(tre->dword[0]))
>>
>>
> Sure, thanks for the suggestion.
> so, instead using "QCOM_GPI_IMMEDIATE_DMA_LEN" need to use " 2 * 
> sizeof(tre->dword[0])" for 8 bytes length check.
> 
>> [1] Compare "all transfers of length 8 or less, which are mem to device"
>> vs "all transfers which are mem to device, with a length of 8 or less".
>> The bigger "selection criteria" is the direction, then that's fine tuned
>> by the length query.
>>
>>> +        buf = sg_virt(sgl);
>>
>> It's a question of style, but I think you could just put the sg_virt()
>> directly in the memcpy() call and avoid the extra variable.
> 
> Okay, i will directly put sg_virt() in memcpy().
>>
>>> -    tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>>> -    if (direction == DMA_MEM_TO_DEV)
>>> +        /* memcpy may not always be length of 8, hence pre-fill both 
>>> dword's with 0 */
>>
>> The implementation of memcpy() is always more than 8 bytes, it's buf
>> that might be less than 8 bytes ;)
>>
>> Also you're not "pre-filling", you're "zero-initializing", or just
>> "initialize".
 >
Sure, I will correct it to say "initialize" instead of "pre-fill". I will
update it in V3.
 >
>>
>>
>> That said, does it matter? Will the QUP read beyond the
>> TRE_DMA_IMMEDIATE_LEN bytes? If so, please put _that_ in the comment
>> ("QUP reads beyond the provided len, so additional content needs to be
>> cleared", or similar)
 >
Data lengths up to 8 bytes (len <= 8) can be handled using immediate DMA.
The QUP won't handle more than 8 bytes in immediate DMA; if the length is
more than 8 bytes, the transfer is processed using the logic in the else path.

Each dword is 4 bytes in size, so in the immediate DMA case dword0 and
dword1 are used to hold up to 8 bytes of data. When the data size is less
than 8 bytes, for example a 4-byte SPI transfer, the 4 bytes of data are
written into dword0 and dword1 might contain garbage data. So both dword0
and dword1 are initialized to 0 first, and then the actual SPI transfer
data is copied into the dwords with memcpy, based on the length.

> 
>>
>>> +        tre->dword[0] = 0;
>>> +        tre->dword[1] = 0;
>>> +        memcpy(&tre->dword[0], buf, len);
>>> +
>>> +        tre->dword[2] = u32_encode_bits(len, TRE_DMA_IMMEDIATE_LEN);
>>
>> Does the format of tre->dword[2] really change when
>> TRE_FLAGS_IMMEDIATE_DMA is set, or is TRE_DMA_IMMEDIATE_LEN just a
>> mask to highlight that len can't be more than 4 bits?
>>
>> It seems like you could drop TRE_DMA_IMMEDIATE_LEN and just use
>> TRE_DMA_LEN here? (But it should match what the hardware programming
>> guide states)
> As per the hardware programming guide, for immediate DMA the length field in
> dword2 is only 4 bits wide, so TRE_DMA_IMMEDIATE_LEN needs to be used.
>>
>>
>> Perhaps you could reduce the scope of this if/else then as well, as the
>> assignment of of dword[2] and dword[3] is mostly the same with and
>> without immediate mode (just the one bit to enable it)
> sure, will check it and update in V3.
>>
>>> +
>>> +        tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>>>           tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>>> +        tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IMMEDIATE_DMA);
>>> +    } else {
>>> +        tre->dword[0] = lower_32_bits(address);
>>> +        tre->dword[1] = upper_32_bits(address);
>>> +
>>> +        tre->dword[2] = u32_encode_bits(len, TRE_DMA_LEN);
>>> +
>>> +        tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>>> +        if (direction == DMA_MEM_TO_DEV)
>>> +            tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>>> +    }
>>>       for (i = 0; i < tre_idx; i++)
>>>           dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
>>> diff --git a/include/linux/dma/qcom-gpi-dma.h 
>>> b/include/linux/dma/qcom-gpi-dma.h
>>> index 6680dd1a43c6..84598848d53a 100644
>>> --- a/include/linux/dma/qcom-gpi-dma.h
>>> +++ b/include/linux/dma/qcom-gpi-dma.h
>>> @@ -15,6 +15,10 @@ enum spi_transfer_cmd {
>>>       SPI_DUPLEX,
>>>   };
>>> +#define QCOM_GPI_IMMEDIATE_DMA        BIT(1)
>>> +
>>> +#define QCOM_GPI_IMMEDIATE_DMA_LEN    8
>>> +
>>>   /**
>>>    * struct gpi_spi_config - spi config for peripheral
>>>    *
>>> @@ -30,6 +34,7 @@ enum spi_transfer_cmd {
>>>    * @cs: chip select toggle
>>>    * @set_config: set peripheral config
>>>    * @rx_len: receive length for buffer
>>> + * @flags: true for immediate dma support
>>
>> Per above I think you can remove this flag, but "true for immediate DMA
>> support" doesn't match what you have written in the code. (Also in
>> general u8 shouldn't be "true")
> 
> Sure, will update in V3.
> 
> Thanks for providing the review comments.
>>
>> Regards,
>> Bjorn
>>
>>>    */
>>>   struct gpi_spi_config {
>>>       u8 set_config;
>>> @@ -44,6 +49,7 @@ struct gpi_spi_config {
>>>       u32 clk_src;
>>>       enum spi_transfer_cmd cmd;
>>>       u32 rx_len;
>>> +    u8 flags;
>>>   };
>>>   enum i2c_op {
>>> -- 
>>> 2.17.1
>>>
>>>
>
Jyothi Kumar Seerapu Dec. 3, 2024, 2:15 p.m. UTC | #9
On 11/28/2024 8:53 PM, Bjorn Andersson wrote:
> On Thu, Nov 28, 2024 at 07:03:50PM +0530, Jyothi Kumar Seerapu wrote:
>> The DMA TRE(Transfer ring element) buffer contains the DMA
>> buffer address. Accessing data from this address can cause
>> significant delays in SPI transfers, which can be mitigated to
>> some extent by utilizing immediate DMA support.
>>
>> QCOM GPI DMA hardware supports an immediate DMA feature for data
>> up to 8 bytes, storing the data directly in the DMA TRE buffer
>> instead of the DMA buffer address. This enhancement enables faster
>> SPI data transfers.
>>
>> This optimization reduces the average transfer time from 25 us to
>> 16 us for a single SPI transfer of 8 bytes length, with a clock
>> frequency of 50 MHz.
>>
>> Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu@quicinc.com>
>> ---
>> v1 -> v2:
>>     - Separated the patches to dmaengine and spi subsystems
>>     - Removed the changes which are not required for this feature from
>>       qcom-gpi-dma.h file.
>>     - Removed the type conversions used in gpi_create_spi_tre.
>>
>>   drivers/dma/qcom/gpi.c           | 32 +++++++++++++++++++++++++++-----
>>   include/linux/dma/qcom-gpi-dma.h |  6 ++++++
>>   2 files changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
>> index 52a7c8f2498f..4c5df696ddd8 100644
>> --- a/drivers/dma/qcom/gpi.c
>> +++ b/drivers/dma/qcom/gpi.c
>> @@ -27,6 +27,7 @@
>>   #define TRE_FLAGS_IEOT		BIT(9)
>>   #define TRE_FLAGS_BEI		BIT(10)
>>   #define TRE_FLAGS_LINK		BIT(11)
>> +#define TRE_FLAGS_IMMEDIATE_DMA	BIT(16)
>>   #define TRE_FLAGS_TYPE		GENMASK(23, 16)
>>   
>>   /* SPI CONFIG0 WD0 */
>> @@ -64,6 +65,7 @@
>>   
>>   /* DMA TRE */
>>   #define TRE_DMA_LEN		GENMASK(23, 0)
>> +#define TRE_DMA_IMMEDIATE_LEN	GENMASK(3, 0)
>>   
>>   /* Register offsets from gpi-top */
>>   #define GPII_n_CH_k_CNTXT_0_OFFS(n, k)	(0x20000 + (0x4000 * (n)) + (0x80 * (k)))
>> @@ -1711,6 +1713,8 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>>   	dma_addr_t address;
>>   	struct gpi_tre *tre;
>>   	unsigned int i;
>> +	u8 *buf;
>> +	int len = 0;
> 
> First use of "len" is an assignment, so you shouldn't zero-initialize it
> here.
> 
>>   
>>   	/* first create config tre if applicable */
>>   	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
>> @@ -1763,14 +1767,32 @@ static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
>>   	tre_idx++;
>>   
>>   	address = sg_dma_address(sgl);
>> -	tre->dword[0] = lower_32_bits(address);
>> -	tre->dword[1] = upper_32_bits(address);
>> +	len = sg_dma_len(sgl);
>>   
>> -	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
>> +	/* Support Immediate dma for write transfers for data length up to 8 bytes */
> 
> And what happens if the developer writing the SPI driver forgets to read
> this comment and sets QCOM_GPI_IMMEDIATE_DMA for a 9 byte transfer?
> 
>> +	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {
> 
> Why is this flag introduced?
> 
> If I understand the next patch, all DMA_MEM_TO_DEV transfers of <=
> QCOM_GPI_IMMEDIATE_DMA_LEN can use the immediate mode, so why not move
> the condition here?
> 
> Also ordering[1].
> 
> 	if (direction == DMA_MEM_TO_DEV && len <= 2 * sizeof(tre->dword[0]))
> 
> 
> [1] Compare "all transfers of length 8 or less, which are mem to device"
> vs "all transfers which are mem to device, with a length of 8 or less".
> The bigger "selection criteria" is the direction, then that's fine tuned
> by the length query.
> 
>> +		buf = sg_virt(sgl);
> 
> It's a question of style, but I think you could just put the sg_virt()
> directly in the memcpy() call and avoid the extra variable.
> 
>>   
>> -	tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>> -	if (direction == DMA_MEM_TO_DEV)
>> +		/* memcpy may not always be length of 8, hence pre-fill both dword's with 0 */
> 
> The implementation of memcpy() is always more than 8 bytes, it's buf
> that might be less than 8 bytes ;)
> 
> Also you're not "pre-filling", you're "zero-initializing", or just
> "initialize".
Sure, I will correct it to say "initialize" instead of "pre-fill".
> 
> 
> That said, does it matter? Will the QUP read beyond the
> TRE_DMA_IMMEDIATE_LEN bytes? If so, please put _that_ in the comment
> ("QUP reads beyond the provided len, so additional content needs to be
> cleared", or similar)

Data lengths up to 8 bytes (len <= 8) can be handled using immediate DMA.
The QUP won't handle more than 8 bytes in immediate DMA; if the length is
more than 8 bytes, the transfer is processed using the logic in the else path.

Each dword is 4 bytes in size, so in the immediate DMA case dword0 and
dword1 are used to hold up to 8 bytes of data. When the data size is less
than 8 bytes, for example a 4-byte SPI transfer, the data is written into
dword0 and dword1 might contain garbage data. So both dword0 and dword1
are initialized to 0 first, and then the actual SPI transfer data is copied
into the dwords with memcpy, based on the length.
> 
>> +		tre->dword[0] = 0;
>> +		tre->dword[1] = 0;
>> +		memcpy(&tre->dword[0], buf, len);
>> +
>> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_IMMEDIATE_LEN);
> 
> Does the format of tre->dword[2] really change when
> TRE_FLAGS_IMMEDIATE_DMA is set, or is TRE_DMA_IMMEDIATE_LEN just a
> mask to highlight that len can't be more than 4 bits?
> 
> It seems like you could drop TRE_DMA_IMMEDIATE_LEN and just use
> TRE_DMA_LEN here? (But it should match what the hardware programming
> guide states)
> 
> 
> Perhaps you could reduce the scope of this if/else then as well, as the
> assignment of of dword[2] and dword[3] is mostly the same with and
> without immediate mode (just the one bit to enable it)
> 
>> +
>> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>>   		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>> +		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IMMEDIATE_DMA);
>> +	} else {
>> +		tre->dword[0] = lower_32_bits(address);
>> +		tre->dword[1] = upper_32_bits(address);
>> +
>> +		tre->dword[2] = u32_encode_bits(len, TRE_DMA_LEN);
>> +
>> +		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
>> +		if (direction == DMA_MEM_TO_DEV)
>> +			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
>> +	}
>>   
>>   	for (i = 0; i < tre_idx; i++)
>>   		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
>> diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
>> index 6680dd1a43c6..84598848d53a 100644
>> --- a/include/linux/dma/qcom-gpi-dma.h
>> +++ b/include/linux/dma/qcom-gpi-dma.h
>> @@ -15,6 +15,10 @@ enum spi_transfer_cmd {
>>   	SPI_DUPLEX,
>>   };
>>   
>> +#define QCOM_GPI_IMMEDIATE_DMA		BIT(1)
>> +
>> +#define QCOM_GPI_IMMEDIATE_DMA_LEN	8
>> +
>>   /**
>>    * struct gpi_spi_config - spi config for peripheral
>>    *
>> @@ -30,6 +34,7 @@ enum spi_transfer_cmd {
>>    * @cs: chip select toggle
>>    * @set_config: set peripheral config
>>    * @rx_len: receive length for buffer
>> + * @flags: true for immediate dma support
> 
> Per above I think you can remove this flag, but "true for immediate DMA
> support" doesn't match what you have written in the code. (Also in
> general u8 shouldn't be "true")
> 
> Regards,
> Bjorn
> 
>>    */
>>   struct gpi_spi_config {
>>   	u8 set_config;
>> @@ -44,6 +49,7 @@ struct gpi_spi_config {
>>   	u32 clk_src;
>>   	enum spi_transfer_cmd cmd;
>>   	u32 rx_len;
>> +	u8 flags;
>>   };
>>   
>>   enum i2c_op {
>> -- 
>> 2.17.1
>>
>>
diff mbox series

Patch

diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
index 52a7c8f2498f..4c5df696ddd8 100644
--- a/drivers/dma/qcom/gpi.c
+++ b/drivers/dma/qcom/gpi.c
@@ -27,6 +27,7 @@ 
 #define TRE_FLAGS_IEOT		BIT(9)
 #define TRE_FLAGS_BEI		BIT(10)
 #define TRE_FLAGS_LINK		BIT(11)
+#define TRE_FLAGS_IMMEDIATE_DMA	BIT(16)
 #define TRE_FLAGS_TYPE		GENMASK(23, 16)
 
 /* SPI CONFIG0 WD0 */
@@ -64,6 +65,7 @@ 
 
 /* DMA TRE */
 #define TRE_DMA_LEN		GENMASK(23, 0)
+#define TRE_DMA_IMMEDIATE_LEN	GENMASK(3, 0)
 
 /* Register offsets from gpi-top */
 #define GPII_n_CH_k_CNTXT_0_OFFS(n, k)	(0x20000 + (0x4000 * (n)) + (0x80 * (k)))
@@ -1711,6 +1713,8 @@  static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
 	dma_addr_t address;
 	struct gpi_tre *tre;
 	unsigned int i;
+	u8 *buf;
+	int len = 0;
 
 	/* first create config tre if applicable */
 	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
@@ -1763,14 +1767,32 @@  static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
 	tre_idx++;
 
 	address = sg_dma_address(sgl);
-	tre->dword[0] = lower_32_bits(address);
-	tre->dword[1] = upper_32_bits(address);
+	len = sg_dma_len(sgl);
 
-	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
+	/* Support Immediate dma for write transfers for data length up to 8 bytes */
+	if ((spi->flags & QCOM_GPI_IMMEDIATE_DMA) && direction == DMA_MEM_TO_DEV) {
+		buf = sg_virt(sgl);
 
-	tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
-	if (direction == DMA_MEM_TO_DEV)
+		/* memcpy may not always be length of 8, hence pre-fill both dword's with 0 */
+		tre->dword[0] = 0;
+		tre->dword[1] = 0;
+		memcpy(&tre->dword[0], buf, len);
+
+		tre->dword[2] = u32_encode_bits(len, TRE_DMA_IMMEDIATE_LEN);
+
+		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
 		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
+		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IMMEDIATE_DMA);
+	} else {
+		tre->dword[0] = lower_32_bits(address);
+		tre->dword[1] = upper_32_bits(address);
+
+		tre->dword[2] = u32_encode_bits(len, TRE_DMA_LEN);
+
+		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
+		if (direction == DMA_MEM_TO_DEV)
+			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
+	}
 
 	for (i = 0; i < tre_idx; i++)
 		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
index 6680dd1a43c6..84598848d53a 100644
--- a/include/linux/dma/qcom-gpi-dma.h
+++ b/include/linux/dma/qcom-gpi-dma.h
@@ -15,6 +15,10 @@  enum spi_transfer_cmd {
 	SPI_DUPLEX,
 };
 
+#define QCOM_GPI_IMMEDIATE_DMA		BIT(1)
+
+#define QCOM_GPI_IMMEDIATE_DMA_LEN	8
+
 /**
  * struct gpi_spi_config - spi config for peripheral
  *
@@ -30,6 +34,7 @@  enum spi_transfer_cmd {
  * @cs: chip select toggle
  * @set_config: set peripheral config
  * @rx_len: receive length for buffer
+ * @flags: true for immediate dma support
  */
 struct gpi_spi_config {
 	u8 set_config;
@@ -44,6 +49,7 @@  struct gpi_spi_config {
 	u32 clk_src;
 	enum spi_transfer_cmd cmd;
 	u32 rx_len;
+	u8 flags;
 };
 
 enum i2c_op {