diff mbox series

[v3,4/7] dmaengine: stm32-dma: Add DMA/MDMA chaining support

Message ID 1538139715-24406-5-git-send-email-pierre-yves.mordret@st.com (mailing list archive)
State New, archived
Headers show
Series Add-DMA-MDMA-chaining-support | expand

Commit Message

Pierre Yves MORDRET Sept. 28, 2018, 1:01 p.m. UTC
This patch adds DMA/MDMA chaining support.
It introduces an intermediate transfer between peripherals and STM32 DMA.
This intermediate transfer is triggered by SW for single M2D transfer and
by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).

A generic SRAM allocator is used for this intermediate buffer.
Each DMA channel can define the SRAM size it needs for the chaining
feature: (2 ^ order) * PAGE_SIZE.
For cyclic, SRAM buffer is derived from period length (rounded on
PAGE_SIZE).

Signed-off-by: Pierre-Yves MORDRET <pierre-yves.mordret@st.com>
---
  Version history:
    v3:
       * Solve KBuild warning
    v2:
    v1:
       * Initial
---
---
 drivers/dma/stm32-dma.c | 879 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 772 insertions(+), 107 deletions(-)

Comments

Vinod Koul Oct. 7, 2018, 4 p.m. UTC | #1
On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
> This patch adds support of DMA/MDMA chaining support.
> It introduces an intermediate transfer between peripherals and STM32 DMA.
> This intermediate transfer is triggered by SW for single M2D transfer and
> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
> 
> A generic SRAM allocator is used for this intermediate buffer
> Each DMA channel will be able to define its SRAM needs to achieve chaining
> feature : (2 ^ order) * PAGE_SIZE.
> For cyclic, SRAM buffer is derived from period length (rounded on
> PAGE_SIZE).

So IIUC, you chain two dma txns together and transfer data via an SRAM?

> 
> Signed-off-by: Pierre-Yves MORDRET <pierre-yves.mordret@st.com>
> ---
>   Version history:
>     v3:
>        * Solve KBuild warning
>     v2:
>     v1:
>        * Initial
> ---
> ---
>  drivers/dma/stm32-dma.c | 879 ++++++++++++++++++++++++++++++++++++++++++------

that is a lot of change for a driver, consider splitting it up
logically in smaller changes...

>  1 file changed, 772 insertions(+), 107 deletions(-)
> 
> diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
> index 379e8d5..85e81c4 100644
> --- a/drivers/dma/stm32-dma.c
> +++ b/drivers/dma/stm32-dma.c
> @@ -15,11 +15,14 @@
>  #include <linux/dmaengine.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/err.h>
> +#include <linux/genalloc.h>
>  #include <linux/init.h>
> +#include <linux/iopoll.h>
>  #include <linux/jiffies.h>
>  #include <linux/list.h>
>  #include <linux/module.h>
>  #include <linux/of.h>
> +#include <linux/of_address.h>
>  #include <linux/of_device.h>
>  #include <linux/of_dma.h>
>  #include <linux/platform_device.h>
> @@ -118,6 +121,7 @@
>  #define STM32_DMA_FIFO_THRESHOLD_FULL			0x03
>  
>  #define STM32_DMA_MAX_DATA_ITEMS	0xffff
> +#define STM32_DMA_SRAM_GRANULARITY	PAGE_SIZE
>  /*
>   * Valid transfer starts from @0 to @0xFFFE leading to unaligned scatter
>   * gather at boundary. Thus it's safer to round down this value on FIFO
> @@ -135,6 +139,12 @@
>  /* DMA Features */
>  #define STM32_DMA_THRESHOLD_FTR_MASK	GENMASK(1, 0)
>  #define STM32_DMA_THRESHOLD_FTR_GET(n)	((n) & STM32_DMA_THRESHOLD_FTR_MASK)
> +#define STM32_DMA_MDMA_CHAIN_FTR_MASK	BIT(2)
> +#define STM32_DMA_MDMA_CHAIN_FTR_GET(n)	(((n) & STM32_DMA_MDMA_CHAIN_FTR_MASK) \
> +					 >> 2)
> +#define STM32_DMA_MDMA_SRAM_SIZE_MASK	GENMASK(4, 3)
> +#define STM32_DMA_MDMA_SRAM_SIZE_GET(n)	(((n) & STM32_DMA_MDMA_SRAM_SIZE_MASK) \
> +					 >> 3)
>  
>  enum stm32_dma_width {
>  	STM32_DMA_BYTE,
> @@ -176,15 +186,31 @@ struct stm32_dma_chan_reg {
>  	u32 dma_sfcr;
>  };
>  
> +struct stm32_dma_mdma_desc {
> +	struct sg_table sgt;
> +	struct dma_async_tx_descriptor *desc;
> +};
> +
> +struct stm32_dma_mdma {
> +	struct dma_chan *chan;
> +	enum dma_transfer_direction dir;
> +	dma_addr_t sram_buf;
> +	u32 sram_period;
> +	u32 num_sgs;
> +};
> +
>  struct stm32_dma_sg_req {
> -	u32 len;
> +	struct scatterlist stm32_sgl_req;
>  	struct stm32_dma_chan_reg chan_reg;
> +	struct stm32_dma_mdma_desc m_desc;
>  };
>  
>  struct stm32_dma_desc {
>  	struct virt_dma_desc vdesc;
>  	bool cyclic;
>  	u32 num_sgs;
> +	dma_addr_t dma_buf;
> +	void *dma_buf_cpu;
>  	struct stm32_dma_sg_req sg_req[];
>  };
>  
> @@ -201,6 +227,10 @@ struct stm32_dma_chan {
>  	u32 threshold;
>  	u32 mem_burst;
>  	u32 mem_width;
> +	struct stm32_dma_mdma mchan;
> +	u32 use_mdma;
> +	u32 sram_size;
> +	u32 residue_after_drain;
>  };
>  
>  struct stm32_dma_device {
> @@ -210,6 +240,7 @@ struct stm32_dma_device {
>  	struct reset_control *rst;
>  	bool mem2mem;
>  	struct stm32_dma_chan chan[STM32_DMA_MAX_CHANNELS];
> +	struct gen_pool *sram_pool;
>  };
>  
>  static struct stm32_dma_device *stm32_dma_get_dev(struct stm32_dma_chan *chan)
> @@ -497,11 +528,15 @@ static void stm32_dma_stop(struct stm32_dma_chan *chan)
>  static int stm32_dma_terminate_all(struct dma_chan *c)
>  {
>  	struct stm32_dma_chan *chan = to_stm32_dma_chan(c);
> +	struct stm32_dma_mdma *mchan = &chan->mchan;
>  	unsigned long flags;
>  	LIST_HEAD(head);
>  
>  	spin_lock_irqsave(&chan->vchan.lock, flags);
>  
> +	if (chan->use_mdma)
> +		dmaengine_terminate_async(mchan->chan);
> +
>  	if (chan->busy) {
>  		stm32_dma_stop(chan);
>  		chan->desc = NULL;
> @@ -514,9 +549,96 @@ static int stm32_dma_terminate_all(struct dma_chan *c)
>  	return 0;
>  }
>  
> +static u32 stm32_dma_get_remaining_bytes(struct stm32_dma_chan *chan)
> +{
> +	u32 dma_scr, width, ndtr;
> +	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
> +
> +	dma_scr = stm32_dma_read(dmadev, STM32_DMA_SCR(chan->id));
> +	width = STM32_DMA_SCR_PSIZE_GET(dma_scr);
> +	ndtr = stm32_dma_read(dmadev, STM32_DMA_SNDTR(chan->id));
> +
> +	return ndtr << width;
> +}
> +
> +static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan)
> +{
> +	struct stm32_dma_mdma *mchan = &chan->mchan;
> +	struct stm32_dma_sg_req *sg_req;
> +	struct dma_device *ddev = mchan->chan->device;
> +	struct dma_async_tx_descriptor *desc = NULL;
> +	enum dma_status status;
> +	dma_addr_t src_buf, dst_buf;
> +	u32 mdma_residue, mdma_wrote, dma_to_write, len;
> +	struct dma_tx_state state;
> +	int ret;
> +
> +	/* DMA/MDMA chain: drain remaining data in SRAM */
> +
> +	/* Get the residue on MDMA side */
> +	status = dmaengine_tx_status(mchan->chan, mchan->chan->cookie, &state);
> +	if (status == DMA_COMPLETE)
> +		return status;
> +
> +	mdma_residue = state.residue;
> +	sg_req = &chan->desc->sg_req[chan->next_sg - 1];
> +	len = sg_dma_len(&sg_req->stm32_sgl_req);
> +
> +	/*
> +	 * Total = mdma blocks * sram_period + rest (< sram_period)
> +	 * so mdma blocks * sram_period = len - mdma residue - rest
> +	 */
> +	mdma_wrote = len - mdma_residue - (len % mchan->sram_period);
> +
> +	/* Remaining data stuck in SRAM */
> +	dma_to_write = mchan->sram_period - stm32_dma_get_remaining_bytes(chan);
> +	if (dma_to_write > 0) {
> +		/* Stop DMA current operation */
> +		stm32_dma_disable_chan(chan);
> +
> +		/* Terminate current MDMA to initiate a new one */
> +		dmaengine_terminate_all(mchan->chan);
> +
> +		/* Double buffer management */
> +		src_buf = mchan->sram_buf +
> +			  ((mdma_wrote / mchan->sram_period) & 0x1) *
> +			  mchan->sram_period;
> +		dst_buf = sg_dma_address(&sg_req->stm32_sgl_req) + mdma_wrote;
> +
> +		desc = ddev->device_prep_dma_memcpy(mchan->chan,
> +						    dst_buf, src_buf,
> +						    dma_to_write,
> +						    DMA_PREP_INTERRUPT);

why would you do that?

If at all you need to create another txn, I think it would be good to
prepare a new descriptor and chain it, not call the dmaengine APIs.
Pierre Yves MORDRET Oct. 9, 2018, 8:40 a.m. UTC | #2
On 10/07/2018 06:00 PM, Vinod wrote:
> On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
>> This patch adds support of DMA/MDMA chaining support.
>> It introduces an intermediate transfer between peripherals and STM32 DMA.
>> This intermediate transfer is triggered by SW for single M2D transfer and
>> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
>>
>> A generic SRAM allocator is used for this intermediate buffer
>> Each DMA channel will be able to define its SRAM needs to achieve chaining
>> feature : (2 ^ order) * PAGE_SIZE.
>> For cyclic, SRAM buffer is derived from period length (rounded on
>> PAGE_SIZE).
> 
> So IIUC, you chain two dma txns together and transfer data via an SRAM?

Correct. One DMA is DMAv2 (stm32-dma) and the other is MDMA (stm32-mdma).
The intermediate transfer is between device and memory.
This intermediate transfer uses SRAM.

> 
>>
>> Signed-off-by: Pierre-Yves MORDRET <pierre-yves.mordret@st.com>
>> ---
>>   Version history:
>>     v3:
>>        * Solve KBuild warning
>>     v2:
>>     v1:
>>        * Initial
>> ---
>> ---
>>  drivers/dma/stm32-dma.c | 879 ++++++++++++++++++++++++++++++++++++++++++------
> 
> that is a lot of change for a driver, consider splitting it up
> logically in smaller changes...
> 

This feature is rather monolithic. Difficult to split up.
All the code is required at once.

>>  1 file changed, 772 insertions(+), 107 deletions(-)
>>
>> diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
>> index 379e8d5..85e81c4 100644
>> --- a/drivers/dma/stm32-dma.c
>> +++ b/drivers/dma/stm32-dma.c
>> @@ -15,11 +15,14 @@
>>  #include <linux/dmaengine.h>
>>  #include <linux/dma-mapping.h>
>>  #include <linux/err.h>
>> +#include <linux/genalloc.h>
>>  #include <linux/init.h>
>> +#include <linux/iopoll.h>
>>  #include <linux/jiffies.h>
>>  #include <linux/list.h>
>>  #include <linux/module.h>
>>  #include <linux/of.h>
>> +#include <linux/of_address.h>
>>  #include <linux/of_device.h>
>>  #include <linux/of_dma.h>
>>  #include <linux/platform_device.h>
>> @@ -118,6 +121,7 @@
>>  #define STM32_DMA_FIFO_THRESHOLD_FULL			0x03
>>  
>>  #define STM32_DMA_MAX_DATA_ITEMS	0xffff
>> +#define STM32_DMA_SRAM_GRANULARITY	PAGE_SIZE
>>  /*
>>   * Valid transfer starts from @0 to @0xFFFE leading to unaligned scatter
>>   * gather at boundary. Thus it's safer to round down this value on FIFO
>> @@ -135,6 +139,12 @@
>>  /* DMA Features */
>>  #define STM32_DMA_THRESHOLD_FTR_MASK	GENMASK(1, 0)
>>  #define STM32_DMA_THRESHOLD_FTR_GET(n)	((n) & STM32_DMA_THRESHOLD_FTR_MASK)
>> +#define STM32_DMA_MDMA_CHAIN_FTR_MASK	BIT(2)
>> +#define STM32_DMA_MDMA_CHAIN_FTR_GET(n)	(((n) & STM32_DMA_MDMA_CHAIN_FTR_MASK) \
>> +					 >> 2)
>> +#define STM32_DMA_MDMA_SRAM_SIZE_MASK	GENMASK(4, 3)
>> +#define STM32_DMA_MDMA_SRAM_SIZE_GET(n)	(((n) & STM32_DMA_MDMA_SRAM_SIZE_MASK) \
>> +					 >> 3)
>>  
>>  enum stm32_dma_width {
>>  	STM32_DMA_BYTE,
>> @@ -176,15 +186,31 @@ struct stm32_dma_chan_reg {
>>  	u32 dma_sfcr;
>>  };
>>  
>> +struct stm32_dma_mdma_desc {
>> +	struct sg_table sgt;
>> +	struct dma_async_tx_descriptor *desc;
>> +};
>> +
>> +struct stm32_dma_mdma {
>> +	struct dma_chan *chan;
>> +	enum dma_transfer_direction dir;
>> +	dma_addr_t sram_buf;
>> +	u32 sram_period;
>> +	u32 num_sgs;
>> +};
>> +
>>  struct stm32_dma_sg_req {
>> -	u32 len;
>> +	struct scatterlist stm32_sgl_req;
>>  	struct stm32_dma_chan_reg chan_reg;
>> +	struct stm32_dma_mdma_desc m_desc;
>>  };
>>  
>>  struct stm32_dma_desc {
>>  	struct virt_dma_desc vdesc;
>>  	bool cyclic;
>>  	u32 num_sgs;
>> +	dma_addr_t dma_buf;
>> +	void *dma_buf_cpu;
>>  	struct stm32_dma_sg_req sg_req[];
>>  };
>>  
>> @@ -201,6 +227,10 @@ struct stm32_dma_chan {
>>  	u32 threshold;
>>  	u32 mem_burst;
>>  	u32 mem_width;
>> +	struct stm32_dma_mdma mchan;
>> +	u32 use_mdma;
>> +	u32 sram_size;
>> +	u32 residue_after_drain;
>>  };
>>  
>>  struct stm32_dma_device {
>> @@ -210,6 +240,7 @@ struct stm32_dma_device {
>>  	struct reset_control *rst;
>>  	bool mem2mem;
>>  	struct stm32_dma_chan chan[STM32_DMA_MAX_CHANNELS];
>> +	struct gen_pool *sram_pool;
>>  };
>>  
>>  static struct stm32_dma_device *stm32_dma_get_dev(struct stm32_dma_chan *chan)
>> @@ -497,11 +528,15 @@ static void stm32_dma_stop(struct stm32_dma_chan *chan)
>>  static int stm32_dma_terminate_all(struct dma_chan *c)
>>  {
>>  	struct stm32_dma_chan *chan = to_stm32_dma_chan(c);
>> +	struct stm32_dma_mdma *mchan = &chan->mchan;
>>  	unsigned long flags;
>>  	LIST_HEAD(head);
>>  
>>  	spin_lock_irqsave(&chan->vchan.lock, flags);
>>  
>> +	if (chan->use_mdma)
>> +		dmaengine_terminate_async(mchan->chan);
>> +
>>  	if (chan->busy) {
>>  		stm32_dma_stop(chan);
>>  		chan->desc = NULL;
>> @@ -514,9 +549,96 @@ static int stm32_dma_terminate_all(struct dma_chan *c)
>>  	return 0;
>>  }
>>  
>> +static u32 stm32_dma_get_remaining_bytes(struct stm32_dma_chan *chan)
>> +{
>> +	u32 dma_scr, width, ndtr;
>> +	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
>> +
>> +	dma_scr = stm32_dma_read(dmadev, STM32_DMA_SCR(chan->id));
>> +	width = STM32_DMA_SCR_PSIZE_GET(dma_scr);
>> +	ndtr = stm32_dma_read(dmadev, STM32_DMA_SNDTR(chan->id));
>> +
>> +	return ndtr << width;
>> +}
>> +
>> +static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan)
>> +{
>> +	struct stm32_dma_mdma *mchan = &chan->mchan;
>> +	struct stm32_dma_sg_req *sg_req;
>> +	struct dma_device *ddev = mchan->chan->device;
>> +	struct dma_async_tx_descriptor *desc = NULL;
>> +	enum dma_status status;
>> +	dma_addr_t src_buf, dst_buf;
>> +	u32 mdma_residue, mdma_wrote, dma_to_write, len;
>> +	struct dma_tx_state state;
>> +	int ret;
>> +
>> +	/* DMA/MDMA chain: drain remaining data in SRAM */
>> +
>> +	/* Get the residue on MDMA side */
>> +	status = dmaengine_tx_status(mchan->chan, mchan->chan->cookie, &state);
>> +	if (status == DMA_COMPLETE)
>> +		return status;
>> +
>> +	mdma_residue = state.residue;
>> +	sg_req = &chan->desc->sg_req[chan->next_sg - 1];
>> +	len = sg_dma_len(&sg_req->stm32_sgl_req);
>> +
>> +	/*
>> +	 * Total = mdma blocks * sram_period + rest (< sram_period)
>> +	 * so mdma blocks * sram_period = len - mdma residue - rest
>> +	 */
>> +	mdma_wrote = len - mdma_residue - (len % mchan->sram_period);
>> +
>> +	/* Remaining data stuck in SRAM */
>> +	dma_to_write = mchan->sram_period - stm32_dma_get_remaining_bytes(chan);
>> +	if (dma_to_write > 0) {
>> +		/* Stop DMA current operation */
>> +		stm32_dma_disable_chan(chan);
>> +
>> +		/* Terminate current MDMA to initiate a new one */
>> +		dmaengine_terminate_all(mchan->chan);
>> +
>> +		/* Double buffer management */
>> +		src_buf = mchan->sram_buf +
>> +			  ((mdma_wrote / mchan->sram_period) & 0x1) *
>> +			  mchan->sram_period;
>> +		dst_buf = sg_dma_address(&sg_req->stm32_sgl_req) + mdma_wrote;
>> +
>> +		desc = ddev->device_prep_dma_memcpy(mchan->chan,
>> +						    dst_buf, src_buf,
>> +						    dma_to_write,
>> +						    DMA_PREP_INTERRUPT);
> 
> why would you do that?
> 
> If at all you need to create anothe txn, I think it would be good to
> prepare a new descriptor and chain it, not call the dmaengine APIs..
> 

In this use case, DMAv2 is configured in cyclic mode because this DMA does not
support hardware LLI, only software LLI. We use cyclic mode purely for performance reasons.
This very last txn flushes the remaining bytes stuck in SRAM.
I don't believe I can chain the cyclic transfer and this last txn.
Vinod Koul Oct. 10, 2018, 4:03 a.m. UTC | #3
On 09-10-18, 10:40, Pierre Yves MORDRET wrote:
> 
> 
> On 10/07/2018 06:00 PM, Vinod wrote:
> > On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
> >> This patch adds support of DMA/MDMA chaining support.
> >> It introduces an intermediate transfer between peripherals and STM32 DMA.
> >> This intermediate transfer is triggered by SW for single M2D transfer and
> >> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
> >>
> >> A generic SRAM allocator is used for this intermediate buffer
> >> Each DMA channel will be able to define its SRAM needs to achieve chaining
> >> feature : (2 ^ order) * PAGE_SIZE.
> >> For cyclic, SRAM buffer is derived from period length (rounded on
> >> PAGE_SIZE).
> > 
> > So IIUC, you chain two dma txns together and transfer data via an SRAM?
> 
> Correct. one DMA is DMAv2 (stm32-dma) and the other is MDMA(stm32-mdma).
> Intermediate transfer is between device and memory.
> This intermediate transfer is using SDRAM.

Ah, so you use dma calls to set up mdma xfers? I don't think that is a
good idea. How do you know you should use mdma for the subsequent transfer?


> >>  drivers/dma/stm32-dma.c | 879 ++++++++++++++++++++++++++++++++++++++++++------
> > 
> > that is a lot of change for a driver, consider splitting it up
> > logically in smaller changes...
> > 
> 
> This feature is rather monolithic. Difficult to split up.
> All the code is required at once.

It can be enabled at last but split up logically. Intrusive changes to a
driver make it hard to review..
Pierre Yves MORDRET Oct. 10, 2018, 7:02 a.m. UTC | #4
On 10/10/2018 06:03 AM, Vinod wrote:
> On 09-10-18, 10:40, Pierre Yves MORDRET wrote:
>>
>>
>> On 10/07/2018 06:00 PM, Vinod wrote:
>>> On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
>>>> This patch adds support of DMA/MDMA chaining support.
>>>> It introduces an intermediate transfer between peripherals and STM32 DMA.
>>>> This intermediate transfer is triggered by SW for single M2D transfer and
>>>> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
>>>>
>>>> A generic SRAM allocator is used for this intermediate buffer
>>>> Each DMA channel will be able to define its SRAM needs to achieve chaining
>>>> feature : (2 ^ order) * PAGE_SIZE.
>>>> For cyclic, SRAM buffer is derived from period length (rounded on
>>>> PAGE_SIZE).
>>>
>>> So IIUC, you chain two dma txns together and transfer data via an SRAM?
>>
>> Correct. one DMA is DMAv2 (stm32-dma) and the other is MDMA(stm32-mdma).
>> Intermediate transfer is between device and memory.
>> This intermediate transfer is using SDRAM.
> 
> Ah so you use dma calls to setup mdma xtfers? I dont think that is a
> good idea. How do you know you should use mdma for subsequent transfer?
> 

When the user's bindings request chaining, intermediate MDMA transfers are always
triggered.
For instance, suppose a user requests a Dev2Mem transfer with chaining. From the client's
pov this is still a prep_slave_sg. Internally DMAv2 is set up in cyclic mode (in
fact in double buffer mode => 2 buffers of PAGE_SIZE/2) and the destination is SRAM.
DMAv2 will flip/flop between those 2 buffers.
At the same time the DMAv2 driver prepares an MDMA SG that will fetch data from those
2 buffers in SRAM and fill the final destination memory.

> 
>>>>  drivers/dma/stm32-dma.c | 879 ++++++++++++++++++++++++++++++++++++++++++------
>>>
>>> that is a lot of change for a driver, consider splitting it up
>>> logically in smaller changes...
>>>
>>
>> This feature is rather monolithic. Difficult to split up.
>> All the code is required at once.
> 
> It can be enabled at last but split up logically. Intrusive changes to a
> driver make it hard to review..
> 
OK. I will think about how to proceed.
Pierre Yves MORDRET Oct. 12, 2018, 9:03 a.m. UTC | #5
Hi all,

I should add more explanations on how this feature works.

As I said earlier in my cover letter, the STM32 DMA doesn't have the ability to
generate burst transfers on the DDR, as it only embeds a 4-word FIFO although the
minimal burst length on the DDR is 8 words.
This is a big flaw when performance is at stake.
Moreover, some IPs, like the camera, are very demanding in terms of real-time
constraints. The amount of data is huge (5Mpix) and DMAv2 is limited to 256KiB
for a single transfer, which leads to overruns at Camera IP level when loading new
DMA descriptors: DMAv2 doesn't implement Hw LLI, this is pure sw.

Thus our Hw has been devised to overcome this weakness. The MDMA FIFO is bigger
(128 bytes) and its burst length is better suited to such performance-critical cases.
Unfortunately MDMA can't access some IPs like I2C, Camera, UART, SPI, ...
This is why DMAv2 is used to transfer data between the device and SRAM (DMAv2 can
burst on SRAM) and MDMA to transfer data between SRAM and DDR.
For M2D: DDR --> MDMA --> SRAM --> DMA  --> IP
For D2M: IP  --> DMA  --> SRAM --> MDMA --> DDR

When a user requests a DMA txn (SG or cyclic):
* DMAv2 will be set in double buffer cyclic mode. DMAv2 will toggle between 2
buffers in SRAM (size defined by device tree: 4KiB, 8KiB, 16KiB, 32KiB). In
cyclic mode DMAv2 will reload its configuration automatically without sw support.
* On the other side MDMA will be used to fetch those 2 buffers with a Hw LLI to
fill up the final destination DDR buffer.

DMAv2 and MDMA synchronization is purely Hw! On the DMAv2 side there are NO
interrupts generated at CPU level. Internally the DMAv2 transfer complete irq is
connected to the MDMA request line. On the MDMA side, it has the ability to
acknowledge the DMAv2 irq through a programmable register.
This means that when DMAv2 has finished an SRAM buffer, a transfer complete is
generated on the MDMA request line. While MDMA fetches this newly filled SRAM buffer,
DMAv2 fills a new SRAM buffer. This process goes on until the DDR is filled
up. MDMA will fire the final transfer complete that will notify the user of
transfer completion.

Hope it clarifies.
Regards


On 10/10/2018 09:02 AM, Pierre Yves MORDRET wrote:
> 
> 
> On 10/10/2018 06:03 AM, Vinod wrote:
>> On 09-10-18, 10:40, Pierre Yves MORDRET wrote:
>>>
>>>
>>> On 10/07/2018 06:00 PM, Vinod wrote:
>>>> On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
>>>>> This patch adds support of DMA/MDMA chaining support.
>>>>> It introduces an intermediate transfer between peripherals and STM32 DMA.
>>>>> This intermediate transfer is triggered by SW for single M2D transfer and
>>>>> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
>>>>>
>>>>> A generic SRAM allocator is used for this intermediate buffer
>>>>> Each DMA channel will be able to define its SRAM needs to achieve chaining
>>>>> feature : (2 ^ order) * PAGE_SIZE.
>>>>> For cyclic, SRAM buffer is derived from period length (rounded on
>>>>> PAGE_SIZE).
>>>>
>>>> So IIUC, you chain two dma txns together and transfer data via an SRAM?
>>>
>>> Correct. one DMA is DMAv2 (stm32-dma) and the other is MDMA(stm32-mdma).
>>> Intermediate transfer is between device and memory.
>>> This intermediate transfer is using SDRAM.
>>
>> Ah so you use dma calls to setup mdma xtfers? I dont think that is a
>> good idea. How do you know you should use mdma for subsequent transfer?
>>
> 
> When user bindings told to setup chaining intermediate MDMA transfers are always
> triggers.
> For instance if a user requests a Dev2Mem transfer with chaining. From client
> pov this is still a prep_slave_sg. Internally DMAv2 is setup in cyclic mode (in
> double buffer mode indeed => 2 buffer of PAGE_SIZE/2) and destination is SDRAM.
> DMAv2 will flip/flop on those 2 buffers.
> At the same time DMAv2 driver prepares a MDMA SG that will fetch data from those
> 2 buffers in SDRAM and fills final destination memory.
> 
>>
>>>>>  drivers/dma/stm32-dma.c | 879 ++++++++++++++++++++++++++++++++++++++++++------
>>>>
>>>> that is a lot of change for a driver, consider splitting it up
>>>> logically in smaller changes...
>>>>
>>>
>>> This feature is rather monolithic. Difficult to split up.
>>> All the code is required at once.
>>
>> It can be enabled at last but split up logically. Intrusive changes to a
>> driver make it hard to review..
>>
> Ok. I will to think about it how to proceed.
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Vinod Koul Oct. 15, 2018, 5:14 p.m. UTC | #6
On 10-10-18, 09:02, Pierre Yves MORDRET wrote:
> 
> 
> On 10/10/2018 06:03 AM, Vinod wrote:
> > On 09-10-18, 10:40, Pierre Yves MORDRET wrote:
> >>
> >>
> >> On 10/07/2018 06:00 PM, Vinod wrote:
> >>> On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
> >>>> This patch adds support of DMA/MDMA chaining support.
> >>>> It introduces an intermediate transfer between peripherals and STM32 DMA.
> >>>> This intermediate transfer is triggered by SW for single M2D transfer and
> >>>> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
> >>>>
> >>>> A generic SRAM allocator is used for this intermediate buffer
> >>>> Each DMA channel will be able to define its SRAM needs to achieve chaining
> >>>> feature : (2 ^ order) * PAGE_SIZE.
> >>>> For cyclic, SRAM buffer is derived from period length (rounded on
> >>>> PAGE_SIZE).
> >>>
> >>> So IIUC, you chain two dma txns together and transfer data via an SRAM?
> >>
> >> Correct. one DMA is DMAv2 (stm32-dma) and the other is MDMA(stm32-mdma).
> >> Intermediate transfer is between device and memory.
> >> This intermediate transfer is using SDRAM.
> > 
> > Ah so you use dma calls to setup mdma xtfers? I dont think that is a
> > good idea. How do you know you should use mdma for subsequent transfer?
> > 
> 
> When user bindings told to setup chaining intermediate MDMA transfers are always
> triggers.
> For instance if a user requests a Dev2Mem transfer with chaining. From client
> pov this is still a prep_slave_sg. Internally DMAv2 is setup in cyclic mode (in
> double buffer mode indeed => 2 buffer of PAGE_SIZE/2) and destination is SDRAM.
> DMAv2 will flip/flop on those 2 buffers.
> At the same time DMAv2 driver prepares a MDMA SG that will fetch data from those
> 2 buffers in SDRAM and fills final destination memory.

What I am not able to follow is why it needs to be internal; why should
the client not set up the two transfers and trigger them?
Pierre Yves MORDRET Oct. 16, 2018, 9:19 a.m. UTC | #7
On 10/15/18 7:14 PM, Vinod wrote:
> On 10-10-18, 09:02, Pierre Yves MORDRET wrote:
>>
>>
>> On 10/10/2018 06:03 AM, Vinod wrote:
>>> On 09-10-18, 10:40, Pierre Yves MORDRET wrote:
>>>>
>>>>
>>>> On 10/07/2018 06:00 PM, Vinod wrote:
>>>>> On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
>>>>>> This patch adds support of DMA/MDMA chaining support.
>>>>>> It introduces an intermediate transfer between peripherals and STM32 DMA.
>>>>>> This intermediate transfer is triggered by SW for single M2D transfer and
>>>>>> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
>>>>>>
>>>>>> A generic SRAM allocator is used for this intermediate buffer
>>>>>> Each DMA channel will be able to define its SRAM needs to achieve chaining
>>>>>> feature : (2 ^ order) * PAGE_SIZE.
>>>>>> For cyclic, SRAM buffer is derived from period length (rounded on
>>>>>> PAGE_SIZE).
>>>>>
>>>>> So IIUC, you chain two dma txns together and transfer data via an SRAM?
>>>>
>>>> Correct. one DMA is DMAv2 (stm32-dma) and the other is MDMA(stm32-mdma).
>>>> Intermediate transfer is between device and memory.
>>>> This intermediate transfer is using SDRAM.
>>>
>>> Ah so you use dma calls to setup mdma xtfers? I dont think that is a
>>> good idea. How do you know you should use mdma for subsequent transfer?
>>>
>>
>> When user bindings told to setup chaining intermediate MDMA transfers are always
>> triggers.
>> For instance if a user requests a Dev2Mem transfer with chaining. From client
>> pov this is still a prep_slave_sg. Internally DMAv2 is setup in cyclic mode (in
>> double buffer mode indeed => 2 buffer of PAGE_SIZE/2) and destination is SDRAM.
>> DMAv2 will flip/flop on those 2 buffers.
>> At the same time DMAv2 driver prepares a MDMA SG that will fetch data from those
>> 2 buffers in SDRAM and fills final destination memory.
> 
> I am not able to follow is why does it need to be internal, why should
> the client not set the two transfers and trigger them?
> 

The client may or may not use chaining: this is defined within the DT. The API and
behavior are the same at the client driver level. Moreover, the driver exposes only
DMAv2 and not both DMAv2 and MDMA. This is totally hidden from the client. If the
client set up both, this would imply changing all drivers that may want to use
chaining, and even having them deal with DMAv2 and MDMA at their level.
Since DMAv2 deals with MDMA, all drivers remain the same as before. No changes required.

Regards
Vinod Koul Oct. 16, 2018, 2:44 p.m. UTC | #8
On 16-10-18, 11:19, Pierre Yves MORDRET wrote:
> 
> 
> On 10/15/18 7:14 PM, Vinod wrote:
> > On 10-10-18, 09:02, Pierre Yves MORDRET wrote:
> >>
> >>
> >> On 10/10/2018 06:03 AM, Vinod wrote:
> >>> On 09-10-18, 10:40, Pierre Yves MORDRET wrote:
> >>>>
> >>>>
> >>>> On 10/07/2018 06:00 PM, Vinod wrote:
> >>>>> On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
> >>>>>> This patch adds support of DMA/MDMA chaining support.
> >>>>>> It introduces an intermediate transfer between peripherals and STM32 DMA.
> >>>>>> This intermediate transfer is triggered by SW for single M2D transfer and
> >>>>>> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
> >>>>>>
> >>>>>> A generic SRAM allocator is used for this intermediate buffer
> >>>>>> Each DMA channel will be able to define its SRAM needs to achieve chaining
> >>>>>> feature : (2 ^ order) * PAGE_SIZE.
> >>>>>> For cyclic, SRAM buffer is derived from period length (rounded on
> >>>>>> PAGE_SIZE).
> >>>>>
> >>>>> So IIUC, you chain two dma txns together and transfer data via an SRAM?
> >>>>
> >>>> Correct. one DMA is DMAv2 (stm32-dma) and the other is MDMA(stm32-mdma).
> >>>> Intermediate transfer is between device and memory.
> >>>> This intermediate transfer is using SDRAM.
> >>>
> >>> Ah so you use dma calls to setup mdma xtfers? I dont think that is a
> >>> good idea. How do you know you should use mdma for subsequent transfer?
> >>>
> >>
> >> When user bindings told to setup chaining intermediate MDMA transfers are always
> >> triggers.
> >> For instance if a user requests a Dev2Mem transfer with chaining. From client
> >> pov this is still a prep_slave_sg. Internally DMAv2 is setup in cyclic mode (in
> >> double buffer mode indeed => 2 buffer of PAGE_SIZE/2) and destination is SDRAM.
> >> DMAv2 will flip/flop on those 2 buffers.
> >> At the same time DMAv2 driver prepares a MDMA SG that will fetch data from those
> >> 2 buffers in SDRAM and fills final destination memory.
> > 
> > I am not able to follow is why does it need to be internal, why should
> > the client not set the two transfers and trigger them?
> > 
> 
> Client may use or not chaining: defined within DT. API and dynamic are same at

That should be up to the client... As a dmaengine driver you should enable
data transfer from src to dstn.

> driver client level. Moreover driver exposes only DMAv2 and not both DMAv2 and
> MDMA. This is totally hidden for client. If client sets both this would imply

Why should a controller be hidden from the user? I don't see why that would
be a good thing.

> changing all drivers that may want use chaining. Even more to deal with DMAv2
> and MDMA at its level.
> Since DMAv2 deals with MDMA, all drivers are same as before. no changes required.

It is not about changes, it is about the SW model you want to have.

The intermediate SRAM transfers should not be made within the DMAengine
driver; the client can choose to have two transfers and couple them or not, it is
up to them to choose. Sorry, I do not like this abstraction and would like
to see a cleaner approach.
Pierre Yves MORDRET Oct. 19, 2018, 9:21 a.m. UTC | #9
On 10/16/18 4:44 PM, Vinod wrote:
> On 16-10-18, 11:19, Pierre Yves MORDRET wrote:
>>
>>
>> On 10/15/18 7:14 PM, Vinod wrote:
>>> On 10-10-18, 09:02, Pierre Yves MORDRET wrote:
>>>>
>>>>
>>>> On 10/10/2018 06:03 AM, Vinod wrote:
>>>>> On 09-10-18, 10:40, Pierre Yves MORDRET wrote:
>>>>>>
>>>>>>
>>>>>> On 10/07/2018 06:00 PM, Vinod wrote:
>>>>>>> On 28-09-18, 15:01, Pierre-Yves MORDRET wrote:
>>>>>>>> This patch adds support of DMA/MDMA chaining support.
>>>>>>>> It introduces an intermediate transfer between peripherals and STM32 DMA.
>>>>>>>> This intermediate transfer is triggered by SW for single M2D transfer and
>>>>>>>> by STM32 DMA IP for all other modes (sg, cyclic) and direction (D2M).
>>>>>>>>
>>>>>>>> A generic SRAM allocator is used for this intermediate buffer
>>>>>>>> Each DMA channel will be able to define its SRAM needs to achieve chaining
>>>>>>>> feature : (2 ^ order) * PAGE_SIZE.
>>>>>>>> For cyclic, SRAM buffer is derived from period length (rounded on
>>>>>>>> PAGE_SIZE).
>>>>>>>
>>>>>>> So IIUC, you chain two dma txns together and transfer data via an SRAM?
>>>>>>
>>>>>> Correct. one DMA is DMAv2 (stm32-dma) and the other is MDMA(stm32-mdma).
>>>>>> Intermediate transfer is between device and memory.
>>>>>> This intermediate transfer is using SDRAM.
>>>>>
>>>>> Ah so you use dma calls to setup mdma xtfers? I dont think that is a
>>>>> good idea. How do you know you should use mdma for subsequent transfer?
>>>>>
>>>>
>>>> When user bindings told to setup chaining intermediate MDMA transfers are always
>>>> triggers.
>>>> For instance if a user requests a Dev2Mem transfer with chaining. From client
>>>> pov this is still a prep_slave_sg. Internally DMAv2 is setup in cyclic mode (in
>>>> double buffer mode indeed => 2 buffer of PAGE_SIZE/2) and destination is SDRAM.
>>>> DMAv2 will flip/flop on those 2 buffers.
>>>> At the same time DMAv2 driver prepares a MDMA SG that will fetch data from those
>>>> 2 buffers in SDRAM and fills final destination memory.
>>>
>>> I am not able to follow is why does it need to be internal, why should
>>> the client not set the two transfers and trigger them?
>>>
>>
>> Client may use or not chaining: defined within DT. API and dynamic are same at
> 
> That should be upto client... As a dmaengine driver you should enable
> data transfer from src to dstn.
> 
>> driver client level. Moreover driver exposes only DMAv2 and not both DMAv2 and
>> MDMA. This is totally hidden for client. If client sets both this would imply
> 
> Why should a controller be hidden from user, I dont see why that would
> be a good thing
> 
>> changing all drivers that may want use chaining. Even more to deal with DMAv2
>> and MDMA at its level.
>> Since DMAv2 deals with MDMA, all drivers are same as before. no changes required.
> 
> It is not about changes, it is about the SW model you want to have.
> 
> The intermediate SRAM transfers should not be made within DMAengine
> driver, client can chose to have two transfers and couple or not, it is
> upto them to choose. Sorry I do not like this abstraction and would like
> to see a cleaner approach
> 

What we have done is to hide all the complexity related to the DMA engine:
synchronization, residue and many other topics are solved by this approach. If it
is up to the client to perform the intermediate transfer, each client driver using
chaining will need to duplicate the required sw.
This approach is presented as a feature from the driver's pov.

Regards
diff mbox series

Patch

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index 379e8d5..85e81c4 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -15,11 +15,14 @@ 
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/err.h>
+#include <linux/genalloc.h>
 #include <linux/init.h>
+#include <linux/iopoll.h>
 #include <linux/jiffies.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_dma.h>
 #include <linux/platform_device.h>
@@ -118,6 +121,7 @@ 
 #define STM32_DMA_FIFO_THRESHOLD_FULL			0x03
 
 #define STM32_DMA_MAX_DATA_ITEMS	0xffff
+#define STM32_DMA_SRAM_GRANULARITY	PAGE_SIZE
 /*
  * Valid transfer starts from @0 to @0xFFFE leading to unaligned scatter
  * gather at boundary. Thus it's safer to round down this value on FIFO
@@ -135,6 +139,12 @@ 
 /* DMA Features */
 #define STM32_DMA_THRESHOLD_FTR_MASK	GENMASK(1, 0)
 #define STM32_DMA_THRESHOLD_FTR_GET(n)	((n) & STM32_DMA_THRESHOLD_FTR_MASK)
+#define STM32_DMA_MDMA_CHAIN_FTR_MASK	BIT(2)
+#define STM32_DMA_MDMA_CHAIN_FTR_GET(n)	(((n) & STM32_DMA_MDMA_CHAIN_FTR_MASK) \
+					 >> 2)
+#define STM32_DMA_MDMA_SRAM_SIZE_MASK	GENMASK(4, 3)
+#define STM32_DMA_MDMA_SRAM_SIZE_GET(n)	(((n) & STM32_DMA_MDMA_SRAM_SIZE_MASK) \
+					 >> 3)
 
 enum stm32_dma_width {
 	STM32_DMA_BYTE,
@@ -176,15 +186,31 @@  struct stm32_dma_chan_reg {
 	u32 dma_sfcr;
 };
 
+struct stm32_dma_mdma_desc {
+	struct sg_table sgt;
+	struct dma_async_tx_descriptor *desc;
+};
+
+struct stm32_dma_mdma {
+	struct dma_chan *chan;
+	enum dma_transfer_direction dir;
+	dma_addr_t sram_buf;
+	u32 sram_period;
+	u32 num_sgs;
+};
+
 struct stm32_dma_sg_req {
-	u32 len;
+	struct scatterlist stm32_sgl_req;
 	struct stm32_dma_chan_reg chan_reg;
+	struct stm32_dma_mdma_desc m_desc;
 };
 
 struct stm32_dma_desc {
 	struct virt_dma_desc vdesc;
 	bool cyclic;
 	u32 num_sgs;
+	dma_addr_t dma_buf;
+	void *dma_buf_cpu;
 	struct stm32_dma_sg_req sg_req[];
 };
 
@@ -201,6 +227,10 @@  struct stm32_dma_chan {
 	u32 threshold;
 	u32 mem_burst;
 	u32 mem_width;
+	struct stm32_dma_mdma mchan;
+	u32 use_mdma;
+	u32 sram_size;
+	u32 residue_after_drain;
 };
 
 struct stm32_dma_device {
@@ -210,6 +240,7 @@  struct stm32_dma_device {
 	struct reset_control *rst;
 	bool mem2mem;
 	struct stm32_dma_chan chan[STM32_DMA_MAX_CHANNELS];
+	struct gen_pool *sram_pool;
 };
 
 static struct stm32_dma_device *stm32_dma_get_dev(struct stm32_dma_chan *chan)
@@ -497,11 +528,15 @@  static void stm32_dma_stop(struct stm32_dma_chan *chan)
 static int stm32_dma_terminate_all(struct dma_chan *c)
 {
 	struct stm32_dma_chan *chan = to_stm32_dma_chan(c);
+	struct stm32_dma_mdma *mchan = &chan->mchan;
 	unsigned long flags;
 	LIST_HEAD(head);
 
 	spin_lock_irqsave(&chan->vchan.lock, flags);
 
+	if (chan->use_mdma)
+		dmaengine_terminate_async(mchan->chan);
+
 	if (chan->busy) {
 		stm32_dma_stop(chan);
 		chan->desc = NULL;
@@ -514,9 +549,96 @@  static int stm32_dma_terminate_all(struct dma_chan *c)
 	return 0;
 }
 
+static u32 stm32_dma_get_remaining_bytes(struct stm32_dma_chan *chan)
+{
+	u32 dma_scr, width, ndtr;
+	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
+
+	dma_scr = stm32_dma_read(dmadev, STM32_DMA_SCR(chan->id));
+	width = STM32_DMA_SCR_PSIZE_GET(dma_scr);
+	ndtr = stm32_dma_read(dmadev, STM32_DMA_SNDTR(chan->id));
+
+	return ndtr << width;
+}
+
+static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan)
+{
+	struct stm32_dma_mdma *mchan = &chan->mchan;
+	struct stm32_dma_sg_req *sg_req;
+	struct dma_device *ddev = mchan->chan->device;
+	struct dma_async_tx_descriptor *desc = NULL;
+	enum dma_status status;
+	dma_addr_t src_buf, dst_buf;
+	u32 mdma_residue, mdma_wrote, dma_to_write, len;
+	struct dma_tx_state state;
+	int ret;
+
+	/* DMA/MDMA chain: drain remaining data in SRAM */
+
+	/* Get the residue on MDMA side */
+	status = dmaengine_tx_status(mchan->chan, mchan->chan->cookie, &state);
+	if (status == DMA_COMPLETE)
+		return status;
+
+	mdma_residue = state.residue;
+	sg_req = &chan->desc->sg_req[chan->next_sg - 1];
+	len = sg_dma_len(&sg_req->stm32_sgl_req);
+
+	/*
+	 * Total = mdma blocks * sram_period + rest (< sram_period)
+	 * so mdma blocks * sram_period = len - mdma residue - rest
+	 */
+	mdma_wrote = len - mdma_residue - (len % mchan->sram_period);
+
+	/* Remaining data stuck in SRAM */
+	dma_to_write = mchan->sram_period - stm32_dma_get_remaining_bytes(chan);
+	if (dma_to_write > 0) {
+		/* Stop DMA current operation */
+		stm32_dma_disable_chan(chan);
+
+		/* Terminate current MDMA to initiate a new one */
+		dmaengine_terminate_all(mchan->chan);
+
+		/* Double buffer management */
+		src_buf = mchan->sram_buf +
+			  ((mdma_wrote / mchan->sram_period) & 0x1) *
+			  mchan->sram_period;
+		dst_buf = sg_dma_address(&sg_req->stm32_sgl_req) + mdma_wrote;
+
+		desc = ddev->device_prep_dma_memcpy(mchan->chan,
+						    dst_buf, src_buf,
+						    dma_to_write,
+						    DMA_PREP_INTERRUPT);
+		if (!desc)
+			return -EINVAL;
+
+		ret = dma_submit_error(dmaengine_submit(desc));
+		if (ret < 0)
+			return ret;
+
+		status = dma_wait_for_async_tx(desc);
+		if (status != DMA_COMPLETE) {
+			dev_err(chan2dev(chan), "flush() dma_wait_for_async_tx error\n");
+			dmaengine_terminate_async(mchan->chan);
+			return -EBUSY;
+		}
+
+		/* We need to store residue for tx_status() */
+		chan->residue_after_drain = len - (mdma_wrote + dma_to_write);
+	}
+
+	return 0;
+}
+
 static void stm32_dma_synchronize(struct dma_chan *c)
 {
 	struct stm32_dma_chan *chan = to_stm32_dma_chan(c);
+	struct stm32_dma_mdma *mchan = &chan->mchan;
+
+	if (chan->desc && chan->use_mdma && mchan->dir == DMA_DEV_TO_MEM)
+		if (stm32_dma_mdma_drain(chan))
+			dev_err(chan2dev(chan), "%s: can't drain DMA\n",
+				__func__);
 
 	vchan_synchronize(&chan->vchan);
 }
@@ -539,62 +661,232 @@  static void stm32_dma_dump_reg(struct stm32_dma_chan *chan)
 	dev_dbg(chan2dev(chan), "SFCR:  0x%08x\n", sfcr);
 }
 
-static void stm32_dma_configure_next_sg(struct stm32_dma_chan *chan);
-
-static void stm32_dma_start_transfer(struct stm32_dma_chan *chan)
+static int stm32_dma_dummy_memcpy_xfer(struct stm32_dma_chan *chan)
 {
 	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
-	struct virt_dma_desc *vdesc;
+	struct dma_device *ddev = &dmadev->ddev;
+	struct stm32_dma_chan_reg reg;
+	u8 src_buf, dst_buf;
+	dma_addr_t dma_src_buf, dma_dst_buf;
+	u32 ndtr, status;
+	int len, ret;
+
+	ret = 0;
+	src_buf = 0;
+	len = 1;
+
+	dma_src_buf = dma_map_single(ddev->dev, &src_buf, len, DMA_TO_DEVICE);
+	ret = dma_mapping_error(ddev->dev, dma_src_buf);
+	if (ret < 0) {
+		dev_err(chan2dev(chan), "Source buffer map failed\n");
+		return ret;
+	}
+
+	dma_dst_buf = dma_map_single(ddev->dev, &dst_buf, len, DMA_FROM_DEVICE);
+	ret = dma_mapping_error(ddev->dev, dma_dst_buf);
+	if (ret < 0) {
+		dev_err(chan2dev(chan), "Destination buffer map failed\n");
+		dma_unmap_single(ddev->dev, dma_src_buf, len, DMA_TO_DEVICE);
+		return ret;
+	}
+
+	reg.dma_scr =	STM32_DMA_SCR_DIR(STM32_DMA_MEM_TO_MEM) |
+			STM32_DMA_SCR_PBURST(STM32_DMA_BURST_SINGLE) |
+			STM32_DMA_SCR_MBURST(STM32_DMA_BURST_SINGLE) |
+			STM32_DMA_SCR_MINC |
+			STM32_DMA_SCR_PINC |
+			STM32_DMA_SCR_TEIE;
+	reg.dma_spar = dma_src_buf;
+	reg.dma_sm0ar = dma_dst_buf;
+	reg.dma_sfcr = STM32_DMA_SFCR_MASK |
+		STM32_DMA_SFCR_FTH(STM32_DMA_FIFO_THRESHOLD_FULL);
+	reg.dma_sm1ar = dma_dst_buf;
+	reg.dma_sndtr = 1;
+
+	stm32_dma_write(dmadev, STM32_DMA_SCR(chan->id), reg.dma_scr);
+	stm32_dma_write(dmadev, STM32_DMA_SPAR(chan->id), reg.dma_spar);
+	stm32_dma_write(dmadev, STM32_DMA_SM0AR(chan->id), reg.dma_sm0ar);
+	stm32_dma_write(dmadev, STM32_DMA_SFCR(chan->id), reg.dma_sfcr);
+	stm32_dma_write(dmadev, STM32_DMA_SM1AR(chan->id), reg.dma_sm1ar);
+	stm32_dma_write(dmadev, STM32_DMA_SNDTR(chan->id), reg.dma_sndtr);
+
+	/* Clear interrupt status if it is there */
+	status = stm32_dma_irq_status(chan);
+	if (status)
+		stm32_dma_irq_clear(chan, status);
+
+	stm32_dma_dump_reg(chan);
+
+	chan->busy = true;
+	/* Start DMA */
+	reg.dma_scr |= STM32_DMA_SCR_EN;
+	stm32_dma_write(dmadev, STM32_DMA_SCR(chan->id), reg.dma_scr);
+
+	ret = readl_relaxed_poll_timeout_atomic(dmadev->base +
+						STM32_DMA_SNDTR(chan->id),
+						ndtr, !ndtr, 10, 1000);
+	if (ret) {
+		dev_err(chan2dev(chan), "%s: timeout!\n", __func__);
+		ret = -EBUSY;
+	}
+
+	chan->busy = false;
+
+	ret = stm32_dma_disable_chan(chan);
+	status = stm32_dma_irq_status(chan);
+	if (status)
+		stm32_dma_irq_clear(chan, status);
+
+	dma_unmap_single(ddev->dev, dma_src_buf, len, DMA_TO_DEVICE);
+	dma_unmap_single(ddev->dev, dma_dst_buf, len, DMA_FROM_DEVICE);
+
+	return ret;
+}
+
+static int stm32_dma_mdma_flush_remaining(struct stm32_dma_chan *chan)
+{
+	struct stm32_dma_mdma *mchan = &chan->mchan;
 	struct stm32_dma_sg_req *sg_req;
-	struct stm32_dma_chan_reg *reg;
-	u32 status;
+	struct dma_device *ddev = mchan->chan->device;
+	struct dma_async_tx_descriptor *desc = NULL;
+	enum dma_status status;
+	dma_addr_t src_buf, dst_buf;
+	u32 residue, remain, len;
 	int ret;
 
-	ret = stm32_dma_disable_chan(chan);
-	if (ret < 0)
-		return;
+	sg_req = &chan->desc->sg_req[chan->next_sg - 1];
 
-	if (!chan->desc) {
-		vdesc = vchan_next_desc(&chan->vchan);
-		if (!vdesc)
-			return;
+	residue = stm32_dma_get_remaining_bytes(chan);
+	len = sg_dma_len(&sg_req->stm32_sgl_req);
+	remain = len % mchan->sram_period;
 
-		chan->desc = to_stm32_dma_desc(vdesc);
-		chan->next_sg = 0;
+	if (residue > 0 && len > mchan->sram_period &&
+	    ((len % mchan->sram_period) != 0)) {
+		unsigned long dma_sync_wait_timeout =
+			jiffies + msecs_to_jiffies(5000);
+
+		while (residue > 0 &&
+		       residue > (mchan->sram_period - remain)) {
+			if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
+				dev_err(chan2dev(chan),
+					"%s timeout waiting for last bytes\n",
+					__func__);
+				break;
+			}
+			cpu_relax();
+			residue = stm32_dma_get_remaining_bytes(chan);
+		}
+		stm32_dma_disable_chan(chan);
+
+		src_buf = mchan->sram_buf + ((len / mchan->sram_period) & 0x1)
+			* mchan->sram_period;
+		dst_buf = sg_dma_address(&sg_req->stm32_sgl_req) + len -
+			(len % mchan->sram_period);
+
+		desc = ddev->device_prep_dma_memcpy(mchan->chan,
+						    dst_buf, src_buf,
+						    len % mchan->sram_period,
+						    DMA_PREP_INTERRUPT);
+
+		if (!desc)
+			return -EINVAL;
+
+		ret = dma_submit_error(dmaengine_submit(desc));
+		if (ret < 0)
+			return ret;
+
+		status = dma_wait_for_async_tx(desc);
+		if (status != DMA_COMPLETE) {
+			dmaengine_terminate_async(mchan->chan);
+			return -EBUSY;
+		}
 	}
 
-	if (chan->next_sg == chan->desc->num_sgs)
-		chan->next_sg = 0;
+	return 0;
+}
 
-	sg_req = &chan->desc->sg_req[chan->next_sg];
-	reg = &sg_req->chan_reg;
+static void stm32_dma_start_transfer(struct stm32_dma_chan *chan);
 
-	stm32_dma_write(dmadev, STM32_DMA_SCR(chan->id), reg->dma_scr);
-	stm32_dma_write(dmadev, STM32_DMA_SPAR(chan->id), reg->dma_spar);
-	stm32_dma_write(dmadev, STM32_DMA_SM0AR(chan->id), reg->dma_sm0ar);
-	stm32_dma_write(dmadev, STM32_DMA_SFCR(chan->id), reg->dma_sfcr);
-	stm32_dma_write(dmadev, STM32_DMA_SM1AR(chan->id), reg->dma_sm1ar);
-	stm32_dma_write(dmadev, STM32_DMA_SNDTR(chan->id), reg->dma_sndtr);
+static void stm32_mdma_chan_complete(void *param,
+				     const struct dmaengine_result *result)
+{
+	struct stm32_dma_chan *chan = param;
 
-	chan->next_sg++;
+	chan->busy = false;
+	if (result->result == DMA_TRANS_NOERROR) {
+		if (stm32_dma_mdma_flush_remaining(chan)) {
+			dev_err(chan2dev(chan), "Can't flush DMA\n");
+			return;
+		}
 
-	/* Clear interrupt status if it is there */
-	status = stm32_dma_irq_status(chan);
-	if (status)
-		stm32_dma_irq_clear(chan, status);
+		if (chan->next_sg == chan->desc->num_sgs) {
+			list_del(&chan->desc->vdesc.node);
+			vchan_cookie_complete(&chan->desc->vdesc);
+			chan->desc = NULL;
+		}
+		stm32_dma_start_transfer(chan);
+	} else {
+		dev_err(chan2dev(chan), "MDMA transfer error: %d\n",
+			result->result);
+	}
+}
 
-	if (chan->desc->cyclic)
-		stm32_dma_configure_next_sg(chan);
+static int stm32_dma_mdma_start(struct stm32_dma_chan *chan,
+				struct stm32_dma_sg_req *sg_req)
+{
+	struct stm32_dma_mdma *mchan = &chan->mchan;
+	struct stm32_dma_mdma_desc *m_desc = &sg_req->m_desc;
+	struct dma_slave_config config;
+	int ret;
 
-	stm32_dma_dump_reg(chan);
+	/* Configure MDMA channel */
+	memset(&config, 0, sizeof(config));
+	if (mchan->dir == DMA_MEM_TO_DEV)
+		config.dst_addr = mchan->sram_buf;
+	else
+		config.src_addr = mchan->sram_buf;
 
-	/* Start DMA */
-	reg->dma_scr |= STM32_DMA_SCR_EN;
-	stm32_dma_write(dmadev, STM32_DMA_SCR(chan->id), reg->dma_scr);
+	ret = dmaengine_slave_config(mchan->chan, &config);
+	if (ret < 0)
+		goto error;
+
+	 /* Prepare MDMA descriptor */
+	m_desc->desc = dmaengine_prep_slave_sg(mchan->chan, m_desc->sgt.sgl,
+					       m_desc->sgt.nents, mchan->dir,
+					       DMA_PREP_INTERRUPT);
+	if (!m_desc->desc) {
+		ret = -EINVAL;
+		goto error;
+	}
 
-	chan->busy = true;
+	if (mchan->dir != DMA_MEM_TO_DEV) {
+		m_desc->desc->callback_result = stm32_mdma_chan_complete;
+		m_desc->desc->callback_param = chan;
+	}
 
-	dev_dbg(chan2dev(chan), "vchan %pK: started\n", &chan->vchan);
+	ret = dma_submit_error(dmaengine_submit(m_desc->desc));
+	if (ret < 0) {
+		dev_err(chan2dev(chan), "MDMA submit failed\n");
+		goto error;
+	}
+
+	dma_async_issue_pending(mchan->chan);
+
+	/*
+	 * In case of M2D transfer, we have to generate dummy DMA transfer to
+	 * copy 1st sg data into SRAM
+	 */
+	if (mchan->dir == DMA_MEM_TO_DEV) {
+		ret = stm32_dma_dummy_memcpy_xfer(chan);
+		if (ret < 0) {
+			dmaengine_terminate_async(mchan->chan);
+			goto error;
+		}
+	}
+
+	return 0;
+error:
+	return ret;
 }
 
 static void stm32_dma_configure_next_sg(struct stm32_dma_chan *chan)
@@ -626,23 +918,132 @@  static void stm32_dma_configure_next_sg(struct stm32_dma_chan *chan)
 	}
 }
 
-static void stm32_dma_handle_chan_done(struct stm32_dma_chan *chan)
+static void stm32_dma_start_transfer(struct stm32_dma_chan *chan)
 {
-	if (chan->desc) {
-		if (chan->desc->cyclic) {
-			vchan_cyclic_callback(&chan->desc->vdesc);
-			chan->next_sg++;
-			stm32_dma_configure_next_sg(chan);
+	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
+	struct virt_dma_desc *vdesc;
+	struct stm32_dma_sg_req *sg_req;
+	struct stm32_dma_chan_reg *reg;
+	u32 status;
+	int ret;
+
+	ret = stm32_dma_disable_chan(chan);
+	if (ret < 0)
+		return;
+
+	if (!chan->desc) {
+		vdesc = vchan_next_desc(&chan->vchan);
+		if (!vdesc)
+			return;
+
+		chan->desc = to_stm32_dma_desc(vdesc);
+		chan->next_sg = 0;
+	} else {
+		vdesc = &chan->desc->vdesc;
+	}
+
+	if (chan->next_sg == chan->desc->num_sgs)
+		chan->next_sg = 0;
+
+	sg_req = &chan->desc->sg_req[chan->next_sg];
+	reg = &sg_req->chan_reg;
+
+	/* Clear interrupt status if it is there */
+	status = stm32_dma_irq_status(chan);
+	if (status)
+		stm32_dma_irq_clear(chan, status);
+
+	if (chan->use_mdma) {
+		if (chan->next_sg == 0) {
+			struct stm32_dma_mdma_desc *m_desc;
+
+			m_desc = &sg_req->m_desc;
+			if (chan->desc->cyclic) {
+				/*
+				 * If one callback is set, it will be called by
+				 * MDMA driver.
+				 */
+				if (vdesc->tx.callback) {
+					m_desc->desc->callback =
+						vdesc->tx.callback;
+					m_desc->desc->callback_param =
+						vdesc->tx.callback_param;
+					vdesc->tx.callback = NULL;
+					vdesc->tx.callback_param = NULL;
+				}
+			}
+		}
+
+		if (chan->mchan.dir == DMA_MEM_TO_DEV) {
+			ret = stm32_dma_dummy_memcpy_xfer(chan);
+			if (ret < 0) {
+				dmaengine_terminate_async(chan->mchan.chan);
+				chan->desc = NULL;
+				return;
+			}
 		} else {
-			chan->busy = false;
-			if (chan->next_sg == chan->desc->num_sgs) {
-				list_del(&chan->desc->vdesc.node);
-				vchan_cookie_complete(&chan->desc->vdesc);
+			reg->dma_scr &= ~STM32_DMA_SCR_TCIE;
+		}
+
+		if (!chan->desc->cyclic) {
+			/*  MDMA already started */
+			if (chan->mchan.dir != DMA_MEM_TO_DEV &&
+			    sg_dma_len(&sg_req->stm32_sgl_req) >
+			    chan->mchan.sram_period)
+				reg->dma_scr |= STM32_DMA_SCR_DBM;
+			ret = stm32_dma_mdma_start(chan, sg_req);
+			if (ret < 0) {
 				chan->desc = NULL;
+				return;
 			}
-			stm32_dma_start_transfer(chan);
 		}
 	}
+
+	chan->next_sg++;
+
+	stm32_dma_write(dmadev, STM32_DMA_SCR(chan->id), reg->dma_scr);
+	stm32_dma_write(dmadev, STM32_DMA_SPAR(chan->id), reg->dma_spar);
+	stm32_dma_write(dmadev, STM32_DMA_SM0AR(chan->id), reg->dma_sm0ar);
+	stm32_dma_write(dmadev, STM32_DMA_SFCR(chan->id), reg->dma_sfcr);
+	stm32_dma_write(dmadev, STM32_DMA_SM1AR(chan->id), reg->dma_sm1ar);
+	stm32_dma_write(dmadev, STM32_DMA_SNDTR(chan->id), reg->dma_sndtr);
+
+	if (chan->desc->cyclic)
+		stm32_dma_configure_next_sg(chan);
+
+	stm32_dma_dump_reg(chan);
+
+	/* Start DMA */
+	chan->busy = true;
+	reg->dma_scr |= STM32_DMA_SCR_EN;
+	stm32_dma_write(dmadev, STM32_DMA_SCR(chan->id), reg->dma_scr);
+
+	dev_dbg(chan2dev(chan), "vchan %pK: started\n", &chan->vchan);
+}
+
+static void stm32_dma_handle_chan_done(struct stm32_dma_chan *chan)
+{
+	if (!chan->desc)
+		return;
+
+	if (chan->desc->cyclic) {
+		vchan_cyclic_callback(&chan->desc->vdesc);
+		if (chan->use_mdma)
+			return;
+		chan->next_sg++;
+		stm32_dma_configure_next_sg(chan);
+	} else {
+		chan->busy = false;
+		if (chan->use_mdma && chan->mchan.dir != DMA_MEM_TO_DEV)
+			return;
+		if (chan->next_sg == chan->desc->num_sgs) {
+			list_del(&chan->desc->vdesc.node);
+			vchan_cookie_complete(&chan->desc->vdesc);
+			chan->desc = NULL;
+		}
+
+		stm32_dma_start_transfer(chan);
+	}
 }
 
 static irqreturn_t stm32_dma_chan_irq(int irq, void *devid)
@@ -695,7 +1096,6 @@  static void stm32_dma_issue_pending(struct dma_chan *c)
 	if (vchan_issue_pending(&chan->vchan) && !chan->desc && !chan->busy) {
 		dev_dbg(chan2dev(chan), "vchan %pK: issued\n", &chan->vchan);
 		stm32_dma_start_transfer(chan);
-
 	}
 	spin_unlock_irqrestore(&chan->vchan.lock, flags);
 }
@@ -836,16 +1236,128 @@  static void stm32_dma_clear_reg(struct stm32_dma_chan_reg *regs)
 	memset(regs, 0, sizeof(struct stm32_dma_chan_reg));
 }
 
+static int stm32_dma_mdma_prep_slave_sg(struct stm32_dma_chan *chan,
+					struct scatterlist *sgl, u32 sg_len,
+					struct stm32_dma_desc *desc)
+{
+	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
+	struct scatterlist *sg, *m_sg;
+	dma_addr_t dma_buf;
+	u32 len, num_sgs, sram_period;
+	int i, j, ret;
+
+	desc->dma_buf_cpu = gen_pool_dma_alloc(dmadev->sram_pool,
+					       chan->sram_size,
+					       &desc->dma_buf);
+	if (!desc->dma_buf_cpu)
+		return -ENOMEM;
+
+	sram_period = chan->sram_size / 2;
+
+	for_each_sg(sgl, sg, sg_len, i) {
+		struct stm32_dma_mdma_desc *m_desc = &desc->sg_req[i].m_desc;
+
+		len = sg_dma_len(sg);
+		desc->sg_req[i].stm32_sgl_req = *sg;
+		num_sgs = 1;
+
+		if (chan->mchan.dir == DMA_MEM_TO_DEV) {
+			if (len > chan->sram_size) {
+				dev_err(chan2dev(chan),
+					"max buf size = %d bytes\n",
+					chan->sram_size);
+				goto free_alloc;
+			}
+		} else {
+			/*
+			 * Build new sg for MDMA transfer
+			 * Scatter DMA Req into several SDRAM transfer
+			 */
+			if (len > sram_period)
+				num_sgs = len / sram_period;
+		}
+
+		ret = sg_alloc_table(&m_desc->sgt, num_sgs, GFP_ATOMIC);
+		if (ret) {
+			dev_err(chan2dev(chan), "MDMA sg table alloc failed\n");
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		dma_buf = sg_dma_address(sg);
+		for_each_sg(m_desc->sgt.sgl, m_sg, num_sgs, j) {
+			size_t bytes = min_t(size_t, len, sram_period);
+
+			sg_dma_address(m_sg) = dma_buf;
+			sg_dma_len(m_sg) = bytes;
+			dma_buf += bytes;
+			len -= bytes;
+		}
+	}
+
+	chan->mchan.sram_buf = desc->dma_buf;
+	chan->mchan.sram_period = sram_period;
+	chan->mchan.num_sgs = num_sgs;
+
+	return 0;
+
+err:
+	for (j = 0; j < i; j++)
+		sg_free_table(&desc->sg_req[j].m_desc.sgt);
+free_alloc:
+	gen_pool_free(dmadev->sram_pool, (unsigned long)desc->dma_buf_cpu,
+		      chan->sram_size);
+	return ret;
+}
+
+static int stm32_dma_setup_sg_requests(struct stm32_dma_chan *chan,
+				       struct scatterlist *sgl,
+				       unsigned int sg_len,
+				       enum dma_transfer_direction direction,
+				       struct stm32_dma_desc *desc)
+{
+	struct scatterlist *sg;
+	u32 nb_data_items;
+	int i, ret;
+	enum dma_slave_buswidth buswidth;
+
+	for_each_sg(sgl, sg, sg_len, i) {
+		ret = stm32_dma_set_xfer_param(chan, direction, &buswidth,
+					       sg_dma_len(sg));
+		if (ret < 0)
+			return ret;
+
+		nb_data_items = sg_dma_len(sg) / buswidth;
+		if (nb_data_items > STM32_DMA_ALIGNED_MAX_DATA_ITEMS) {
+			dev_err(chan2dev(chan), "nb items not supported\n");
+			return -EINVAL;
+		}
+
+		stm32_dma_clear_reg(&desc->sg_req[i].chan_reg);
+		desc->sg_req[i].chan_reg.dma_scr = chan->chan_reg.dma_scr;
+		desc->sg_req[i].chan_reg.dma_sfcr = chan->chan_reg.dma_sfcr;
+		desc->sg_req[i].chan_reg.dma_spar = chan->chan_reg.dma_spar;
+		desc->sg_req[i].chan_reg.dma_sm0ar = sg_dma_address(sg);
+		desc->sg_req[i].chan_reg.dma_sm1ar = sg_dma_address(sg);
+		if (chan->use_mdma)
+			desc->sg_req[i].chan_reg.dma_sm1ar +=
+				chan->mchan.sram_period;
+		desc->sg_req[i].chan_reg.dma_sndtr = nb_data_items;
+	}
+
+	desc->num_sgs = sg_len;
+
+	return 0;
+}
+
 static struct dma_async_tx_descriptor *stm32_dma_prep_slave_sg(
 	struct dma_chan *c, struct scatterlist *sgl,
 	u32 sg_len, enum dma_transfer_direction direction,
 	unsigned long flags, void *context)
 {
 	struct stm32_dma_chan *chan = to_stm32_dma_chan(c);
+
 	struct stm32_dma_desc *desc;
-	struct scatterlist *sg;
-	enum dma_slave_buswidth buswidth;
-	u32 nb_data_items;
 	int i, ret;
 
 	if (!chan->config_init) {
@@ -868,48 +1380,141 @@  static struct dma_async_tx_descriptor *stm32_dma_prep_slave_sg(
 	else
 		chan->chan_reg.dma_scr &= ~STM32_DMA_SCR_PFCTRL;
 
-	for_each_sg(sgl, sg, sg_len, i) {
-		ret = stm32_dma_set_xfer_param(chan, direction, &buswidth,
-					       sg_dma_len(sg));
-		if (ret < 0)
-			goto err;
-
-		desc->sg_req[i].len = sg_dma_len(sg);
+	if (chan->use_mdma) {
+		struct sg_table new_sgt;
+		struct scatterlist *s, *_sgl;
 
-		nb_data_items = desc->sg_req[i].len / buswidth;
-		if (nb_data_items > STM32_DMA_ALIGNED_MAX_DATA_ITEMS) {
-			dev_err(chan2dev(chan), "nb items not supported\n");
-			goto err;
+		chan->mchan.dir = direction;
+		ret = stm32_dma_mdma_prep_slave_sg(chan, sgl, sg_len, desc);
+		if (ret < 0)
+			return NULL;
+
+		ret = sg_alloc_table(&new_sgt, sg_len, GFP_ATOMIC);
+		if (ret)
+			dev_err(chan2dev(chan), "DMA sg table alloc failed\n");
+
+		for_each_sg(new_sgt.sgl, s, sg_len, i) {
+			_sgl = sgl;
+			sg_dma_len(s) =
+				min(sg_dma_len(_sgl), chan->mchan.sram_period);
+			s->dma_address = chan->mchan.sram_buf;
+			_sgl = sg_next(_sgl);
 		}
 
-		stm32_dma_clear_reg(&desc->sg_req[i].chan_reg);
-		desc->sg_req[i].chan_reg.dma_scr = chan->chan_reg.dma_scr;
-		desc->sg_req[i].chan_reg.dma_sfcr = chan->chan_reg.dma_sfcr;
-		desc->sg_req[i].chan_reg.dma_spar = chan->chan_reg.dma_spar;
-		desc->sg_req[i].chan_reg.dma_sm0ar = sg_dma_address(sg);
-		desc->sg_req[i].chan_reg.dma_sm1ar = sg_dma_address(sg);
-		desc->sg_req[i].chan_reg.dma_sndtr = nb_data_items;
+		ret = stm32_dma_setup_sg_requests(chan, new_sgt.sgl, sg_len,
+						  direction, desc);
+		sg_free_table(&new_sgt);
+		if (ret < 0)
+			goto err;
+	} else {
+		/* Prepare a normal DMA transfer */
+		ret = stm32_dma_setup_sg_requests(chan, sgl, sg_len, direction,
+						  desc);
+		if (ret < 0)
+			goto err;
 	}
 
-	desc->num_sgs = sg_len;
 	desc->cyclic = false;
 
 	return vchan_tx_prep(&chan->vchan, &desc->vdesc, flags);
-
 err:
+	if (chan->use_mdma) {
+		struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
+
+		for (i = 0; i < sg_len; i++)
+			sg_free_table(&desc->sg_req[i].m_desc.sgt);
+
+		gen_pool_free(dmadev->sram_pool,
+			      (unsigned long)desc->dma_buf_cpu,
+			      chan->sram_size);
+	}
 	kfree(desc);
+
 	return NULL;
 }
 
+static int stm32_dma_mdma_prep_dma_cyclic(struct stm32_dma_chan *chan,
+					  dma_addr_t buf_addr, size_t buf_len,
+					  size_t period_len,
+					  struct stm32_dma_desc *desc)
+{
+	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
+	struct stm32_dma_mdma *mchan = &chan->mchan;
+	struct stm32_dma_mdma_desc *m_desc = &desc->sg_req[0].m_desc;
+	struct dma_slave_config config;
+	dma_addr_t mem;
+	int ret;
+
+	chan->sram_size = ALIGN(period_len, STM32_DMA_SRAM_GRANULARITY);
+	desc->dma_buf_cpu = gen_pool_dma_alloc(dmadev->sram_pool,
+					       2 * chan->sram_size,
+					       &desc->dma_buf);
+	if (!desc->dma_buf_cpu)
+		return -ENOMEM;
+
+	memset(&config, 0, sizeof(config));
+	mem = buf_addr;
+
+	/* Configure MDMA channel */
+	if (chan->mchan.dir == DMA_MEM_TO_DEV)
+		config.dst_addr = desc->dma_buf;
+	else
+		config.src_addr = desc->dma_buf;
+	ret = dmaengine_slave_config(mchan->chan, &config);
+	if (ret < 0)
+		goto err;
+
+	/* Prepare MDMA descriptor */
+	m_desc->desc = dmaengine_prep_dma_cyclic(mchan->chan, buf_addr, buf_len,
+						 period_len, chan->mchan.dir,
+						 DMA_PREP_INTERRUPT);
+
+	if (!m_desc->desc) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = dma_submit_error(dmaengine_submit(m_desc->desc));
+	if (ret < 0) {
+		dev_err(chan2dev(chan), "MDMA submit failed\n");
+		goto err;
+	}
+
+	dma_async_issue_pending(mchan->chan);
+
+	/*
+	 * In case of M2D transfer, we have to generate dummy DMA transfer to
+	 * copy 1 period of data into SRAM
+	 */
+	if (chan->mchan.dir == DMA_MEM_TO_DEV) {
+		ret = stm32_dma_dummy_memcpy_xfer(chan);
+		if (ret < 0) {
+			dev_err(chan2dev(chan),
+				"stm32_dma_dummy_memcpy_xfer failed\n");
+			dmaengine_terminate_async(mchan->chan);
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	gen_pool_free(dmadev->sram_pool,
+		      (unsigned long)desc->dma_buf_cpu,
+		      chan->sram_size);
+	return ret;
+}
+
 static struct dma_async_tx_descriptor *stm32_dma_prep_dma_cyclic(
 	struct dma_chan *c, dma_addr_t buf_addr, size_t buf_len,
 	size_t period_len, enum dma_transfer_direction direction,
 	unsigned long flags)
 {
 	struct stm32_dma_chan *chan = to_stm32_dma_chan(c);
+	struct stm32_dma_chan_reg *chan_reg = &chan->chan_reg;
 	struct stm32_dma_desc *desc;
 	enum dma_slave_buswidth buswidth;
 	u32 num_periods, nb_data_items;
+	dma_addr_t dma_buf = 0;
 	int i, ret;
 
 	if (!buf_len || !period_len) {
@@ -957,28 +1562,49 @@  static struct dma_async_tx_descriptor *stm32_dma_prep_dma_cyclic(
 	/* Clear periph ctrl if client set it */
 	chan->chan_reg.dma_scr &= ~STM32_DMA_SCR_PFCTRL;
 
-	num_periods = buf_len / period_len;
+	if (chan->use_mdma)
+		num_periods = 1;
+	else
+		num_periods = buf_len / period_len;
 
 	desc = stm32_dma_alloc_desc(num_periods);
 	if (!desc)
 		return NULL;
 
-	for (i = 0; i < num_periods; i++) {
-		desc->sg_req[i].len = period_len;
+	desc->num_sgs = num_periods;
+	desc->cyclic = true;
 
+	if (chan->use_mdma) {
+		chan->mchan.dir = direction;
+
+		ret = stm32_dma_mdma_prep_dma_cyclic(chan, buf_addr, buf_len,
+						     period_len, desc);
+		if (ret < 0)
+			return NULL;
+		dma_buf = desc->dma_buf;
+	} else {
+		dma_buf = buf_addr;
+	}
+
+	for (i = 0; i < num_periods; i++) {
+		sg_dma_len(&desc->sg_req[i].stm32_sgl_req) = period_len;
+		sg_dma_address(&desc->sg_req[i].stm32_sgl_req) = dma_buf;
 		stm32_dma_clear_reg(&desc->sg_req[i].chan_reg);
-		desc->sg_req[i].chan_reg.dma_scr = chan->chan_reg.dma_scr;
-		desc->sg_req[i].chan_reg.dma_sfcr = chan->chan_reg.dma_sfcr;
-		desc->sg_req[i].chan_reg.dma_spar = chan->chan_reg.dma_spar;
-		desc->sg_req[i].chan_reg.dma_sm0ar = buf_addr;
-		desc->sg_req[i].chan_reg.dma_sm1ar = buf_addr;
+		desc->sg_req[i].chan_reg.dma_scr = chan_reg->dma_scr;
+		desc->sg_req[i].chan_reg.dma_sfcr = chan_reg->dma_sfcr;
+		desc->sg_req[i].chan_reg.dma_spar = chan_reg->dma_spar;
+		if (chan->use_mdma) {
+			desc->sg_req[i].chan_reg.dma_sm0ar = desc->dma_buf;
+			desc->sg_req[i].chan_reg.dma_sm1ar = desc->dma_buf +
+				chan->sram_size;
+		} else {
+			desc->sg_req[i].chan_reg.dma_sm0ar = dma_buf;
+			desc->sg_req[i].chan_reg.dma_sm1ar = dma_buf;
+			dma_buf += period_len;
+		}
 		desc->sg_req[i].chan_reg.dma_sndtr = nb_data_items;
-		buf_addr += period_len;
 	}
 
-	desc->num_sgs = num_periods;
-	desc->cyclic = true;
-
 	return vchan_tx_prep(&chan->vchan, &desc->vdesc, flags);
 }
 
@@ -1019,13 +1645,13 @@  static struct dma_async_tx_descriptor *stm32_dma_prep_dma_memcpy(
 			STM32_DMA_SCR_PINC |
 			STM32_DMA_SCR_TCIE |
 			STM32_DMA_SCR_TEIE;
-		desc->sg_req[i].chan_reg.dma_sfcr |= STM32_DMA_SFCR_MASK;
+		desc->sg_req[i].chan_reg.dma_sfcr &= ~STM32_DMA_SFCR_MASK;
 		desc->sg_req[i].chan_reg.dma_sfcr |=
 			STM32_DMA_SFCR_FTH(threshold);
 		desc->sg_req[i].chan_reg.dma_spar = src + offset;
 		desc->sg_req[i].chan_reg.dma_sm0ar = dest + offset;
 		desc->sg_req[i].chan_reg.dma_sndtr = xfer_count;
-		desc->sg_req[i].len = xfer_count;
+		sg_dma_len(&desc->sg_req[i].stm32_sgl_req) = xfer_count;
 	}
 
 	desc->num_sgs = num_sgs;
@@ -1034,18 +1660,6 @@  static struct dma_async_tx_descriptor *stm32_dma_prep_dma_memcpy(
 	return vchan_tx_prep(&chan->vchan, &desc->vdesc, flags);
 }
 
-static u32 stm32_dma_get_remaining_bytes(struct stm32_dma_chan *chan)
-{
-	u32 dma_scr, width, ndtr;
-	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
-
-	dma_scr = stm32_dma_read(dmadev, STM32_DMA_SCR(chan->id));
-	width = STM32_DMA_SCR_PSIZE_GET(dma_scr);
-	ndtr = stm32_dma_read(dmadev, STM32_DMA_SNDTR(chan->id));
-
-	return ndtr << width;
-}
-
 static size_t stm32_dma_desc_residue(struct stm32_dma_chan *chan,
 				     struct stm32_dma_desc *desc,
 				     u32 next_sg)
@@ -1054,6 +1668,10 @@  static size_t stm32_dma_desc_residue(struct stm32_dma_chan *chan,
 	u32 residue = 0;
 	int i;
 
+	/* Drain case */
+	if (chan->residue_after_drain)
+		return chan->residue_after_drain;
+
 	/*
 	 * In cyclic mode, for the last period, residue = remaining bytes from
 	 * NDTR
@@ -1069,7 +1687,7 @@  static size_t stm32_dma_desc_residue(struct stm32_dma_chan *chan,
 	 * transferred
 	 */
 	for (i = next_sg; i < desc->num_sgs; i++)
-		residue += desc->sg_req[i].len;
+		residue += sg_dma_len(&desc->sg_req[i].stm32_sgl_req);
 	residue += stm32_dma_get_remaining_bytes(chan);
 
 end:
@@ -1089,11 +1707,23 @@  static enum dma_status stm32_dma_tx_status(struct dma_chan *c,
 					   struct dma_tx_state *state)
 {
 	struct stm32_dma_chan *chan = to_stm32_dma_chan(c);
+	struct stm32_dma_mdma *mchan = &chan->mchan;
 	struct virt_dma_desc *vdesc;
 	enum dma_status status;
 	unsigned long flags;
 	u32 residue = 0;
 
+	/*
+	 * When DMA/MDMA chain is used, we return the status of MDMA in cyclic
+	 * mode and for D2M transfer in sg mode in order to return the correct
+	 * residue if any
+	 */
+	if (chan->desc && chan->use_mdma &&
+	    (mchan->dir != DMA_MEM_TO_DEV || chan->desc->cyclic) &&
+	    !chan->residue_after_drain)
+		return dmaengine_tx_status(mchan->chan, mchan->chan->cookie,
+					   state);
+
 	status = dma_cookie_status(c, cookie, state);
 	if (status == DMA_COMPLETE || !state)
 		return status;
@@ -1155,21 +1785,34 @@  static void stm32_dma_free_chan_resources(struct dma_chan *c)
 
 static void stm32_dma_desc_free(struct virt_dma_desc *vdesc)
 {
-	kfree(container_of(vdesc, struct stm32_dma_desc, vdesc));
+	struct stm32_dma_desc *desc = to_stm32_dma_desc(vdesc);
+	struct stm32_dma_chan *chan = to_stm32_dma_chan(vdesc->tx.chan);
+	struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
+	int i;
+
+	if (chan->use_mdma) {
+		for (i = 0; i < desc->num_sgs; i++)
+			sg_free_table(&desc->sg_req[i].m_desc.sgt);
+
+		gen_pool_free(dmadev->sram_pool,
+			      (unsigned long)desc->dma_buf_cpu,
+			      chan->sram_size);
+	}
+
+	kfree(desc);
 }
 
 static void stm32_dma_set_config(struct stm32_dma_chan *chan,
 				 struct stm32_dma_cfg *cfg)
 {
 	stm32_dma_clear_reg(&chan->chan_reg);
-
 	chan->chan_reg.dma_scr = cfg->stream_config & STM32_DMA_SCR_CFG_MASK;
 	chan->chan_reg.dma_scr |= STM32_DMA_SCR_REQ(cfg->request_line);
-
-	/* Enable Interrupts  */
 	chan->chan_reg.dma_scr |= STM32_DMA_SCR_TEIE | STM32_DMA_SCR_TCIE;
-
 	chan->threshold = STM32_DMA_THRESHOLD_FTR_GET(cfg->features);
+	chan->use_mdma = STM32_DMA_MDMA_CHAIN_FTR_GET(cfg->features);
+	chan->sram_size = (1 << STM32_DMA_MDMA_SRAM_SIZE_GET(cfg->features)) *
+		STM32_DMA_SRAM_GRANULARITY;
 }
 
 static struct dma_chan *stm32_dma_of_xlate(struct of_phandle_args *dma_spec,
@@ -1207,6 +1850,9 @@  static struct dma_chan *stm32_dma_of_xlate(struct of_phandle_args *dma_spec,
 
 	stm32_dma_set_config(chan, &cfg);
 
+	if (!dmadev->sram_pool || !chan->mchan.chan)
+		chan->use_mdma = 0;
+
 	return c;
 }
 
@@ -1219,10 +1865,12 @@  MODULE_DEVICE_TABLE(of, stm32_dma_of_match);
 static int stm32_dma_probe(struct platform_device *pdev)
 {
 	struct stm32_dma_chan *chan;
+	struct stm32_dma_mdma *mchan;
 	struct stm32_dma_device *dmadev;
 	struct dma_device *dd;
 	const struct of_device_id *match;
 	struct resource *res;
+	char name[4];
 	int i, ret;
 
 	match = of_match_device(stm32_dma_of_match, &pdev->dev);
@@ -1258,6 +1906,13 @@  static int stm32_dma_probe(struct platform_device *pdev)
 		reset_control_deassert(dmadev->rst);
 	}
 
+	dmadev->sram_pool = of_gen_pool_get(pdev->dev.of_node, "sram", 0);
+	if (!dmadev->sram_pool)
+		dev_info(&pdev->dev, "no dma pool: can't use MDMA\n");
+	else
+		dev_dbg(&pdev->dev, "SRAM pool: %zu KiB\n",
+			gen_pool_size(dmadev->sram_pool) / 1024);
+
 	dma_cap_set(DMA_SLAVE, dd->cap_mask);
 	dma_cap_set(DMA_PRIVATE, dd->cap_mask);
 	dma_cap_set(DMA_CYCLIC, dd->cap_mask);
@@ -1293,6 +1948,16 @@  static int stm32_dma_probe(struct platform_device *pdev)
 		chan->id = i;
 		chan->vchan.desc_free = stm32_dma_desc_free;
 		vchan_init(&chan->vchan, dd);
+
+		mchan = &chan->mchan;
+		if (dmadev->sram_pool) {
+			snprintf(name, sizeof(name), "ch%d", chan->id);
+			mchan->chan = dma_request_slave_channel(dd->dev, name);
+			if (!mchan->chan)
+				dev_info(&pdev->dev,
+					 "can't request MDMA chan for %s\n",
+					 name);
+		}
 	}
 
 	ret = dma_async_device_register(dd);
@@ -1350,4 +2015,4 @@  static int __init stm32_dma_init(void)
 {
 	return platform_driver_probe(&stm32_dma_driver, stm32_dma_probe);
 }
-subsys_initcall(stm32_dma_init);
+device_initcall(stm32_dma_init);