diff mbox

dma: sun4i: expose block size and wait cycle configuration to DMA users

Message ID 1457344771-12946-1-git-send-email-boris.brezillon@free-electrons.com (mailing list archive)
State New, archived
Headers show

Commit Message

Boris BREZILLON March 7, 2016, 9:59 a.m. UTC
Some drivers might need to tweak the block size and wait cycles values
to get better performances.
Create and export the sun4i_dma_set_chan_config() to do that.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/dma/sun4i-dma.c       | 44 ++++++++++++++++++++++++++++++-------------
 include/linux/dma/sun4i-dma.h | 38 +++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/dma/sun4i-dma.h

Comments

Vinod Koul March 7, 2016, 2:54 p.m. UTC | #1
On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> +/* Dedicated DMA parameter register layout */
> +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> +
> +/**
> + * struct sun4i_dma_chan_config - DMA channel config
> + *
> + * @para: contains information about block size and time before checking
> + *	  DRQ line. This is device specific and only applicable to dedicated
> + *	  DMA channels

What information, can you elobrate.. And why can't you use existing
dma_slave_config for this?

> + */
> +struct sun4i_dma_chan_config {
> +	u32 para;
> +};
> +
> +int sun4i_dma_set_chan_config(struct dma_chan *dchan,
> +			      const struct sun4i_dma_chan_config *cfg);
> +
> +#endif /* _SUN4I_DMA_H */
Boris BREZILLON March 7, 2016, 3:08 p.m. UTC | #2
Hi Vinod,

On Mon, 7 Mar 2016 20:24:29 +0530
Vinod Koul <vinod.koul@intel.com> wrote:

> On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> > +/* Dedicated DMA parameter register layout */
> > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> > +
> > +/**
> > + * struct sun4i_dma_chan_config - DMA channel config
> > + *
> > + * @para: contains information about block size and time before checking
> > + *	  DRQ line. This is device specific and only applicable to dedicated
> > + *	  DMA channels
> 
> What information, can you elobrate.. And why can't you use existing
> dma_slave_config for this?

Block size is related to the device FIFO size. I guess it allows the
DMA channel to launch a transfer of X bytes without having to check the
DRQ line (the line telling the DMA engine it can transfer more data
to/from the device). The wait cycles information is apparently related
to the number of clks the engine should wait before polling/checking
the DRQ line status between each block transfer. I'm not sure what it
saves to put WAIT_CYCLES() to something != 1, but in their BSP,
Allwinner tweak that depending on the device.

Note that I'd be happy if the above configuration could go into the
generic dma_slave_config struct. This way we could avoid per-engine
specific APIs.

Best Regards,

Boris
Priit Laes March 7, 2016, 3:30 p.m. UTC | #3
On Mon, 2016-03-07 at 10:59 +0100, Boris Brezillon wrote:
> Some drivers might need to tweak the block size and wait cycles
> values
> to get better performances.
> Create and export the sun4i_dma_set_chan_config() to do that.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
> ---
>  drivers/dma/sun4i-dma.c       | 44 ++++++++++++++++++++++++++++++---
> ----------
>  include/linux/dma/sun4i-dma.h | 38
> +++++++++++++++++++++++++++++++++++++
>  2 files changed, 69 insertions(+), 13 deletions(-)
>  create mode 100644 include/linux/dma/sun4i-dma.h
> 
> diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c
> index 1661d518..e48f537 100644
> --- a/drivers/dma/sun4i-dma.c
> +++ b/drivers/dma/sun4i-dma.c
> @@ -12,6 +12,7 @@
>  #include <linux/bitops.h>
>  #include <linux/clk.h>
>  #include <linux/dmaengine.h>
> +#include <linux/dma/sun4i-dma.h>
>  #include <linux/dmapool.h>
>  #include <linux/interrupt.h>
>  #include <linux/module.h>
> @@ -138,6 +139,7 @@ struct sun4i_dma_pchan {
>  struct sun4i_dma_vchan {
>  	struct virt_dma_chan		vc;
>  	struct dma_slave_config		cfg;
> +	struct sun4i_dma_chan_config	scfg;
>  	struct sun4i_dma_pchan		*pchan;
>  	struct sun4i_dma_promise	*processing;
>  	struct sun4i_dma_contract	*contract;
> @@ -779,7 +781,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan,
> struct scatterlist *sgl,
>  	u8 ram_type, io_mode, linear_mode;
>  	struct scatterlist *sg;
>  	dma_addr_t srcaddr, dstaddr;
> -	u32 endpoints, para;
> +	u32 endpoints;
>  	int i;
>  
>  	if (!sgl)
> @@ -825,17 +827,6 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan,
> struct scatterlist *sgl,
>  			dstaddr = sg_dma_address(sg);
>  		}
>  
> -		/*
> -		 * These are the magic DMA engine timings that keep
> SPI going.
> -		 * I haven't seen any interface on DMAEngine to
> configure
> -		 * timings, and so far they seem to work for
> everything we
> -		 * support, so I've kept them here. I don't know if
> other
> -		 * devices need different timings because, as usual,
> we only
> -		 * have the "para" bitfield meanings, but no comment
> on what
> -		 * the values should be when doing a certain
> operation :|
> -		 */
> -		para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS;
> -
>  		/* And make a suitable promise */
>  		if (vchan->is_dedicated)
>  			promise = generate_ddma_promise(chan,
> srcaddr, dstaddr,
> @@ -850,7 +841,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan,
> struct scatterlist *sgl,
>  			return NULL; /* TODO: should we free
> everything? */
>  
>  		promise->cfg |= endpoints;
> -		promise->para = para;
> +		promise->para = vchan->scfg.para;
>  
>  		/* Then add it to the contract */
>  		list_add_tail(&promise->list, &contract->demands);
> @@ -908,6 +899,21 @@ static int sun4i_dma_config(struct dma_chan
> *chan,
>  	return 0;
>  }
>  
> +int sun4i_dma_set_chan_config(struct dma_chan *dchan,
> +			      const struct sun4i_dma_chan_config
> *cfg)
> +{
> +	struct sun4i_dma_vchan *vchan = to_sun4i_dma_vchan(dchan);
> +
> +	if (!vchan->is_dedicated)
> +		return -ENOTSUPP;
> +
> +	/* TODO: control cfg value */
> +	vchan->scfg = *cfg;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(sun4i_dma_set_chan_config);
> +
>  static struct dma_chan *sun4i_dma_of_xlate(struct of_phandle_args
> *dma_spec,
>  					   struct of_dma *ofdma)
>  {
> @@ -1206,6 +1212,18 @@ static int sun4i_dma_probe(struct
> platform_device *pdev)
>  		spin_lock_init(&vchan->vc.lock);
>  		vchan->vc.desc_free = sun4i_dma_free_contract;
>  		vchan_init(&vchan->vc, &priv->slave);
> +
> +		/*
> +		 * These are the magic DMA engine timings that keep
> SPI going.
> +		 * I haven't seen any interface on DMAEngine to
> configure
> +		 * timings, and so far they seem to work for
> everything we
> +		 * support, so I've kept them here. I don't know if
> other
> +		 * devices need different timings because, as usual,
> we only
> +		 * have the "para" bitfield meanings, but no comment
> on what
> +		 * the values should be when doing a certain
> operation :|
> +		 */
> +		vchan->scfg.para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS;

Does SPI refer the Serial Peripheral Interface?

If yes, then I would point out that current sun4i SPI driver doesn't
actually use DMA [1]

http://lists.infradead.org/pipermail/linux-arm-kernel/2016-February/411
722.html


> +
>  	}
>  
>  	ret = clk_prepare_enable(priv->clk);
> diff --git a/include/linux/dma/sun4i-dma.h b/include/linux/dma/sun4i
> -dma.h
> new file mode 100644
> index 0000000..f643539
> --- /dev/null
> +++ b/include/linux/dma/sun4i-dma.h
> @@ -0,0 +1,38 @@
> +/*
> + * Sun4i DMA Engine drivers support header file
> + *
> + * Copyright (C) 2016 Free Electrons. All rights reserved.
> + *
> + * This is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published
> by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +#ifndef _SUN4I_DMA_H
> +#define _SUN4I_DMA_H
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/dmaengine.h>
> +
> +/* Dedicated DMA parameter register layout */
> +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) <<
> 24)
> +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> +
> +/**
> + * struct sun4i_dma_chan_config - DMA channel config
> + *
> + * @para: contains information about block size and time before
> checking
> + *	  DRQ line. This is device specific and only applicable to
> dedicated
> + *	  DMA channels
> + */
> +struct sun4i_dma_chan_config {
> +	u32 para;
> +};
> +
> +int sun4i_dma_set_chan_config(struct dma_chan *dchan,
> +			      const struct sun4i_dma_chan_config
> *cfg);
> +
> +#endif /* _SUN4I_DMA_H */
> -- 
> 2.1.4
>
Boris BREZILLON March 7, 2016, 3:47 p.m. UTC | #4
Hi Priit,

On Mon, 07 Mar 2016 17:30:41 +0200
Priit Laes <plaes@plaes.org> wrote:

> On Mon, 2016-03-07 at 10:59 +0100, Boris Brezillon wrote:
> > Some drivers might need to tweak the block size and wait cycles
> > values
> > to get better performances.
> > Create and export the sun4i_dma_set_chan_config() to do that.
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
> > ---
> >  drivers/dma/sun4i-dma.c       | 44 ++++++++++++++++++++++++++++++---
> > ----------
> >  include/linux/dma/sun4i-dma.h | 38
> > +++++++++++++++++++++++++++++++++++++
> >  2 files changed, 69 insertions(+), 13 deletions(-)
> >  create mode 100644 include/linux/dma/sun4i-dma.h
> > 
> > diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c
> > index 1661d518..e48f537 100644
> > --- a/drivers/dma/sun4i-dma.c
> > +++ b/drivers/dma/sun4i-dma.c
> > @@ -12,6 +12,7 @@
> >  #include <linux/bitops.h>
> >  #include <linux/clk.h>
> >  #include <linux/dmaengine.h>
> > +#include <linux/dma/sun4i-dma.h>
> >  #include <linux/dmapool.h>
> >  #include <linux/interrupt.h>
> >  #include <linux/module.h>
> > @@ -138,6 +139,7 @@ struct sun4i_dma_pchan {
> >  struct sun4i_dma_vchan {
> >  	struct virt_dma_chan		vc;
> >  	struct dma_slave_config		cfg;
> > +	struct sun4i_dma_chan_config	scfg;
> >  	struct sun4i_dma_pchan		*pchan;
> >  	struct sun4i_dma_promise	*processing;
> >  	struct sun4i_dma_contract	*contract;
> > @@ -779,7 +781,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan,
> > struct scatterlist *sgl,
> >  	u8 ram_type, io_mode, linear_mode;
> >  	struct scatterlist *sg;
> >  	dma_addr_t srcaddr, dstaddr;
> > -	u32 endpoints, para;
> > +	u32 endpoints;
> >  	int i;
> >  
> >  	if (!sgl)
> > @@ -825,17 +827,6 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan,
> > struct scatterlist *sgl,
> >  			dstaddr = sg_dma_address(sg);
> >  		}
> >  
> > -		/*
> > -		 * These are the magic DMA engine timings that keep
> > SPI going.
> > -		 * I haven't seen any interface on DMAEngine to
> > configure
> > -		 * timings, and so far they seem to work for
> > everything we
> > -		 * support, so I've kept them here. I don't know if
> > other
> > -		 * devices need different timings because, as usual,
> > we only
> > -		 * have the "para" bitfield meanings, but no comment
> > on what
> > -		 * the values should be when doing a certain
> > operation :|
> > -		 */
> > -		para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS;
> > -
> >  		/* And make a suitable promise */
> >  		if (vchan->is_dedicated)
> >  			promise = generate_ddma_promise(chan,
> > srcaddr, dstaddr,
> > @@ -850,7 +841,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan,
> > struct scatterlist *sgl,
> >  			return NULL; /* TODO: should we free
> > everything? */
> >  
> >  		promise->cfg |= endpoints;
> > -		promise->para = para;
> > +		promise->para = vchan->scfg.para;
> >  
> >  		/* Then add it to the contract */
> >  		list_add_tail(&promise->list, &contract->demands);
> > @@ -908,6 +899,21 @@ static int sun4i_dma_config(struct dma_chan
> > *chan,
> >  	return 0;
> >  }
> >  
> > +int sun4i_dma_set_chan_config(struct dma_chan *dchan,
> > +			      const struct sun4i_dma_chan_config
> > *cfg)
> > +{
> > +	struct sun4i_dma_vchan *vchan = to_sun4i_dma_vchan(dchan);
> > +
> > +	if (!vchan->is_dedicated)
> > +		return -ENOTSUPP;
> > +
> > +	/* TODO: control cfg value */
> > +	vchan->scfg = *cfg;
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(sun4i_dma_set_chan_config);
> > +
> >  static struct dma_chan *sun4i_dma_of_xlate(struct of_phandle_args
> > *dma_spec,
> >  					   struct of_dma *ofdma)
> >  {
> > @@ -1206,6 +1212,18 @@ static int sun4i_dma_probe(struct
> > platform_device *pdev)
> >  		spin_lock_init(&vchan->vc.lock);
> >  		vchan->vc.desc_free = sun4i_dma_free_contract;
> >  		vchan_init(&vchan->vc, &priv->slave);
> > +
> > +		/*
> > +		 * These are the magic DMA engine timings that keep
> > SPI going.
> > +		 * I haven't seen any interface on DMAEngine to
> > configure
> > +		 * timings, and so far they seem to work for
> > everything we
> > +		 * support, so I've kept them here. I don't know if
> > other
> > +		 * devices need different timings because, as usual,
> > we only
> > +		 * have the "para" bitfield meanings, but no comment
> > on what
> > +		 * the values should be when doing a certain
> > operation :|
> > +		 */
> > +		vchan->scfg.para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS;
> 
> Does SPI refer the Serial Peripheral Interface?
> 
> If yes, then I would point out that current sun4i SPI driver doesn't
> actually use DMA [1]
> 
> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-February/411
> 722.html

I just moved this assignment and the associated comment in the driver,
so maybe we should ask Emilio why he thinks SPI config should be the
default one, and how he tested it...

Best Regards,

Boris
Emilio López March 7, 2016, 5:15 p.m. UTC | #5
Hi,

El 07/03/16 a las 12:47, Boris Brezillon escribió:
(...)
>> Does SPI refer the Serial Peripheral Interface?
>>
>> If yes, then I would point out that current sun4i SPI driver doesn't
>> actually use DMA [1]
>>
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-February/411
>> 722.html
> 
> I just moved this assignment and the associated comment in the driver,
> so maybe we should ask Emilio why he thinks SPI config should be the
> default one, and how he tested it...

When I was working on the dmaengine driver, I needed a way to test
mem<->dev and mem<->mem transfers. I used dmatest.ko for the latter, and
SPI was a good fit for the former as I had a logic analyzer. That's why
there's patches to support DMA on it (but, as Priit pointed out, are not
in mainline yet).

At first SPI was acting weird when using DMA, but the problems went away
when configuring the timings with these magic values you see on the
driver today. These timings turned out to also work for audio, so there
was no need for a mechanism to configure them.

And that's basically the story behind SUN4I_DDMA_MAGIC_SPI_PARAMETERS :)

Cheers,
Emilio
Maxime Ripard March 7, 2016, 8:30 p.m. UTC | #6
On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:
> Hi Vinod,
> 
> On Mon, 7 Mar 2016 20:24:29 +0530
> Vinod Koul <vinod.koul@intel.com> wrote:
> 
> > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> > > +/* Dedicated DMA parameter register layout */
> > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> > > +
> > > +/**
> > > + * struct sun4i_dma_chan_config - DMA channel config
> > > + *
> > > + * @para: contains information about block size and time before checking
> > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > + *	  DMA channels
> > 
> > What information, can you elobrate.. And why can't you use existing
> > dma_slave_config for this?
> 
> Block size is related to the device FIFO size. I guess it allows the
> DMA channel to launch a transfer of X bytes without having to check the
> DRQ line (the line telling the DMA engine it can transfer more data
> to/from the device). The wait cycles information is apparently related
> to the number of clks the engine should wait before polling/checking
> the DRQ line status between each block transfer. I'm not sure what it
> saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> Allwinner tweak that depending on the device.
> 
> Note that I'd be happy if the above configuration could go into the
> generic dma_slave_config struct. This way we could avoid per-engine
> specific APIs.

And I'd really like to avoid that too. That will avoid to cripple the
consumer drivers that might be using any of the two.

Thanks!
Maxime
Vinod Koul March 8, 2016, 2:55 a.m. UTC | #7
On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
> On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:
> > Hi Vinod,
> > 
> > On Mon, 7 Mar 2016 20:24:29 +0530
> > Vinod Koul <vinod.koul@intel.com> wrote:
> > 
> > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> > > > +/* Dedicated DMA parameter register layout */
> > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> > > > +
> > > > +/**
> > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > + *
> > > > + * @para: contains information about block size and time before checking
> > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > + *	  DMA channels
> > > 
> > > What information, can you elobrate.. And why can't you use existing
> > > dma_slave_config for this?
> > 
> > Block size is related to the device FIFO size. I guess it allows the
> > DMA channel to launch a transfer of X bytes without having to check the
> > DRQ line (the line telling the DMA engine it can transfer more data
> > to/from the device). The wait cycles information is apparently related
> > to the number of clks the engine should wait before polling/checking
> > the DRQ line status between each block transfer. I'm not sure what it
> > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > Allwinner tweak that depending on the device.

we already have block size aka src/dst_maxburst, why not use that one.

Why does dmaengine need to wait? Can you explain that

> > Note that I'd be happy if the above configuration could go into the
> > generic dma_slave_config struct. This way we could avoid per-engine
> > specific APIs.
> 
> And I'd really like to avoid that too. That will avoid to cripple the
> consumer drivers that might be using any of the two.

If it is fairly genric property we should add, otherwise yes we don't want
that
Vinod Koul March 8, 2016, 2:59 a.m. UTC | #8
On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote:
> On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
> > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:

Also just noticed the subsystem name on this is not correct, pls fix that in
subsequent posting
Maxime Ripard March 8, 2016, 7:51 a.m. UTC | #9
On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote:
> On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
> > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:
> > > Hi Vinod,
> > > 
> > > On Mon, 7 Mar 2016 20:24:29 +0530
> > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > 
> > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> > > > > +/* Dedicated DMA parameter register layout */
> > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> > > > > +
> > > > > +/**
> > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > + *
> > > > > + * @para: contains information about block size and time before checking
> > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > + *	  DMA channels
> > > > 
> > > > What information, can you elobrate.. And why can't you use existing
> > > > dma_slave_config for this?
> > > 
> > > Block size is related to the device FIFO size. I guess it allows the
> > > DMA channel to launch a transfer of X bytes without having to check the
> > > DRQ line (the line telling the DMA engine it can transfer more data
> > > to/from the device). The wait cycles information is apparently related
> > > to the number of clks the engine should wait before polling/checking
> > > the DRQ line status between each block transfer. I'm not sure what it
> > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > Allwinner tweak that depending on the device.
> 
> we already have block size aka src/dst_maxburst, why not use that one.

I'm not sure it's really the same thing. The DMA controller also has a
burst parameter, that is either 1 byte or 8 bytes, and ends up being
different from this one.

> Why does dmaengine need to wait? Can you explain that

We have no idea, we thought you might have one :)

It doesn't really makes sense to us, but it does have a significant
impact on the throughput.

Maxime
Hans de Goede March 8, 2016, 8:42 a.m. UTC | #10
Hi,

On 08-03-16 08:51, Maxime Ripard wrote:
> On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote:
>> On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
>>> On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:
>>>> Hi Vinod,
>>>>
>>>> On Mon, 7 Mar 2016 20:24:29 +0530
>>>> Vinod Koul <vinod.koul@intel.com> wrote:
>>>>
>>>>> On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
>>>>>> +/* Dedicated DMA parameter register layout */
>>>>>> +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
>>>>>> +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
>>>>>> +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
>>>>>> +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
>>>>>> +
>>>>>> +/**
>>>>>> + * struct sun4i_dma_chan_config - DMA channel config
>>>>>> + *
>>>>>> + * @para: contains information about block size and time before checking
>>>>>> + *	  DRQ line. This is device specific and only applicable to dedicated
>>>>>> + *	  DMA channels
>>>>>
>>>>> What information, can you elobrate.. And why can't you use existing
>>>>> dma_slave_config for this?
>>>>
>>>> Block size is related to the device FIFO size. I guess it allows the
>>>> DMA channel to launch a transfer of X bytes without having to check the
>>>> DRQ line (the line telling the DMA engine it can transfer more data
>>>> to/from the device). The wait cycles information is apparently related
>>>> to the number of clks the engine should wait before polling/checking
>>>> the DRQ line status between each block transfer. I'm not sure what it
>>>> saves to put WAIT_CYCLES() to something != 1, but in their BSP,
>>>> Allwinner tweak that depending on the device.
>>
>> we already have block size aka src/dst_maxburst, why not use that one.
>
> I'm not sure it's really the same thing. The DMA controller also has a
> burst parameter, that is either 1 byte or 8 bytes, and ends up being
> different from this one.
>
>> Why does dmaengine need to wait? Can you explain that
>
> We have no idea, we thought you might have one :)
>
> It doesn't really makes sense to us, but it does have a significant
> impact on the throughput.

<wild speculation>

I see 2 possible reasons why waiting till checking for drq can help:

1) A lot of devices have an internal fifo hooked up to a single mmio data
register which gets read using the general purpose dma-engine, it allows
this fifo to fill, and thus do burst transfers
(We've seen similar issues with the scanout engine for the display which
  has its own dma engine, and doing larger transfers helps a lot).

2) Physical memory on the sunxi SoCs is (often) divided into banks
with a shared data / address bus doing bank-switches is expensive, so
this wait cycles may introduce latency which allows a user of another
bank to complete its RAM accesses before the dma engine forces a
bank switch, which ends up avoiding a lot of (interleaved) bank switches
while both try to access a different banj and thus waiting makes things
(much) faster in the end (again a known problem with the display
scanout engine).

</wild speculation>

Note the differences these kinda tweaks make can be quite dramatic,
when using a 1920x1080p60 hdmi output on the A10 SoC with a 16 bit
memory bus (real world worst case scenario), the memory bandwidth
left for userspace processes (measured through memset) almost doubles
from 48 MB/s to 85 MB/s, source:
http://ssvb.github.io/2014/11/11/revisiting-fullhd-x11-desktop-performance-of-the-allwinner-a10.html

TL;DR: Waiting before starting DMA allows for doing larger burst
transfers which ends up making things more efficient.

Given this, I really expect there to be other dma-engines which
have some option to wait a bit before starting/unpausing a transfer
instead of starting it as soon as (more) data is available, so I think
this would make a good addition to dma_slave_config.

Regards,

Hans
Boris BREZILLON March 8, 2016, 8:46 a.m. UTC | #11
On Tue, 8 Mar 2016 08:51:31 +0100
Maxime Ripard <maxime.ripard@free-electrons.com> wrote:

> On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote:
> > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
> > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:
> > > > Hi Vinod,
> > > > 
> > > > On Mon, 7 Mar 2016 20:24:29 +0530
> > > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > > 
> > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> > > > > > +/* Dedicated DMA parameter register layout */
> > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> > > > > > +
> > > > > > +/**
> > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > + *
> > > > > > + * @para: contains information about block size and time before checking
> > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > + *	  DMA channels
> > > > > 
> > > > > What information, can you elobrate.. And why can't you use existing
> > > > > dma_slave_config for this?
> > > > 
> > > > Block size is related to the device FIFO size. I guess it allows the
> > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > to/from the device). The wait cycles information is apparently related
> > > > to the number of clks the engine should wait before polling/checking
> > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > Allwinner tweak that depending on the device.
> > 
> > we already have block size aka src/dst_maxburst, why not use that one.
> 
> I'm not sure it's really the same thing. The DMA controller also has a
> burst parameter, that is either 1 byte or 8 bytes, and ends up being
> different from this one.

Well, that's what I understood to, but when reading more carefully the
src/dst_maxburst description, it seems to match the block_size concept
exposed by the sun4i dmaengine. But how should we choose the real burst
size then.
IIRC, in most documentation/datasheets, burst size is referred as the
maximum number of words (word size depends on the selected width) a
master is allowed to transfer to a slave through the bus without
being interrupted by other master requests.
Am I correct?

> 
> > Why does dmaengine need to wait? Can you explain that
> 
> We have no idea, we thought you might have one :)

Yes, it's really unclear to us why this is needed. There might be some
kind of contention, or maybe the slave device takes some time to put
DRQ line to low state, and without these wait_cycles the dmaengine
would assume some data are still available in the FIFO while there's
actually no more data to retrieve.

> 
> It doesn't really makes sense to us, but it does have a significant
> impact on the throughput.

I wouldn't say significant impact, but tweaking those parameters has
some impact on the performances, and since it's not that complicated to
implement, I thought it was worth a try, but maybe I'm wrong.

Best Regards,

Boris
Priit Laes March 8, 2016, 9:10 a.m. UTC | #12
On Tue, 2016-03-08 at 09:46 +0100, Boris Brezillon wrote:
> On Tue, 8 Mar 2016 08:51:31 +0100
> Maxime Ripard <maxime.ripard@free-electrons.com> wrote:
> 
> > On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote:
> > > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
> > > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon
> > > > wrote:
> > > > > Hi Vinod,
> > > > > 
> > > > > On Mon, 7 Mar 2016 20:24:29 +0530
> > > > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > > > 
> > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon
> > > > > > wrote:
> > > > > > > +/* Dedicated DMA parameter register layout */
> > > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n
> > > > > > > ) - 1) << 24)
> > > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) 
> > > > > > > - 1) << 16)
> > > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n
> > > > > > > ) - 1) << 8)
> > > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) 
> > > > > > > - 1) << 0)
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > > + *
> > > > > > > + * @para: contains information about block size and time
> > > > > > > before checking
> > > > > > > + *	  DRQ line. This is device specific and only
> > > > > > > applicable to dedicated
> > > > > > > + *	  DMA channels
> > > > > > 
> > > > > > What information, can you elobrate.. And why can't you use
> > > > > > existing
> > > > > > dma_slave_config for this?
> > > > > 
> > > > > Block size is related to the device FIFO size. I guess it
> > > > > allows the
> > > > > DMA channel to launch a transfer of X bytes without having to
> > > > > check the
> > > > > DRQ line (the line telling the DMA engine it can transfer
> > > > > more data
> > > > > to/from the device). The wait cycles information is
> > > > > apparently related
> > > > > to the number of clks the engine should wait before
> > > > > polling/checking
> > > > > the DRQ line status between each block transfer. I'm not sure
> > > > > what it
> > > > > saves to put WAIT_CYCLES() to something != 1, but in their
> > > > > BSP,
> > > > > Allwinner tweak that depending on the device.
> > > 
> > > we already have block size aka src/dst_maxburst, why not use that
> > > one.
> > 
> > I'm not sure it's really the same thing. The DMA controller also
> > has a
> > burst parameter, that is either 1 byte or 8 bytes, and ends up
> > being
> > different from this one.
> 
> Well, that's what I understood to, but when reading more carefully
> the
> src/dst_maxburst description, it seems to match the block_size
> concept
> exposed by the sun4i dmaengine. But how should we choose the real
> burst
> size then.
> IIRC, in most documentation/datasheets, burst size is referred as the
> maximum number of words (word size depends on the selected width) a
> master is allowed to transfer to a slave through the bus without
> being interrupted by other master requests.
> Am I correct?
> 
> > 
> > > Why does dmaengine need to wait? Can you explain that
> > 
> > We have no idea, we thought you might have one :)
> 
> Yes, it's really unclear to us why this is needed. There might be
> some
> kind of contention, or maybe the slave device takes some time to put
> DRQ line to low state, and without these wait_cycles the dmaengine
> would assume some data are still available in the FIFO while there's
> actually no more data to retrieve.
> 
> > 
> > It doesn't really makes sense to us, but it does have a significant
> > impact on the throughput.
> 
> I wouldn't say significant impact, but tweaking those parameters has
> some impact on the performances, and since it's not that complicated
> to
> implement, I thought it was worth a try, but maybe I'm wrong.

Somewhat offtopic, but there was also another patchset a while ago for
sun4i SPI that removed the 64 byte limit for SPI transfers (although
this was DMA-less case) back then.

http://lists.infradead.org/pipermail/linux-arm-kernel/2014-March/241392
.html

(This patchset made it possible to use SPI-based small TFT displays via
fbtft, when I tested it some time ago).

As it currently stands, sun4i SPI is not using DMA and transfers are
limited to 64 bytes.

Päikest,
Priit ;)
Vinod Koul March 8, 2016, 9:59 a.m. UTC | #13
On Tue, Mar 08, 2016 at 08:51:31AM +0100, Maxime Ripard wrote:
> > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > + *
> > > > > > + * @para: contains information about block size and time before checking
> > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > + *	  DMA channels
> > > > > 
> > > > > What information, can you elobrate.. And why can't you use existing
> > > > > dma_slave_config for this?
> > > > 
> > > > Block size is related to the device FIFO size. I guess it allows the
> > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > to/from the device). The wait cycles information is apparently related
> > > > to the number of clks the engine should wait before polling/checking
> > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > Allwinner tweak that depending on the device.
> > 
> > we already have block size aka src/dst_maxburst, why not use that one.
> 
> I'm not sure it's really the same thing. The DMA controller also has a
> burst parameter, that is either 1 byte or 8 bytes, and ends up being
> different from this one.

Nope that is buswidth. maxburst is words which cna be sent to device FIFO.
> 
> > Why does dmaengine need to wait? Can you explain that
> 
> We have no idea, we thought you might have one :)

Well that is hardware dependent. From DMAengine API usage we dont ahve to
wait at all. We should submit next descriptor as soon as possible.

> It doesn't really makes sense to us, but it does have a significant
> impact on the throughput.
Vinod Koul March 8, 2016, 10:04 a.m. UTC | #14
On Tue, Mar 08, 2016 at 09:46:25AM +0100, Boris Brezillon wrote:
> On Tue, 8 Mar 2016 08:51:31 +0100
> Maxime Ripard <maxime.ripard@free-electrons.com> wrote:
> 
> > On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote:
> > > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
> > > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:
> > > > > Hi Vinod,
> > > > > 
> > > > > On Mon, 7 Mar 2016 20:24:29 +0530
> > > > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > > > 
> > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> > > > > > > +/* Dedicated DMA parameter register layout */
> > > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> > > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> > > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> > > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > > + *
> > > > > > > + * @para: contains information about block size and time before checking
> > > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > > + *	  DMA channels
> > > > > > 
> > > > > > What information, can you elobrate.. And why can't you use existing
> > > > > > dma_slave_config for this?
> > > > > 
> > > > > Block size is related to the device FIFO size. I guess it allows the
> > > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > > to/from the device). The wait cycles information is apparently related
> > > > > to the number of clks the engine should wait before polling/checking
> > > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > > Allwinner tweak that depending on the device.
> > > 
> > > we already have block size aka src/dst_maxburst, why not use that one.
> > 
> > I'm not sure it's really the same thing. The DMA controller also has a
> > burst parameter, that is either 1 byte or 8 bytes, and ends up being
> > different from this one.
> 
> Well, that's what I understood to, but when reading more carefully the
> src/dst_maxburst description, it seems to match the block_size concept
> exposed by the sun4i dmaengine. But how should we choose the real burst
> size then.

maxburst is block size as you describe in this context

> IIRC, in most documentation/datasheets, burst size is referred as the
> maximum number of words (word size depends on the selected width) a
> master is allowed to transfer to a slave through the bus without
> being interrupted by other master requests.
> Am I correct?

maxburst is defined as words not bytes. Word is specfied with the
src/dst_addr_width.

> 
> > 
> > > Why does dmaengine need to wait? Can you explain that
> > 
> > We have no idea, we thought you might have one :)
> 
> Yes, it's really unclear to us why this is needed. There might be some
> kind of contention, or maybe the slave device takes some time to put
> DRQ line to low state, and without these wait_cycles the dmaengine
> would assume some data are still available in the FIFO while there's
> actually no more data to retrieve.
> 
> > 
> > It doesn't really makes sense to us, but it does have a significant
> > impact on the throughput.
> 
> I wouldn't say significant impact, but tweaking those parameters has
> some impact on the performances, and since it's not that complicated to
> implement, I thought it was worth a try, but maybe I'm wrong.

Can you guys check with HW folks and see why it is required, if that is a
possiblity!
Vinod Koul March 8, 2016, 10:05 a.m. UTC | #15
On Tue, Mar 08, 2016 at 09:42:31AM +0100, Hans de Goede wrote:
> <wild speculation>
> 
> I see 2 possible reasons why waiting till checking for drq can help:
> 
> 1) A lot of devices have an internal fifo hooked up to a single mmio data
> register which gets read using the general purpose dma-engine, it allows
> this fifo to fill, and thus do burst transfers
> (We've seen similar issues with the scanout engine for the display which
>  has its own dma engine, and doing larger transfers helps a lot).
> 
> 2) Physical memory on the sunxi SoCs is (often) divided into banks
> with a shared data / address bus doing bank-switches is expensive, so
> this wait cycles may introduce latency which allows a user of another
> bank to complete its RAM accesses before the dma engine forces a
> bank switch, which ends up avoiding a lot of (interleaved) bank switches
> while both try to access a different banj and thus waiting makes things
> (much) faster in the end (again a known problem with the display
> scanout engine).
> 
> </wild speculation>
> 
> Note the differences these kinda tweaks make can be quite dramatic,
> when using a 1920x1080p60 hdmi output on the A10 SoC with a 16 bit
> memory bus (real world worst case scenario), the memory bandwidth
> left for userspace processes (measured through memset) almost doubles
> from 48 MB/s to 85 MB/s, source:
> http://ssvb.github.io/2014/11/11/revisiting-fullhd-x11-desktop-performance-of-the-allwinner-a10.html
> 
> TL;DR: Waiting before starting DMA allows for doing larger burst
> transfers which ends up making things more efficient.
> 
> Given this, I really expect there to be other dma-engines which
> have some option to wait a bit before starting/unpausing a transfer
> instead of starting it as soon as (more) data is available, so I think
> this would make a good addition to dma_slave_config.

I tend to agree but before we do that I would like this hypothesis to be
confirmed :)
Corentin Labbe March 9, 2016, 10:08 a.m. UTC | #16
> > 
> > It doesn't really makes sense to us, but it does have a significant
> > impact on the throughput.
> 
> I wouldn't say significant impact, but tweaking those parameters has
> some impact on the performances, and since it's not that complicated to
> implement, I thought it was worth a try, but maybe I'm wrong.
> 

Hello

I am working on adding DMA on the crypto driver sun4i-ss and I confirm that tweaking those parameter can increase speed.

Regards

LABBE Corentin
Boris BREZILLON March 9, 2016, 10:14 a.m. UTC | #17
On Tue, 8 Mar 2016 08:25:47 +0530
Vinod Koul <vinod.koul@intel.com> wrote:

> On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote:
> > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote:
> > > Hi Vinod,
> > > 
> > > On Mon, 7 Mar 2016 20:24:29 +0530
> > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > 
> > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote:
> > > > > +/* Dedicated DMA parameter register layout */
> > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
> > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
> > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
> > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
> > > > > +
> > > > > +/**
> > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > + *
> > > > > + * @para: contains information about block size and time before checking
> > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > + *	  DMA channels
> > > > 
> > > > What information, can you elobrate.. And why can't you use existing
> > > > dma_slave_config for this?
> > > 
> > > Block size is related to the device FIFO size. I guess it allows the
> > > DMA channel to launch a transfer of X bytes without having to check the
> > > DRQ line (the line telling the DMA engine it can transfer more data
> > > to/from the device). The wait cycles information is apparently related
> > > to the number of clks the engine should wait before polling/checking
> > > the DRQ line status between each block transfer. I'm not sure what it
> > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > Allwinner tweak that depending on the device.
> 
> we already have block size aka src/dst_maxburst, why not use that one.

Okay, but then remains the question "how should we choose the real burst
size?". The block size described in Allwinner datasheet is not the
number of words you will transmit without being preempted by other
master -> slave requests, it's the number of bytes that can be
transmitted without checking the DRQ line.
IOW, block_size = burst_size * X
Maxime Ripard March 9, 2016, 10:58 a.m. UTC | #18
On Tue, Mar 08, 2016 at 03:35:38PM +0530, Vinod Koul wrote:
> On Tue, Mar 08, 2016 at 09:42:31AM +0100, Hans de Goede wrote:
> > <wild speculation>
> > 
> > I see 2 possible reasons why waiting till checking for drq can help:
> > 
> > 1) A lot of devices have an internal fifo hooked up to a single mmio data
> > register which gets read using the general purpose dma-engine, it allows
> > this fifo to fill, and thus do burst transfers
> > (We've seen similar issues with the scanout engine for the display which
> >  has its own dma engine, and doing larger transfers helps a lot).
> > 
> > 2) Physical memory on the sunxi SoCs is (often) divided into banks
> > with a shared data / address bus doing bank-switches is expensive, so
> > this wait cycles may introduce latency which allows a user of another
> > bank to complete its RAM accesses before the dma engine forces a
> > bank switch, which ends up avoiding a lot of (interleaved) bank switches
> > while both try to access a different banj and thus waiting makes things
> > (much) faster in the end (again a known problem with the display
> > scanout engine).
> > 
> > </wild speculation>
> > 
> > Note the differences these kinda tweaks make can be quite dramatic,
> > when using a 1920x1080p60 hdmi output on the A10 SoC with a 16 bit
> > memory bus (real world worst case scenario), the memory bandwidth
> > left for userspace processes (measured through memset) almost doubles
> > from 48 MB/s to 85 MB/s, source:
> > http://ssvb.github.io/2014/11/11/revisiting-fullhd-x11-desktop-performance-of-the-allwinner-a10.html
> > 
> > TL;DR: Waiting before starting DMA allows for doing larger burst
> > transfers which ends up making things more efficient.
> > 
> > Given this, I really expect there to be other dma-engines which
> > have some option to wait a bit before starting/unpausing a transfer
> > instead of starting it as soon as (more) data is available, so I think
> > this would make a good addition to dma_slave_config.
> 
> I tend to agree but before we do that I would like this hypothesis to be
> confirmed :)

We can't confirm it, we don't have access to any documentation that
might explain what this is about.

Maxime
Boris BREZILLON March 9, 2016, 11:06 a.m. UTC | #19
On Tue, 8 Mar 2016 08:25:47 +0530
Vinod Koul <vinod.koul@intel.com> wrote:
> 
> Why does dmaengine need to wait? Can you explain that

I don't have an answer for that one, but when I set WAIT_CYCLES to 1
for the NAND use case it does not work. So I guess it is somehow
related to how the DRQ line is controlled on the device side...
Vinod Koul March 11, 2016, 6:24 a.m. UTC | #20
On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote:
> > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > + *
> > > > > > + * @para: contains information about block size and time before checking
> > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > + *	  DMA channels
> > > > > 
> > > > > What information, can you elobrate.. And why can't you use existing
> > > > > dma_slave_config for this?
> > > > 
> > > > Block size is related to the device FIFO size. I guess it allows the
> > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > to/from the device). The wait cycles information is apparently related
> > > > to the number of clks the engine should wait before polling/checking
> > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > Allwinner tweak that depending on the device.
> > 
> > we already have block size aka src/dst_maxburst, why not use that one.
> 
> Okay, but then remains the question "how should we choose the real burst
> size?". The block size described in Allwinner datasheet is not the
> number of words you will transmit without being preempted by other
> master -> slave requests, it's the number of bytes that can be
> transmitted without checking the DRQ line.
> IOW, block_size = burst_size * X

Thats fine, API expects words for this and also a width value. Client shoudl
pass both and for programming you should use bytes converted from words and
width.
Vinod Koul March 11, 2016, 6:26 a.m. UTC | #21
On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote:
> On Tue, 8 Mar 2016 08:25:47 +0530
> Vinod Koul <vinod.koul@intel.com> wrote:
> > 
> > Why does dmaengine need to wait? Can you explain that
> 
> I don't have an answer for that one, but when I set WAIT_CYCLES to 1
> for the NAND use case it does not work. So I guess it is somehow
> related to how the DRQ line is controlled on the device side...

Is the WAIT cycle different for different usages or same for all
usages/channels?
Boris BREZILLON March 11, 2016, 9:40 a.m. UTC | #22
On Fri, 11 Mar 2016 11:54:52 +0530
Vinod Koul <vinod.koul@intel.com> wrote:

> On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote:
> > > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > > + *
> > > > > > > + * @para: contains information about block size and time before checking
> > > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > > + *	  DMA channels
> > > > > > 
> > > > > > What information, can you elobrate.. And why can't you use existing
> > > > > > dma_slave_config for this?
> > > > > 
> > > > > Block size is related to the device FIFO size. I guess it allows the
> > > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > > to/from the device). The wait cycles information is apparently related
> > > > > to the number of clks the engine should wait before polling/checking
> > > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > > Allwinner tweak that depending on the device.
> > > 
> > > we already have block size aka src/dst_maxburst, why not use that one.
> > 
> > Okay, but then remains the question "how should we choose the real burst
> > size?". The block size described in Allwinner datasheet is not the
> > number of words you will transmit without being preempted by other
> > master -> slave requests, it's the number of bytes that can be
> > transmitted without checking the DRQ line.
> > IOW, block_size = burst_size * X
> 
> Thats fine, API expects words for this and also a width value. Client shoudl
> pass both and for programming you should use bytes converted from words and
> width.
> 

Not sure I get what you mean. Are you suggesting to add new fields to
the dma_slave_config struct to describe this block concept, or should
we pass it through ->xxx_burstsize, and try to guess the real burstsize?
In the latter case, you still haven't answered my question: how should
we choose the burstsize?
Boris BREZILLON March 11, 2016, 9:45 a.m. UTC | #23
On Fri, 11 Mar 2016 11:56:07 +0530
Vinod Koul <vinod.koul@intel.com> wrote:

> On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote:
> > On Tue, 8 Mar 2016 08:25:47 +0530
> > Vinod Koul <vinod.koul@intel.com> wrote:
> > > 
> > > Why does dmaengine need to wait? Can you explain that
> > 
> > I don't have an answer for that one, but when I set WAIT_CYCLES to 1
> > for the NAND use case it does not work. So I guess it is somehow
> > related to how the DRQ line is controlled on the device side...
> 
> Is the WAIT cycle different for different usages or same for all
> usages/channels?
> 

In Allwinner BSP they adapt it on a per slave device basis, but since
DMA channels are dynamically allocated, you can't know in advance which
physical channel will be attached to a specific device.

Another option I considered was adding a new cell to the sun4i DT
binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm
not sure adding that to the DT is a good idea (not to mention that it
would break DT ABI again, and given the last discussions on this topic,
I'm not sure it's a good idea :-/).
Vinod Koul March 11, 2016, 10:06 a.m. UTC | #24
On Fri, Mar 11, 2016 at 10:40:55AM +0100, Boris Brezillon wrote:
> On Fri, 11 Mar 2016 11:54:52 +0530
> Vinod Koul <vinod.koul@intel.com> wrote:
> 
> > On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote:
> > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > > > + *
> > > > > > > > + * @para: contains information about block size and time before checking
> > > > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > > > + *	  DMA channels
> > > > > > > 
> > > > > > > What information, can you elobrate.. And why can't you use existing
> > > > > > > dma_slave_config for this?
> > > > > > 
> > > > > > Block size is related to the device FIFO size. I guess it allows the
> > > > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > > > to/from the device). The wait cycles information is apparently related
> > > > > > to the number of clks the engine should wait before polling/checking
> > > > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > > > Allwinner tweak that depending on the device.
> > > > 
> > > > we already have block size aka src/dst_maxburst, why not use that one.
> > > 
> > > Okay, but then remains the question "how should we choose the real burst
> > > size?". The block size described in Allwinner datasheet is not the
> > > number of words you will transmit without being preempted by other
> > > master -> slave requests, it's the number of bytes that can be
> > > transmitted without checking the DRQ line.
> > > IOW, block_size = burst_size * X
> > 
> > Thats fine, API expects words for this and also a width value. Client shoudl
> > pass both and for programming you should use bytes converted from words and
> > width.
> > 
> 
> Not sure I get what you mean. Are you suggesting to add new fields to
> the dma_slave_config struct to describe this block concept, or should

No

> we pass it through ->xxx_burstsize, and try to guess the real burstsize?

Pass the real burstsize in words

> In the latter case, you still haven't answered my question: how should
> we choose the burstsize?

From word value convert to bytes and program HW

burst(in bytes) = burst (in words ) * buswidth;
Vinod Koul March 11, 2016, 10:09 a.m. UTC | #25
On Fri, Mar 11, 2016 at 10:45:52AM +0100, Boris Brezillon wrote:
> On Fri, 11 Mar 2016 11:56:07 +0530
> Vinod Koul <vinod.koul@intel.com> wrote:
> 
> > On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote:
> > > On Tue, 8 Mar 2016 08:25:47 +0530
> > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > > 
> > > > Why does dmaengine need to wait? Can you explain that
> > > 
> > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1
> > > for the NAND use case it does not work. So I guess it is somehow
> > > related to how the DRQ line is controlled on the device side...
> > 
> > Is the WAIT cycle different for different usages or same for all
> > usages/channels?
> > 
> 
> In Allwinner BSP they adapt it on a per slave device basis, but since
> DMA channels are dynamically allocated, you can't know in advance which
> physical channel will be attached to a specific device.

And we have the correct values availble in datasheet for all usages

> Another option I considered was adding a new cell to the sun4i DT
> binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm
> not sure adding that to the DT is a good idea (not to mention that it
> would break DT ABI again, and given the last discussions on this topic,
> I'm not sure it's a good idea :-/).

Yes i was veering towards DT as well. This is a new property so ABI rules
wont break as long as driver still works with old properties.

But this nees to be property for clients and not driver. Client can then
program these
Boris BREZILLON March 11, 2016, 10:26 a.m. UTC | #26
On Fri, 11 Mar 2016 15:36:07 +0530
Vinod Koul <vinod.koul@intel.com> wrote:

> On Fri, Mar 11, 2016 at 10:40:55AM +0100, Boris Brezillon wrote:
> > On Fri, 11 Mar 2016 11:54:52 +0530
> > Vinod Koul <vinod.koul@intel.com> wrote:
> > 
> > > On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote:
> > > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > > > > + *
> > > > > > > > > + * @para: contains information about block size and time before checking
> > > > > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > > > > + *	  DMA channels
> > > > > > > > 
> > > > > > > > What information, can you elobrate.. And why can't you use existing
> > > > > > > > dma_slave_config for this?
> > > > > > > 
> > > > > > > Block size is related to the device FIFO size. I guess it allows the
> > > > > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > > > > to/from the device). The wait cycles information is apparently related
> > > > > > > to the number of clks the engine should wait before polling/checking
> > > > > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > > > > Allwinner tweak that depending on the device.
> > > > > 
> > > > > we already have block size aka src/dst_maxburst, why not use that one.
> > > > 
> > > > Okay, but then remains the question "how should we choose the real burst
> > > > size?". The block size described in Allwinner datasheet is not the
> > > > number of words you will transmit without being preempted by other
> > > > master -> slave requests, it's the number of bytes that can be
> > > > transmitted without checking the DRQ line.
> > > > IOW, block_size = burst_size * X
> > > 
> > > Thats fine, API expects words for this and also a width value. Client shoudl
> > > pass both and for programming you should use bytes converted from words and
> > > width.
> > > 
> > 
> > Not sure I get what you mean. Are you suggesting to add new fields to
> > the dma_slave_config struct to describe this block concept, or should
> 
> No
> 
> > we pass it through ->xxx_burstsize, and try to guess the real burstsize?
> 
> Pass the real burstsize in words
> 
> > In the latter case, you still haven't answered my question: how should
> > we choose the burstsize?
> 
> From word value convert to bytes and program HW
> 
> burst(in bytes) = burst (in words ) * buswidth;
> 


Except, as already explained, the blocksize and burstsize concepts are
not exactly the same, and the sunxi engine expect both to be defined.
So let's take a real example to illustrate my question:

For the NAND use case, here is my DMA channel setup:

buswidth (or wordsize) = 4 bytes
burstsize = 4 words (32 bytes)
blocksize = 128 bytes

Here, you can see that blocksize = 4 * burstsize, and again, burstsize
and blocksize are not encoding the same thing. So, assuming we use
->src/dst_burstsize to encode the blocksize in our case, how should we
deduce the real burstsize (which still needs to be configured in the
engine).
Maxime Ripard March 11, 2016, 10:55 a.m. UTC | #27
On Fri, Mar 11, 2016 at 03:39:02PM +0530, Vinod Koul wrote:
> On Fri, Mar 11, 2016 at 10:45:52AM +0100, Boris Brezillon wrote:
> > On Fri, 11 Mar 2016 11:56:07 +0530
> > Vinod Koul <vinod.koul@intel.com> wrote:
> > 
> > > On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote:
> > > > On Tue, 8 Mar 2016 08:25:47 +0530
> > > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > > > 
> > > > > Why does dmaengine need to wait? Can you explain that
> > > > 
> > > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1
> > > > for the NAND use case it does not work. So I guess it is somehow
> > > > related to how the DRQ line is controlled on the device side...
> > > 
> > > Is the WAIT cycle different for different usages or same for all
> > > usages/channels?
> > > 
> > 
> > In Allwinner BSP they adapt it on a per slave device basis, but since
> > DMA channels are dynamically allocated, you can't know in advance which
> > physical channel will be attached to a specific device.
> 
> And we have the correct values availble in datasheet for all usages

No, we don't.

If you look at the datasheet in question, page 169.
https://github.com/allwinner-zh/documents/blob/master/A20/A20_User_Manual_v1.4_20150510.pdf

This is the only documentation we have. And as you can see, it is very
sparse (and that's an understament).

So we cannot make that assumption, so far the values have been found
through trial and error for the devices in question.

> > Another option I considered was adding a new cell to the sun4i DT
> > binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm
> > not sure adding that to the DT is a good idea (not to mention that it
> > would break DT ABI again, and given the last discussions on this topic,
> > I'm not sure it's a good idea :-/).
> 
> Yes i was veering towards DT as well. This is a new property so ABI rules
> wont break as long as driver still works with old properties.

Yeah, we can always default to our current hardcoded value if the
property is missing. And since no-one is using the engine at the
moment anyway, so it's not really a big deal.

> But this nees to be property for clients and not driver. Client can then
> program these

Yes, totally. The question here is how the clients give that
information to the driver.

Maxime
Vinod Koul March 11, 2016, 11:18 a.m. UTC | #28
On Fri, Mar 11, 2016 at 11:55:49AM +0100, Maxime Ripard wrote:
> On Fri, Mar 11, 2016 at 03:39:02PM +0530, Vinod Koul wrote:
> > On Fri, Mar 11, 2016 at 10:45:52AM +0100, Boris Brezillon wrote:
> > > On Fri, 11 Mar 2016 11:56:07 +0530
> > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > 
> > > > On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote:
> > > > > On Tue, 8 Mar 2016 08:25:47 +0530
> > > > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > > > > 
> > > > > > Why does dmaengine need to wait? Can you explain that
> > > > > 
> > > > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1
> > > > > for the NAND use case it does not work. So I guess it is somehow
> > > > > related to how the DRQ line is controlled on the device side...
> > > > 
> > > > Is the WAIT cycle different for different usages or same for all
> > > > usages/channels?
> > > > 
> > > 
> > > In Allwinner BSP they adapt it on a per slave device basis, but since
> > > DMA channels are dynamically allocated, you can't know in advance which
> > > physical channel will be attached to a specific device.
> > 
> > And we have the correct values availble in datasheet for all usages
> 
> No, we don't.
> 
> If you look at the datasheet in question, page 169.
> https://github.com/allwinner-zh/documents/blob/master/A20/A20_User_Manual_v1.4_20150510.pdf
> 
> This is the only documentation we have. And as you can see, it is very
> sparse (and that's an understament).
> 
> So we cannot make that assumption, so far the values have been found
> through trial and error for the devices in question.
> 
> > > Another option I considered was adding a new cell to the sun4i DT
> > > binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm
> > > not sure adding that to the DT is a good idea (not to mention that it
> > > would break DT ABI again, and given the last discussions on this topic,
> > > I'm not sure it's a good idea :-/).
> > 
> > Yes i was veering towards DT as well. This is a new property so ABI rules
> > wont break as long as driver still works with old properties.
> 
> Yeah, we can always default to our current hardcoded value if the
> property is missing. And since no-one is using the engine at the
> moment anyway, so it's not really a big deal.
> 
> > But this nees to be property for clients and not driver. Client can then
> > program these
> 
> Yes, totally. The question here is how the clients give that
> information to the driver.

For this part am not worried. If we can generalize this then we add to
dma_slave_config. Otherwise an exported symbol from driver should be fine.
Vinod Koul March 11, 2016, 11:21 a.m. UTC | #29
On Fri, Mar 11, 2016 at 11:26:31AM +0100, Boris Brezillon wrote:
> On Fri, 11 Mar 2016 15:36:07 +0530
> Vinod Koul <vinod.koul@intel.com> wrote:
> 
> > On Fri, Mar 11, 2016 at 10:40:55AM +0100, Boris Brezillon wrote:
> > > On Fri, 11 Mar 2016 11:54:52 +0530
> > > Vinod Koul <vinod.koul@intel.com> wrote:
> > > 
> > > > On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote:
> > > > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config
> > > > > > > > > > + *
> > > > > > > > > > + * @para: contains information about block size and time before checking
> > > > > > > > > > + *	  DRQ line. This is device specific and only applicable to dedicated
> > > > > > > > > > + *	  DMA channels
> > > > > > > > > 
> > > > > > > > > What information, can you elobrate.. And why can't you use existing
> > > > > > > > > dma_slave_config for this?
> > > > > > > > 
> > > > > > > > Block size is related to the device FIFO size. I guess it allows the
> > > > > > > > DMA channel to launch a transfer of X bytes without having to check the
> > > > > > > > DRQ line (the line telling the DMA engine it can transfer more data
> > > > > > > > to/from the device). The wait cycles information is apparently related
> > > > > > > > to the number of clks the engine should wait before polling/checking
> > > > > > > > the DRQ line status between each block transfer. I'm not sure what it
> > > > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP,
> > > > > > > > Allwinner tweak that depending on the device.
> > > > > > 
> > > > > > we already have block size aka src/dst_maxburst, why not use that one.
> > > > > 
> > > > > Okay, but then remains the question "how should we choose the real burst
> > > > > size?". The block size described in Allwinner datasheet is not the
> > > > > number of words you will transmit without being preempted by other
> > > > > master -> slave requests, it's the number of bytes that can be
> > > > > transmitted without checking the DRQ line.
> > > > > IOW, block_size = burst_size * X
> > > > 
> > > > Thats fine, API expects words for this and also a width value. Client shoudl
> > > > pass both and for programming you should use bytes converted from words and
> > > > width.
> > > > 
> > > 
> > > Not sure I get what you mean. Are you suggesting to add new fields to
> > > the dma_slave_config struct to describe this block concept, or should
> > 
> > No
> > 
> > > we pass it through ->xxx_burstsize, and try to guess the real burstsize?
> > 
> > Pass the real burstsize in words
> > 
> > > In the latter case, you still haven't answered my question: how should
> > > we choose the burstsize?
> > 
> > From word value convert to bytes and program HW
> > 
> > burst(in bytes) = burst (in words ) * buswidth;
> > 
> 
> 
> Except, as already explained, the blocksize and burstsize concepts are
> not exactly the same, and the sunxi engine expect both to be defined.
> So let's take a real example to illustrate my question:
> 
> For the NAND use case, here is my DMA channel setup:
> 
> buswidth (or wordsize) = 4 bytes
> burstsize = 4 words (32 bytes)
> blocksize = 128 bytes
> 
> Here, you can see that blocksize = 4 * burstsize, and again, burstsize
> and blocksize are not encoding the same thing. So, assuming we use
> ->src/dst_burstsize to encode the blocksize in our case, how should we
> deduce the real burstsize (which still needs to be configured in the
> engine).

Oh, i was somehow under the impression they are same! Then we can't use
blocksize here, pls pass burst and width properly.

How is block size calculated?
Maxime Ripard March 14, 2016, 11:46 a.m. UTC | #30
On Fri, Mar 11, 2016 at 04:48:26PM +0530, Vinod Koul wrote:
> > > But this nees to be property for clients and not driver. Client can then
> > > program these
> > 
> > Yes, totally. The question here is how the clients give that
> > information to the driver.
> 
> For this part am not worried. If we can generalize this then we add to
> dma_slave_config. Otherwise an exported symbol from driver should be fine.

It's actually what we would like to avoid.

We have two potential provider driver that would need such an
interface, and we have customer drivers that would be able to use any
of these two, depending on which SoCs we're talking about.

Maintaining some logic in each and every driver in that case to know
which one of this symbol is to be called seems counterproductive and
painful.

Maxime
Vinod Koul March 16, 2016, 3:22 a.m. UTC | #31
On Mon, Mar 14, 2016 at 12:46:41PM +0100, Maxime Ripard wrote:
> On Fri, Mar 11, 2016 at 04:48:26PM +0530, Vinod Koul wrote:
> > > > But this nees to be property for clients and not driver. Client can then
> > > > program these
> > > 
> > > Yes, totally. The question here is how the clients give that
> > > information to the driver.
> > 
> > For this part am not worried. If we can generalize this then we add to
> > dma_slave_config. Otherwise an exported symbol from driver should be fine.
> 
> It's actually what we would like to avoid.
> 
> We have two potential provider driver that would need such an
> interface, and we have customer drivers that would be able to use any
> of these two, depending on which SoCs we're talking about.
> 
> Maintaining some logic in each and every driver in that case to know
> which one of this symbol is to be called seems counterproductive and
> painful.

You didn't specify which one you want to avoid, and my guess is latter
choice and not former :)

As I said, if it's something we can use in few examples and describe
generically I do not mind adding to dma_slave_config
diff mbox

Patch

diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c
index 1661d518..e48f537 100644
--- a/drivers/dma/sun4i-dma.c
+++ b/drivers/dma/sun4i-dma.c
@@ -12,6 +12,7 @@ 
 #include <linux/bitops.h>
 #include <linux/clk.h>
 #include <linux/dmaengine.h>
+#include <linux/dma/sun4i-dma.h>
 #include <linux/dmapool.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
@@ -138,6 +139,7 @@  struct sun4i_dma_pchan {
 struct sun4i_dma_vchan {
 	struct virt_dma_chan		vc;
 	struct dma_slave_config		cfg;
+	struct sun4i_dma_chan_config	scfg;
 	struct sun4i_dma_pchan		*pchan;
 	struct sun4i_dma_promise	*processing;
 	struct sun4i_dma_contract	*contract;
@@ -779,7 +781,7 @@  sun4i_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	u8 ram_type, io_mode, linear_mode;
 	struct scatterlist *sg;
 	dma_addr_t srcaddr, dstaddr;
-	u32 endpoints, para;
+	u32 endpoints;
 	int i;
 
 	if (!sgl)
@@ -825,17 +827,6 @@  sun4i_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 			dstaddr = sg_dma_address(sg);
 		}
 
-		/*
-		 * These are the magic DMA engine timings that keep SPI going.
-		 * I haven't seen any interface on DMAEngine to configure
-		 * timings, and so far they seem to work for everything we
-		 * support, so I've kept them here. I don't know if other
-		 * devices need different timings because, as usual, we only
-		 * have the "para" bitfield meanings, but no comment on what
-		 * the values should be when doing a certain operation :|
-		 */
-		para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS;
-
 		/* And make a suitable promise */
 		if (vchan->is_dedicated)
 			promise = generate_ddma_promise(chan, srcaddr, dstaddr,
@@ -850,7 +841,7 @@  sun4i_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 			return NULL; /* TODO: should we free everything? */
 
 		promise->cfg |= endpoints;
-		promise->para = para;
+		promise->para = vchan->scfg.para;
 
 		/* Then add it to the contract */
 		list_add_tail(&promise->list, &contract->demands);
@@ -908,6 +899,21 @@  static int sun4i_dma_config(struct dma_chan *chan,
 	return 0;
 }
 
+int sun4i_dma_set_chan_config(struct dma_chan *dchan,
+			      const struct sun4i_dma_chan_config *cfg)
+{
+	struct sun4i_dma_vchan *vchan = to_sun4i_dma_vchan(dchan);
+
+	if (!vchan->is_dedicated)
+		return -ENOTSUPP;
+
+	/* TODO: control cfg value */
+	vchan->scfg = *cfg;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sun4i_dma_set_chan_config);
+
 static struct dma_chan *sun4i_dma_of_xlate(struct of_phandle_args *dma_spec,
 					   struct of_dma *ofdma)
 {
@@ -1206,6 +1212,18 @@  static int sun4i_dma_probe(struct platform_device *pdev)
 		spin_lock_init(&vchan->vc.lock);
 		vchan->vc.desc_free = sun4i_dma_free_contract;
 		vchan_init(&vchan->vc, &priv->slave);
+
+		/*
+		 * These are the magic DMA engine timings that keep SPI going.
+		 * I haven't seen any interface on DMAEngine to configure
+		 * timings, and so far they seem to work for everything we
+		 * support, so I've kept them here. I don't know if other
+		 * devices need different timings because, as usual, we only
+		 * have the "para" bitfield meanings, but no comment on what
+		 * the values should be when doing a certain operation :|
+		 */
+		vchan->scfg.para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS;
+
 	}
 
 	ret = clk_prepare_enable(priv->clk);
diff --git a/include/linux/dma/sun4i-dma.h b/include/linux/dma/sun4i-dma.h
new file mode 100644
index 0000000..f643539
--- /dev/null
+++ b/include/linux/dma/sun4i-dma.h
@@ -0,0 +1,38 @@ 
+/*
+ * Sun4i DMA Engine drivers support header file
+ *
+ * Copyright (C) 2016 Free Electrons. All rights reserved.
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _SUN4I_DMA_H
+#define _SUN4I_DMA_H
+
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+
+/* Dedicated DMA parameter register layout */
+#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n)	(((n) - 1) << 24)
+#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n)	(((n) - 1) << 16)
+#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n)	(((n) - 1) << 8)
+#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n)	(((n) - 1) << 0)
+
+/**
+ * struct sun4i_dma_chan_config - DMA channel config
+ *
+ * @para: contains information about block size and time before checking
+ *	  DRQ line. This is device specific and only applicable to dedicated
+ *	  DMA channels
+ */
+struct sun4i_dma_chan_config {
+	u32 para;
+};
+
+int sun4i_dma_set_chan_config(struct dma_chan *dchan,
+			      const struct sun4i_dma_chan_config *cfg);
+
+#endif /* _SUN4I_DMA_H */