Message ID | 1457344771-12946-1-git-send-email-boris.brezillon@free-electrons.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > +/* Dedicated DMA parameter register layout */ > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > + > +/** > + * struct sun4i_dma_chan_config - DMA channel config > + * > + * @para: contains information about block size and time before checking > + * DRQ line. This is device specific and only applicable to dedicated > + * DMA channels What information, can you elobrate.. And why can't you use existing dma_slave_config for this? > + */ > +struct sun4i_dma_chan_config { > + u32 para; > +}; > + > +int sun4i_dma_set_chan_config(struct dma_chan *dchan, > + const struct sun4i_dma_chan_config *cfg); > + > +#endif /* _SUN4I_DMA_H */
Hi Vinod, On Mon, 7 Mar 2016 20:24:29 +0530 Vinod Koul <vinod.koul@intel.com> wrote: > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > > +/* Dedicated DMA parameter register layout */ > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > > + > > +/** > > + * struct sun4i_dma_chan_config - DMA channel config > > + * > > + * @para: contains information about block size and time before checking > > + * DRQ line. This is device specific and only applicable to dedicated > > + * DMA channels > > What information, can you elobrate.. And why can't you use existing > dma_slave_config for this? Block size is related to the device FIFO size. I guess it allows the DMA channel to launch a transfer of X bytes without having to check the DRQ line (the line telling the DMA engine it can transfer more data to/from the device). The wait cycles information is apparently related to the number of clks the engine should wait before polling/checking the DRQ line status between each block transfer. I'm not sure what it saves to put WAIT_CYCLES() to something != 1, but in their BSP, Allwinner tweak that depending on the device. Note that I'd be happy if the above configuration could go into the generic dma_slave_config struct. This way we could avoid per-engine specific APIs. Best Regards, Boris
On Mon, 2016-03-07 at 10:59 +0100, Boris Brezillon wrote: > Some drivers might need to tweak the block size and wait cycles > values > to get better performances. > Create and export the sun4i_dma_set_chan_config() to do that. > > Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com> > --- > drivers/dma/sun4i-dma.c | 44 ++++++++++++++++++++++++++++++--- > ---------- > include/linux/dma/sun4i-dma.h | 38 > +++++++++++++++++++++++++++++++++++++ > 2 files changed, 69 insertions(+), 13 deletions(-) > create mode 100644 include/linux/dma/sun4i-dma.h > > diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c > index 1661d518..e48f537 100644 > --- a/drivers/dma/sun4i-dma.c > +++ b/drivers/dma/sun4i-dma.c > @@ -12,6 +12,7 @@ > #include <linux/bitops.h> > #include <linux/clk.h> > #include <linux/dmaengine.h> > +#include <linux/dma/sun4i-dma.h> > #include <linux/dmapool.h> > #include <linux/interrupt.h> > #include <linux/module.h> > @@ -138,6 +139,7 @@ struct sun4i_dma_pchan { > struct sun4i_dma_vchan { > struct virt_dma_chan vc; > struct dma_slave_config cfg; > + struct sun4i_dma_chan_config scfg; > struct sun4i_dma_pchan *pchan; > struct sun4i_dma_promise *processing; > struct sun4i_dma_contract *contract; > @@ -779,7 +781,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, > struct scatterlist *sgl, > u8 ram_type, io_mode, linear_mode; > struct scatterlist *sg; > dma_addr_t srcaddr, dstaddr; > - u32 endpoints, para; > + u32 endpoints; > int i; > > if (!sgl) > @@ -825,17 +827,6 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, > struct scatterlist *sgl, > dstaddr = sg_dma_address(sg); > } > > - /* > - * These are the magic DMA engine timings that keep > SPI going. > - * I haven't seen any interface on DMAEngine to > configure > - * timings, and so far they seem to work for > everything we > - * support, so I've kept them here. I don't know if > other > - * devices need different timings because, as usual, > we only > - * have the "para" bitfield meanings, but no comment > on what > - * the values should be when doing a certain > operation :| > - */ > - para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS; > - > /* And make a suitable promise */ > if (vchan->is_dedicated) > promise = generate_ddma_promise(chan, > srcaddr, dstaddr, > @@ -850,7 +841,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, > struct scatterlist *sgl, > return NULL; /* TODO: should we free > everything? */ > > promise->cfg |= endpoints; > - promise->para = para; > + promise->para = vchan->scfg.para; > > /* Then add it to the contract */ > list_add_tail(&promise->list, &contract->demands); > @@ -908,6 +899,21 @@ static int sun4i_dma_config(struct dma_chan > *chan, > return 0; > } > > +int sun4i_dma_set_chan_config(struct dma_chan *dchan, > + const struct sun4i_dma_chan_config > *cfg) > +{ > + struct sun4i_dma_vchan *vchan = to_sun4i_dma_vchan(dchan); > + > + if (!vchan->is_dedicated) > + return -ENOTSUPP; > + > + /* TODO: control cfg value */ > + vchan->scfg = *cfg; > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(sun4i_dma_set_chan_config); > + > static struct dma_chan *sun4i_dma_of_xlate(struct of_phandle_args > *dma_spec, > struct of_dma *ofdma) > { > @@ -1206,6 +1212,18 @@ static int sun4i_dma_probe(struct > platform_device *pdev) > spin_lock_init(&vchan->vc.lock); > vchan->vc.desc_free = sun4i_dma_free_contract; > vchan_init(&vchan->vc, &priv->slave); > + > + /* > + * These are the magic DMA engine timings that keep > SPI going. > + * I haven't seen any interface on DMAEngine to > configure > + * timings, and so far they seem to work for > everything we > + * support, so I've kept them here. I don't know if > other > + * devices need different timings because, as usual, > we only > + * have the "para" bitfield meanings, but no comment > on what > + * the values should be when doing a certain > operation :| > + */ > + vchan->scfg.para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS; Does SPI refer the Serial Peripheral Interface? If yes, then I would point out that current sun4i SPI driver doesn't actually use DMA [1] http://lists.infradead.org/pipermail/linux-arm-kernel/2016-February/411 722.html > + > } > > ret = clk_prepare_enable(priv->clk); > diff --git a/include/linux/dma/sun4i-dma.h b/include/linux/dma/sun4i > -dma.h > new file mode 100644 > index 0000000..f643539 > --- /dev/null > +++ b/include/linux/dma/sun4i-dma.h > @@ -0,0 +1,38 @@ > +/* > + * Sun4i DMA Engine drivers support header file > + * > + * Copyright (C) 2016 Free Electrons. All rights reserved. > + * > + * This is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published > by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + */ > + > +#ifndef _SUN4I_DMA_H > +#define _SUN4I_DMA_H > + > +#include <linux/dma-mapping.h> > +#include <linux/dmaengine.h> > + > +/* Dedicated DMA parameter register layout */ > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << > 24) > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > + > +/** > + * struct sun4i_dma_chan_config - DMA channel config > + * > + * @para: contains information about block size and time before > checking > + * DRQ line. This is device specific and only applicable to > dedicated > + * DMA channels > + */ > +struct sun4i_dma_chan_config { > + u32 para; > +}; > + > +int sun4i_dma_set_chan_config(struct dma_chan *dchan, > + const struct sun4i_dma_chan_config > *cfg); > + > +#endif /* _SUN4I_DMA_H */ > -- > 2.1.4 >
Hi Priit, On Mon, 07 Mar 2016 17:30:41 +0200 Priit Laes <plaes@plaes.org> wrote: > On Mon, 2016-03-07 at 10:59 +0100, Boris Brezillon wrote: > > Some drivers might need to tweak the block size and wait cycles > > values > > to get better performances. > > Create and export the sun4i_dma_set_chan_config() to do that. > > > > Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com> > > --- > > drivers/dma/sun4i-dma.c | 44 ++++++++++++++++++++++++++++++--- > > ---------- > > include/linux/dma/sun4i-dma.h | 38 > > +++++++++++++++++++++++++++++++++++++ > > 2 files changed, 69 insertions(+), 13 deletions(-) > > create mode 100644 include/linux/dma/sun4i-dma.h > > > > diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c > > index 1661d518..e48f537 100644 > > --- a/drivers/dma/sun4i-dma.c > > +++ b/drivers/dma/sun4i-dma.c > > @@ -12,6 +12,7 @@ > > #include <linux/bitops.h> > > #include <linux/clk.h> > > #include <linux/dmaengine.h> > > +#include <linux/dma/sun4i-dma.h> > > #include <linux/dmapool.h> > > #include <linux/interrupt.h> > > #include <linux/module.h> > > @@ -138,6 +139,7 @@ struct sun4i_dma_pchan { > > struct sun4i_dma_vchan { > > struct virt_dma_chan vc; > > struct dma_slave_config cfg; > > + struct sun4i_dma_chan_config scfg; > > struct sun4i_dma_pchan *pchan; > > struct sun4i_dma_promise *processing; > > struct sun4i_dma_contract *contract; > > @@ -779,7 +781,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, > > struct scatterlist *sgl, > > u8 ram_type, io_mode, linear_mode; > > struct scatterlist *sg; > > dma_addr_t srcaddr, dstaddr; > > - u32 endpoints, para; > > + u32 endpoints; > > int i; > > > > if (!sgl) > > @@ -825,17 +827,6 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, > > struct scatterlist *sgl, > > dstaddr = sg_dma_address(sg); > > } > > > > - /* > > - * These are the magic DMA engine timings that keep > > SPI going. > > - * I haven't seen any interface on DMAEngine to > > configure > > - * timings, and so far they seem to work for > > everything we > > - * support, so I've kept them here. I don't know if > > other > > - * devices need different timings because, as usual, > > we only > > - * have the "para" bitfield meanings, but no comment > > on what > > - * the values should be when doing a certain > > operation :| > > - */ > > - para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS; > > - > > /* And make a suitable promise */ > > if (vchan->is_dedicated) > > promise = generate_ddma_promise(chan, > > srcaddr, dstaddr, > > @@ -850,7 +841,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, > > struct scatterlist *sgl, > > return NULL; /* TODO: should we free > > everything? */ > > > > promise->cfg |= endpoints; > > - promise->para = para; > > + promise->para = vchan->scfg.para; > > > > /* Then add it to the contract */ > > list_add_tail(&promise->list, &contract->demands); > > @@ -908,6 +899,21 @@ static int sun4i_dma_config(struct dma_chan > > *chan, > > return 0; > > } > > > > +int sun4i_dma_set_chan_config(struct dma_chan *dchan, > > + const struct sun4i_dma_chan_config > > *cfg) > > +{ > > + struct sun4i_dma_vchan *vchan = to_sun4i_dma_vchan(dchan); > > + > > + if (!vchan->is_dedicated) > > + return -ENOTSUPP; > > + > > + /* TODO: control cfg value */ > > + vchan->scfg = *cfg; > > + > > + return 0; > > +} > > +EXPORT_SYMBOL_GPL(sun4i_dma_set_chan_config); > > + > > static struct dma_chan *sun4i_dma_of_xlate(struct of_phandle_args > > *dma_spec, > > struct of_dma *ofdma) > > { > > @@ -1206,6 +1212,18 @@ static int sun4i_dma_probe(struct > > platform_device *pdev) > > spin_lock_init(&vchan->vc.lock); > > vchan->vc.desc_free = sun4i_dma_free_contract; > > vchan_init(&vchan->vc, &priv->slave); > > + > > + /* > > + * These are the magic DMA engine timings that keep > > SPI going. > > + * I haven't seen any interface on DMAEngine to > > configure > > + * timings, and so far they seem to work for > > everything we > > + * support, so I've kept them here. I don't know if > > other > > + * devices need different timings because, as usual, > > we only > > + * have the "para" bitfield meanings, but no comment > > on what > > + * the values should be when doing a certain > > operation :| > > + */ > > + vchan->scfg.para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS; > > Does SPI refer the Serial Peripheral Interface? > > If yes, then I would point out that current sun4i SPI driver doesn't > actually use DMA [1] > > http://lists.infradead.org/pipermail/linux-arm-kernel/2016-February/411 > 722.html I just moved this assignment and the associated comment in the driver, so maybe we should ask Emilio why he thinks SPI config should be the default one, and how he tested it... Best Regards, Boris
Hi, El 07/03/16 a las 12:47, Boris Brezillon escribió: (...) >> Does SPI refer the Serial Peripheral Interface? >> >> If yes, then I would point out that current sun4i SPI driver doesn't >> actually use DMA [1] >> >> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-February/411 >> 722.html > > I just moved this assignment and the associated comment in the driver, > so maybe we should ask Emilio why he thinks SPI config should be the > default one, and how he tested it... When I was working on the dmaengine driver, I needed a way to test mem<->dev and mem<->mem transfers. I used dmatest.ko for the latter, and SPI was a good fit for the former as I had a logic analyzer. That's why there's patches to support DMA on it (but, as Priit pointed out, are not in mainline yet). At first SPI was acting weird when using DMA, but the problems went away when configuring the timings with these magic values you see on the driver today. These timings turned out to also work for audio, so there was no need for a mechanism to configure them. And that's basically the story behind SUN4I_DDMA_MAGIC_SPI_PARAMETERS :) Cheers, Emilio
On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: > Hi Vinod, > > On Mon, 7 Mar 2016 20:24:29 +0530 > Vinod Koul <vinod.koul@intel.com> wrote: > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > > > +/* Dedicated DMA parameter register layout */ > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > > > + > > > +/** > > > + * struct sun4i_dma_chan_config - DMA channel config > > > + * > > > + * @para: contains information about block size and time before checking > > > + * DRQ line. This is device specific and only applicable to dedicated > > > + * DMA channels > > > > What information, can you elobrate.. And why can't you use existing > > dma_slave_config for this? > > Block size is related to the device FIFO size. I guess it allows the > DMA channel to launch a transfer of X bytes without having to check the > DRQ line (the line telling the DMA engine it can transfer more data > to/from the device). The wait cycles information is apparently related > to the number of clks the engine should wait before polling/checking > the DRQ line status between each block transfer. I'm not sure what it > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > Allwinner tweak that depending on the device. > > Note that I'd be happy if the above configuration could go into the > generic dma_slave_config struct. This way we could avoid per-engine > specific APIs. And I'd really like to avoid that too. That will avoid to cripple the consumer drivers that might be using any of the two. Thanks! Maxime
On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: > > Hi Vinod, > > > > On Mon, 7 Mar 2016 20:24:29 +0530 > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > > > > +/* Dedicated DMA parameter register layout */ > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > > > > + > > > > +/** > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > + * > > > > + * @para: contains information about block size and time before checking > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > + * DMA channels > > > > > > What information, can you elobrate.. And why can't you use existing > > > dma_slave_config for this? > > > > Block size is related to the device FIFO size. I guess it allows the > > DMA channel to launch a transfer of X bytes without having to check the > > DRQ line (the line telling the DMA engine it can transfer more data > > to/from the device). The wait cycles information is apparently related > > to the number of clks the engine should wait before polling/checking > > the DRQ line status between each block transfer. I'm not sure what it > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > Allwinner tweak that depending on the device. we already have block size aka src/dst_maxburst, why not use that one. Why does dmaengine need to wait? Can you explain that > > Note that I'd be happy if the above configuration could go into the > > generic dma_slave_config struct. This way we could avoid per-engine > > specific APIs. > > And I'd really like to avoid that too. That will avoid to cripple the > consumer drivers that might be using any of the two. If it is fairly genric property we should add, otherwise yes we don't want that
On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote: > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: Also just noticed the subsystem name on this is not correct, pls fix that in subsequent posting
On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote: > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: > > > Hi Vinod, > > > > > > On Mon, 7 Mar 2016 20:24:29 +0530 > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > > > > > +/* Dedicated DMA parameter register layout */ > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > > > > > + > > > > > +/** > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > + * > > > > > + * @para: contains information about block size and time before checking > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > + * DMA channels > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > dma_slave_config for this? > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > DMA channel to launch a transfer of X bytes without having to check the > > > DRQ line (the line telling the DMA engine it can transfer more data > > > to/from the device). The wait cycles information is apparently related > > > to the number of clks the engine should wait before polling/checking > > > the DRQ line status between each block transfer. I'm not sure what it > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > Allwinner tweak that depending on the device. > > we already have block size aka src/dst_maxburst, why not use that one. I'm not sure it's really the same thing. The DMA controller also has a burst parameter, that is either 1 byte or 8 bytes, and ends up being different from this one. > Why does dmaengine need to wait? Can you explain that We have no idea, we thought you might have one :) It doesn't really makes sense to us, but it does have a significant impact on the throughput. Maxime
Hi, On 08-03-16 08:51, Maxime Ripard wrote: > On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote: >> On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: >>> On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: >>>> Hi Vinod, >>>> >>>> On Mon, 7 Mar 2016 20:24:29 +0530 >>>> Vinod Koul <vinod.koul@intel.com> wrote: >>>> >>>>> On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: >>>>>> +/* Dedicated DMA parameter register layout */ >>>>>> +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) >>>>>> +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) >>>>>> +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) >>>>>> +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) >>>>>> + >>>>>> +/** >>>>>> + * struct sun4i_dma_chan_config - DMA channel config >>>>>> + * >>>>>> + * @para: contains information about block size and time before checking >>>>>> + * DRQ line. This is device specific and only applicable to dedicated >>>>>> + * DMA channels >>>>> >>>>> What information, can you elobrate.. And why can't you use existing >>>>> dma_slave_config for this? >>>> >>>> Block size is related to the device FIFO size. I guess it allows the >>>> DMA channel to launch a transfer of X bytes without having to check the >>>> DRQ line (the line telling the DMA engine it can transfer more data >>>> to/from the device). The wait cycles information is apparently related >>>> to the number of clks the engine should wait before polling/checking >>>> the DRQ line status between each block transfer. I'm not sure what it >>>> saves to put WAIT_CYCLES() to something != 1, but in their BSP, >>>> Allwinner tweak that depending on the device. >> >> we already have block size aka src/dst_maxburst, why not use that one. > > I'm not sure it's really the same thing. The DMA controller also has a > burst parameter, that is either 1 byte or 8 bytes, and ends up being > different from this one. > >> Why does dmaengine need to wait? Can you explain that > > We have no idea, we thought you might have one :) > > It doesn't really makes sense to us, but it does have a significant > impact on the throughput. <wild speculation> I see 2 possible reasons why waiting till checking for drq can help: 1) A lot of devices have an internal fifo hooked up to a single mmio data register which gets read using the general purpose dma-engine, it allows this fifo to fill, and thus do burst transfers (We've seen similar issues with the scanout engine for the display which has its own dma engine, and doing larger transfers helps a lot). 2) Physical memory on the sunxi SoCs is (often) divided into banks with a shared data / address bus doing bank-switches is expensive, so this wait cycles may introduce latency which allows a user of another bank to complete its RAM accesses before the dma engine forces a bank switch, which ends up avoiding a lot of (interleaved) bank switches while both try to access a different banj and thus waiting makes things (much) faster in the end (again a known problem with the display scanout engine). </wild speculation> Note the differences these kinda tweaks make can be quite dramatic, when using a 1920x1080p60 hdmi output on the A10 SoC with a 16 bit memory bus (real world worst case scenario), the memory bandwidth left for userspace processes (measured through memset) almost doubles from 48 MB/s to 85 MB/s, source: http://ssvb.github.io/2014/11/11/revisiting-fullhd-x11-desktop-performance-of-the-allwinner-a10.html TL;DR: Waiting before starting DMA allows for doing larger burst transfers which ends up making things more efficient. Given this, I really expect there to be other dma-engines which have some option to wait a bit before starting/unpausing a transfer instead of starting it as soon as (more) data is available, so I think this would make a good addition to dma_slave_config. Regards, Hans
On Tue, 8 Mar 2016 08:51:31 +0100 Maxime Ripard <maxime.ripard@free-electrons.com> wrote: > On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote: > > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: > > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: > > > > Hi Vinod, > > > > > > > > On Mon, 7 Mar 2016 20:24:29 +0530 > > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > > > > > > +/* Dedicated DMA parameter register layout */ > > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > > > > > > + > > > > > > +/** > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > + * > > > > > > + * @para: contains information about block size and time before checking > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > + * DMA channels > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > dma_slave_config for this? > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > to/from the device). The wait cycles information is apparently related > > > > to the number of clks the engine should wait before polling/checking > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > Allwinner tweak that depending on the device. > > > > we already have block size aka src/dst_maxburst, why not use that one. > > I'm not sure it's really the same thing. The DMA controller also has a > burst parameter, that is either 1 byte or 8 bytes, and ends up being > different from this one. Well, that's what I understood to, but when reading more carefully the src/dst_maxburst description, it seems to match the block_size concept exposed by the sun4i dmaengine. But how should we choose the real burst size then. IIRC, in most documentation/datasheets, burst size is referred as the maximum number of words (word size depends on the selected width) a master is allowed to transfer to a slave through the bus without being interrupted by other master requests. Am I correct? > > > Why does dmaengine need to wait? Can you explain that > > We have no idea, we thought you might have one :) Yes, it's really unclear to us why this is needed. There might be some kind of contention, or maybe the slave device takes some time to put DRQ line to low state, and without these wait_cycles the dmaengine would assume some data are still available in the FIFO while there's actually no more data to retrieve. > > It doesn't really makes sense to us, but it does have a significant > impact on the throughput. I wouldn't say significant impact, but tweaking those parameters has some impact on the performances, and since it's not that complicated to implement, I thought it was worth a try, but maybe I'm wrong. Best Regards, Boris
On Tue, 2016-03-08 at 09:46 +0100, Boris Brezillon wrote: > On Tue, 8 Mar 2016 08:51:31 +0100 > Maxime Ripard <maxime.ripard@free-electrons.com> wrote: > > > On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote: > > > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: > > > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon > > > > wrote: > > > > > Hi Vinod, > > > > > > > > > > On Mon, 7 Mar 2016 20:24:29 +0530 > > > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon > > > > > > wrote: > > > > > > > +/* Dedicated DMA parameter register layout */ > > > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n > > > > > > > ) - 1) << 24) > > > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) > > > > > > > - 1) << 16) > > > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n > > > > > > > ) - 1) << 8) > > > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) > > > > > > > - 1) << 0) > > > > > > > + > > > > > > > +/** > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > > + * > > > > > > > + * @para: contains information about block size and time > > > > > > > before checking > > > > > > > + * DRQ line. This is device specific and only > > > > > > > applicable to dedicated > > > > > > > + * DMA channels > > > > > > > > > > > > What information, can you elobrate.. And why can't you use > > > > > > existing > > > > > > dma_slave_config for this? > > > > > > > > > > Block size is related to the device FIFO size. I guess it > > > > > allows the > > > > > DMA channel to launch a transfer of X bytes without having to > > > > > check the > > > > > DRQ line (the line telling the DMA engine it can transfer > > > > > more data > > > > > to/from the device). The wait cycles information is > > > > > apparently related > > > > > to the number of clks the engine should wait before > > > > > polling/checking > > > > > the DRQ line status between each block transfer. I'm not sure > > > > > what it > > > > > saves to put WAIT_CYCLES() to something != 1, but in their > > > > > BSP, > > > > > Allwinner tweak that depending on the device. > > > > > > we already have block size aka src/dst_maxburst, why not use that > > > one. > > > > I'm not sure it's really the same thing. The DMA controller also > > has a > > burst parameter, that is either 1 byte or 8 bytes, and ends up > > being > > different from this one. > > Well, that's what I understood to, but when reading more carefully > the > src/dst_maxburst description, it seems to match the block_size > concept > exposed by the sun4i dmaengine. But how should we choose the real > burst > size then. > IIRC, in most documentation/datasheets, burst size is referred as the > maximum number of words (word size depends on the selected width) a > master is allowed to transfer to a slave through the bus without > being interrupted by other master requests. > Am I correct? > > > > > > Why does dmaengine need to wait? Can you explain that > > > > We have no idea, we thought you might have one :) > > Yes, it's really unclear to us why this is needed. There might be > some > kind of contention, or maybe the slave device takes some time to put > DRQ line to low state, and without these wait_cycles the dmaengine > would assume some data are still available in the FIFO while there's > actually no more data to retrieve. > > > > > It doesn't really makes sense to us, but it does have a significant > > impact on the throughput. > > I wouldn't say significant impact, but tweaking those parameters has > some impact on the performances, and since it's not that complicated > to > implement, I thought it was worth a try, but maybe I'm wrong. Somewhat offtopic, but there was also another patchset a while ago for sun4i SPI that removed the 64 byte limit for SPI transfers (although this was DMA-less case) back then. http://lists.infradead.org/pipermail/linux-arm-kernel/2014-March/241392 .html (This patchset made it possible to use SPI-based small TFT displays via fbtft, when I tested it some time ago). As it currently stands, sun4i SPI is not using DMA and transfers are limited to 64 bytes. Päikest, Priit ;)
On Tue, Mar 08, 2016 at 08:51:31AM +0100, Maxime Ripard wrote: > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > + * > > > > > > + * @para: contains information about block size and time before checking > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > + * DMA channels > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > dma_slave_config for this? > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > to/from the device). The wait cycles information is apparently related > > > > to the number of clks the engine should wait before polling/checking > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > Allwinner tweak that depending on the device. > > > > we already have block size aka src/dst_maxburst, why not use that one. > > I'm not sure it's really the same thing. The DMA controller also has a > burst parameter, that is either 1 byte or 8 bytes, and ends up being > different from this one. Nope that is buswidth. maxburst is words which cna be sent to device FIFO. > > > Why does dmaengine need to wait? Can you explain that > > We have no idea, we thought you might have one :) Well that is hardware dependent. From DMAengine API usage we dont ahve to wait at all. We should submit next descriptor as soon as possible. > It doesn't really makes sense to us, but it does have a significant > impact on the throughput.
On Tue, Mar 08, 2016 at 09:46:25AM +0100, Boris Brezillon wrote: > On Tue, 8 Mar 2016 08:51:31 +0100 > Maxime Ripard <maxime.ripard@free-electrons.com> wrote: > > > On Tue, Mar 08, 2016 at 08:25:47AM +0530, Vinod Koul wrote: > > > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: > > > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: > > > > > Hi Vinod, > > > > > > > > > > On Mon, 7 Mar 2016 20:24:29 +0530 > > > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > > > > > > > +/* Dedicated DMA parameter register layout */ > > > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > > > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > > > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > > > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > > > > > > > + > > > > > > > +/** > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > > + * > > > > > > > + * @para: contains information about block size and time before checking > > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > > + * DMA channels > > > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > > dma_slave_config for this? > > > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > > to/from the device). The wait cycles information is apparently related > > > > > to the number of clks the engine should wait before polling/checking > > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > > Allwinner tweak that depending on the device. > > > > > > we already have block size aka src/dst_maxburst, why not use that one. > > > > I'm not sure it's really the same thing. The DMA controller also has a > > burst parameter, that is either 1 byte or 8 bytes, and ends up being > > different from this one. > > Well, that's what I understood to, but when reading more carefully the > src/dst_maxburst description, it seems to match the block_size concept > exposed by the sun4i dmaengine. But how should we choose the real burst > size then. maxburst is block size as you describe in this context > IIRC, in most documentation/datasheets, burst size is referred as the > maximum number of words (word size depends on the selected width) a > master is allowed to transfer to a slave through the bus without > being interrupted by other master requests. > Am I correct? maxburst is defined as words not bytes. Word is specfied with the src/dst_addr_width. > > > > > > Why does dmaengine need to wait? Can you explain that > > > > We have no idea, we thought you might have one :) > > Yes, it's really unclear to us why this is needed. There might be some > kind of contention, or maybe the slave device takes some time to put > DRQ line to low state, and without these wait_cycles the dmaengine > would assume some data are still available in the FIFO while there's > actually no more data to retrieve. > > > > > It doesn't really makes sense to us, but it does have a significant > > impact on the throughput. > > I wouldn't say significant impact, but tweaking those parameters has > some impact on the performances, and since it's not that complicated to > implement, I thought it was worth a try, but maybe I'm wrong. Can you guys check with HW folks and see why it is required, if that is a possiblity!
On Tue, Mar 08, 2016 at 09:42:31AM +0100, Hans de Goede wrote: > <wild speculation> > > I see 2 possible reasons why waiting till checking for drq can help: > > 1) A lot of devices have an internal fifo hooked up to a single mmio data > register which gets read using the general purpose dma-engine, it allows > this fifo to fill, and thus do burst transfers > (We've seen similar issues with the scanout engine for the display which > has its own dma engine, and doing larger transfers helps a lot). > > 2) Physical memory on the sunxi SoCs is (often) divided into banks > with a shared data / address bus doing bank-switches is expensive, so > this wait cycles may introduce latency which allows a user of another > bank to complete its RAM accesses before the dma engine forces a > bank switch, which ends up avoiding a lot of (interleaved) bank switches > while both try to access a different banj and thus waiting makes things > (much) faster in the end (again a known problem with the display > scanout engine). > > </wild speculation> > > Note the differences these kinda tweaks make can be quite dramatic, > when using a 1920x1080p60 hdmi output on the A10 SoC with a 16 bit > memory bus (real world worst case scenario), the memory bandwidth > left for userspace processes (measured through memset) almost doubles > from 48 MB/s to 85 MB/s, source: > http://ssvb.github.io/2014/11/11/revisiting-fullhd-x11-desktop-performance-of-the-allwinner-a10.html > > TL;DR: Waiting before starting DMA allows for doing larger burst > transfers which ends up making things more efficient. > > Given this, I really expect there to be other dma-engines which > have some option to wait a bit before starting/unpausing a transfer > instead of starting it as soon as (more) data is available, so I think > this would make a good addition to dma_slave_config. I tend to agree but before we do that I would like this hypothesis to be confirmed :)
> > > > It doesn't really makes sense to us, but it does have a significant > > impact on the throughput. > > I wouldn't say significant impact, but tweaking those parameters has > some impact on the performances, and since it's not that complicated to > implement, I thought it was worth a try, but maybe I'm wrong. > Hello I am working on adding DMA on the crypto driver sun4i-ss and I confirm that tweaking those parameter can increase speed. Regards LABBE Corentin
On Tue, 8 Mar 2016 08:25:47 +0530 Vinod Koul <vinod.koul@intel.com> wrote: > On Mon, Mar 07, 2016 at 09:30:24PM +0100, Maxime Ripard wrote: > > On Mon, Mar 07, 2016 at 04:08:57PM +0100, Boris Brezillon wrote: > > > Hi Vinod, > > > > > > On Mon, 7 Mar 2016 20:24:29 +0530 > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > On Mon, Mar 07, 2016 at 10:59:31AM +0100, Boris Brezillon wrote: > > > > > +/* Dedicated DMA parameter register layout */ > > > > > +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) > > > > > +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) > > > > > +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) > > > > > +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) > > > > > + > > > > > +/** > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > + * > > > > > + * @para: contains information about block size and time before checking > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > + * DMA channels > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > dma_slave_config for this? > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > DMA channel to launch a transfer of X bytes without having to check the > > > DRQ line (the line telling the DMA engine it can transfer more data > > > to/from the device). The wait cycles information is apparently related > > > to the number of clks the engine should wait before polling/checking > > > the DRQ line status between each block transfer. I'm not sure what it > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > Allwinner tweak that depending on the device. > > we already have block size aka src/dst_maxburst, why not use that one. Okay, but then remains the question "how should we choose the real burst size?". The block size described in Allwinner datasheet is not the number of words you will transmit without being preempted by other master -> slave requests, it's the number of bytes that can be transmitted without checking the DRQ line. IOW, block_size = burst_size * X
On Tue, Mar 08, 2016 at 03:35:38PM +0530, Vinod Koul wrote: > On Tue, Mar 08, 2016 at 09:42:31AM +0100, Hans de Goede wrote: > > <wild speculation> > > > > I see 2 possible reasons why waiting till checking for drq can help: > > > > 1) A lot of devices have an internal fifo hooked up to a single mmio data > > register which gets read using the general purpose dma-engine, it allows > > this fifo to fill, and thus do burst transfers > > (We've seen similar issues with the scanout engine for the display which > > has its own dma engine, and doing larger transfers helps a lot). > > > > 2) Physical memory on the sunxi SoCs is (often) divided into banks > > with a shared data / address bus doing bank-switches is expensive, so > > this wait cycles may introduce latency which allows a user of another > > bank to complete its RAM accesses before the dma engine forces a > > bank switch, which ends up avoiding a lot of (interleaved) bank switches > > while both try to access a different banj and thus waiting makes things > > (much) faster in the end (again a known problem with the display > > scanout engine). > > > > </wild speculation> > > > > Note the differences these kinda tweaks make can be quite dramatic, > > when using a 1920x1080p60 hdmi output on the A10 SoC with a 16 bit > > memory bus (real world worst case scenario), the memory bandwidth > > left for userspace processes (measured through memset) almost doubles > > from 48 MB/s to 85 MB/s, source: > > http://ssvb.github.io/2014/11/11/revisiting-fullhd-x11-desktop-performance-of-the-allwinner-a10.html > > > > TL;DR: Waiting before starting DMA allows for doing larger burst > > transfers which ends up making things more efficient. > > > > Given this, I really expect there to be other dma-engines which > > have some option to wait a bit before starting/unpausing a transfer > > instead of starting it as soon as (more) data is available, so I think > > this would make a good addition to dma_slave_config. > > I tend to agree but before we do that I would like this hypothesis to be > confirmed :) We can't confirm it, we don't have access to any documentation that might explain what this is about. Maxime
On Tue, 8 Mar 2016 08:25:47 +0530 Vinod Koul <vinod.koul@intel.com> wrote: > > Why does dmaengine need to wait? Can you explain that I don't have an answer for that one, but when I set WAIT_CYCLES to 1 for the NAND use case it does not work. So I guess it is somehow related to how the DRQ line is controlled on the device side...
On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote: > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > + * > > > > > > + * @para: contains information about block size and time before checking > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > + * DMA channels > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > dma_slave_config for this? > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > to/from the device). The wait cycles information is apparently related > > > > to the number of clks the engine should wait before polling/checking > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > Allwinner tweak that depending on the device. > > > > we already have block size aka src/dst_maxburst, why not use that one. > > Okay, but then remains the question "how should we choose the real burst > size?". The block size described in Allwinner datasheet is not the > number of words you will transmit without being preempted by other > master -> slave requests, it's the number of bytes that can be > transmitted without checking the DRQ line. > IOW, block_size = burst_size * X Thats fine, API expects words for this and also a width value. Client shoudl pass both and for programming you should use bytes converted from words and width.
On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote: > On Tue, 8 Mar 2016 08:25:47 +0530 > Vinod Koul <vinod.koul@intel.com> wrote: > > > > Why does dmaengine need to wait? Can you explain that > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1 > for the NAND use case it does not work. So I guess it is somehow > related to how the DRQ line is controlled on the device side... Is the WAIT cycle different for different usages or same for all usages/channels?
On Fri, 11 Mar 2016 11:54:52 +0530 Vinod Koul <vinod.koul@intel.com> wrote: > On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote: > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > > + * > > > > > > > + * @para: contains information about block size and time before checking > > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > > + * DMA channels > > > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > > dma_slave_config for this? > > > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > > to/from the device). The wait cycles information is apparently related > > > > > to the number of clks the engine should wait before polling/checking > > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > > Allwinner tweak that depending on the device. > > > > > > we already have block size aka src/dst_maxburst, why not use that one. > > > > Okay, but then remains the question "how should we choose the real burst > > size?". The block size described in Allwinner datasheet is not the > > number of words you will transmit without being preempted by other > > master -> slave requests, it's the number of bytes that can be > > transmitted without checking the DRQ line. > > IOW, block_size = burst_size * X > > Thats fine, API expects words for this and also a width value. Client shoudl > pass both and for programming you should use bytes converted from words and > width. > Not sure I get what you mean. Are you suggesting to add new fields to the dma_slave_config struct to describe this block concept, or should we pass it through ->xxx_burstsize, and try to guess the real burstsize? In the latter case, you still haven't answered my question: how should we choose the burstsize?
On Fri, 11 Mar 2016 11:56:07 +0530 Vinod Koul <vinod.koul@intel.com> wrote: > On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote: > > On Tue, 8 Mar 2016 08:25:47 +0530 > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > Why does dmaengine need to wait? Can you explain that > > > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1 > > for the NAND use case it does not work. So I guess it is somehow > > related to how the DRQ line is controlled on the device side... > > Is the WAIT cycle different for different usages or same for all > usages/channels? > In Allwinner BSP they adapt it on a per slave device basis, but since DMA channels are dynamically allocated, you can't know in advance which physical channel will be attached to a specific device. Another option I considered was adding a new cell to the sun4i DT binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm not sure adding that to the DT is a good idea (not to mention that it would break DT ABI again, and given the last discussions on this topic, I'm not sure it's a good idea :-/).
On Fri, Mar 11, 2016 at 10:40:55AM +0100, Boris Brezillon wrote: > On Fri, 11 Mar 2016 11:54:52 +0530 > Vinod Koul <vinod.koul@intel.com> wrote: > > > On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote: > > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > > > + * > > > > > > > > + * @para: contains information about block size and time before checking > > > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > > > + * DMA channels > > > > > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > > > dma_slave_config for this? > > > > > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > > > to/from the device). The wait cycles information is apparently related > > > > > > to the number of clks the engine should wait before polling/checking > > > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > > > Allwinner tweak that depending on the device. > > > > > > > > we already have block size aka src/dst_maxburst, why not use that one. > > > > > > Okay, but then remains the question "how should we choose the real burst > > > size?". The block size described in Allwinner datasheet is not the > > > number of words you will transmit without being preempted by other > > > master -> slave requests, it's the number of bytes that can be > > > transmitted without checking the DRQ line. > > > IOW, block_size = burst_size * X > > > > Thats fine, API expects words for this and also a width value. Client shoudl > > pass both and for programming you should use bytes converted from words and > > width. > > > > Not sure I get what you mean. Are you suggesting to add new fields to > the dma_slave_config struct to describe this block concept, or should No > we pass it through ->xxx_burstsize, and try to guess the real burstsize? Pass the real burstsize in words > In the latter case, you still haven't answered my question: how should > we choose the burstsize? From word value convert to bytes and program HW burst(in bytes) = burst (in words ) * buswidth;
On Fri, Mar 11, 2016 at 10:45:52AM +0100, Boris Brezillon wrote: > On Fri, 11 Mar 2016 11:56:07 +0530 > Vinod Koul <vinod.koul@intel.com> wrote: > > > On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote: > > > On Tue, 8 Mar 2016 08:25:47 +0530 > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > > Why does dmaengine need to wait? Can you explain that > > > > > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1 > > > for the NAND use case it does not work. So I guess it is somehow > > > related to how the DRQ line is controlled on the device side... > > > > Is the WAIT cycle different for different usages or same for all > > usages/channels? > > > > In Allwinner BSP they adapt it on a per slave device basis, but since > DMA channels are dynamically allocated, you can't know in advance which > physical channel will be attached to a specific device. And we have the correct values availble in datasheet for all usages > Another option I considered was adding a new cell to the sun4i DT > binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm > not sure adding that to the DT is a good idea (not to mention that it > would break DT ABI again, and given the last discussions on this topic, > I'm not sure it's a good idea :-/). Yes i was veering towards DT as well. This is a new property so ABI rules wont break as long as driver still works with old properties. But this nees to be property for clients and not driver. Client can then program these
On Fri, 11 Mar 2016 15:36:07 +0530 Vinod Koul <vinod.koul@intel.com> wrote: > On Fri, Mar 11, 2016 at 10:40:55AM +0100, Boris Brezillon wrote: > > On Fri, 11 Mar 2016 11:54:52 +0530 > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote: > > > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > > > > + * > > > > > > > > > + * @para: contains information about block size and time before checking > > > > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > > > > + * DMA channels > > > > > > > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > > > > dma_slave_config for this? > > > > > > > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > > > > to/from the device). The wait cycles information is apparently related > > > > > > > to the number of clks the engine should wait before polling/checking > > > > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > > > > Allwinner tweak that depending on the device. > > > > > > > > > > we already have block size aka src/dst_maxburst, why not use that one. > > > > > > > > Okay, but then remains the question "how should we choose the real burst > > > > size?". The block size described in Allwinner datasheet is not the > > > > number of words you will transmit without being preempted by other > > > > master -> slave requests, it's the number of bytes that can be > > > > transmitted without checking the DRQ line. > > > > IOW, block_size = burst_size * X > > > > > > Thats fine, API expects words for this and also a width value. Client shoudl > > > pass both and for programming you should use bytes converted from words and > > > width. > > > > > > > Not sure I get what you mean. Are you suggesting to add new fields to > > the dma_slave_config struct to describe this block concept, or should > > No > > > we pass it through ->xxx_burstsize, and try to guess the real burstsize? > > Pass the real burstsize in words > > > In the latter case, you still haven't answered my question: how should > > we choose the burstsize? > > From word value convert to bytes and program HW > > burst(in bytes) = burst (in words ) * buswidth; > Except, as already explained, the blocksize and burstsize concepts are not exactly the same, and the sunxi engine expect both to be defined. So let's take a real example to illustrate my question: For the NAND use case, here is my DMA channel setup: buswidth (or wordsize) = 4 bytes burstsize = 4 words (32 bytes) blocksize = 128 bytes Here, you can see that blocksize = 4 * burstsize, and again, burstsize and blocksize are not encoding the same thing. So, assuming we use ->src/dst_burstsize to encode the blocksize in our case, how should we deduce the real burstsize (which still needs to be configured in the engine).
On Fri, Mar 11, 2016 at 03:39:02PM +0530, Vinod Koul wrote: > On Fri, Mar 11, 2016 at 10:45:52AM +0100, Boris Brezillon wrote: > > On Fri, 11 Mar 2016 11:56:07 +0530 > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote: > > > > On Tue, 8 Mar 2016 08:25:47 +0530 > > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > > > > Why does dmaengine need to wait? Can you explain that > > > > > > > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1 > > > > for the NAND use case it does not work. So I guess it is somehow > > > > related to how the DRQ line is controlled on the device side... > > > > > > Is the WAIT cycle different for different usages or same for all > > > usages/channels? > > > > > > > In Allwinner BSP they adapt it on a per slave device basis, but since > > DMA channels are dynamically allocated, you can't know in advance which > > physical channel will be attached to a specific device. > > And we have the correct values availble in datasheet for all usages No, we don't. If you look at the datasheet in question, page 169. https://github.com/allwinner-zh/documents/blob/master/A20/A20_User_Manual_v1.4_20150510.pdf This is the only documentation we have. And as you can see, it is very sparse (and that's an understament). So we cannot make that assumption, so far the values have been found through trial and error for the devices in question. > > Another option I considered was adding a new cell to the sun4i DT > > binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm > > not sure adding that to the DT is a good idea (not to mention that it > > would break DT ABI again, and given the last discussions on this topic, > > I'm not sure it's a good idea :-/). > > Yes i was veering towards DT as well. This is a new property so ABI rules > wont break as long as driver still works with old properties. Yeah, we can always default to our current hardcoded value if the property is missing. And since no-one is using the engine at the moment anyway, so it's not really a big deal. > But this nees to be property for clients and not driver. Client can then > program these Yes, totally. The question here is how the clients give that information to the driver. Maxime
On Fri, Mar 11, 2016 at 11:55:49AM +0100, Maxime Ripard wrote: > On Fri, Mar 11, 2016 at 03:39:02PM +0530, Vinod Koul wrote: > > On Fri, Mar 11, 2016 at 10:45:52AM +0100, Boris Brezillon wrote: > > > On Fri, 11 Mar 2016 11:56:07 +0530 > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > On Wed, Mar 09, 2016 at 12:06:27PM +0100, Boris Brezillon wrote: > > > > > On Tue, 8 Mar 2016 08:25:47 +0530 > > > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > > > > > > Why does dmaengine need to wait? Can you explain that > > > > > > > > > > I don't have an answer for that one, but when I set WAIT_CYCLES to 1 > > > > > for the NAND use case it does not work. So I guess it is somehow > > > > > related to how the DRQ line is controlled on the device side... > > > > > > > > Is the WAIT cycle different for different usages or same for all > > > > usages/channels? > > > > > > > > > > In Allwinner BSP they adapt it on a per slave device basis, but since > > > DMA channels are dynamically allocated, you can't know in advance which > > > physical channel will be attached to a specific device. > > > > And we have the correct values availble in datasheet for all usages > > No, we don't. > > If you look at the datasheet in question, page 169. > https://github.com/allwinner-zh/documents/blob/master/A20/A20_User_Manual_v1.4_20150510.pdf > > This is the only documentation we have. And as you can see, it is very > sparse (and that's an understament). > > So we cannot make that assumption, so far the values have been found > through trial and error for the devices in question. > > > > Another option I considered was adding a new cell to the sun4i DT > > > binding to encode these WAIT_CYCLES and BLOCK_SIZE information. But I'm > > > not sure adding that to the DT is a good idea (not to mention that it > > > would break DT ABI again, and given the last discussions on this topic, > > > I'm not sure it's a good idea :-/). > > > > Yes i was veering towards DT as well. This is a new property so ABI rules > > wont break as long as driver still works with old properties. > > Yeah, we can always default to our current hardcoded value if the > property is missing. And since no-one is using the engine at the > moment anyway, so it's not really a big deal. > > > But this nees to be property for clients and not driver. Client can then > > program these > > Yes, totally. The question here is how the clients give that > information to the driver. For this part am not worried. If we can generalize this then we add to dma_slave_config. Otherwise an exported symbol from driver should be fine.
On Fri, Mar 11, 2016 at 11:26:31AM +0100, Boris Brezillon wrote: > On Fri, 11 Mar 2016 15:36:07 +0530 > Vinod Koul <vinod.koul@intel.com> wrote: > > > On Fri, Mar 11, 2016 at 10:40:55AM +0100, Boris Brezillon wrote: > > > On Fri, 11 Mar 2016 11:54:52 +0530 > > > Vinod Koul <vinod.koul@intel.com> wrote: > > > > > > > On Wed, Mar 09, 2016 at 11:14:34AM +0100, Boris Brezillon wrote: > > > > > > > > > > + * struct sun4i_dma_chan_config - DMA channel config > > > > > > > > > > + * > > > > > > > > > > + * @para: contains information about block size and time before checking > > > > > > > > > > + * DRQ line. This is device specific and only applicable to dedicated > > > > > > > > > > + * DMA channels > > > > > > > > > > > > > > > > > > What information, can you elobrate.. And why can't you use existing > > > > > > > > > dma_slave_config for this? > > > > > > > > > > > > > > > > Block size is related to the device FIFO size. I guess it allows the > > > > > > > > DMA channel to launch a transfer of X bytes without having to check the > > > > > > > > DRQ line (the line telling the DMA engine it can transfer more data > > > > > > > > to/from the device). The wait cycles information is apparently related > > > > > > > > to the number of clks the engine should wait before polling/checking > > > > > > > > the DRQ line status between each block transfer. I'm not sure what it > > > > > > > > saves to put WAIT_CYCLES() to something != 1, but in their BSP, > > > > > > > > Allwinner tweak that depending on the device. > > > > > > > > > > > > we already have block size aka src/dst_maxburst, why not use that one. > > > > > > > > > > Okay, but then remains the question "how should we choose the real burst > > > > > size?". The block size described in Allwinner datasheet is not the > > > > > number of words you will transmit without being preempted by other > > > > > master -> slave requests, it's the number of bytes that can be > > > > > transmitted without checking the DRQ line. > > > > > IOW, block_size = burst_size * X > > > > > > > > Thats fine, API expects words for this and also a width value. Client shoudl > > > > pass both and for programming you should use bytes converted from words and > > > > width. > > > > > > > > > > Not sure I get what you mean. Are you suggesting to add new fields to > > > the dma_slave_config struct to describe this block concept, or should > > > > No > > > > > we pass it through ->xxx_burstsize, and try to guess the real burstsize? > > > > Pass the real burstsize in words > > > > > In the latter case, you still haven't answered my question: how should > > > we choose the burstsize? > > > > From word value convert to bytes and program HW > > > > burst(in bytes) = burst (in words ) * buswidth; > > > > > Except, as already explained, the blocksize and burstsize concepts are > not exactly the same, and the sunxi engine expect both to be defined. > So let's take a real example to illustrate my question: > > For the NAND use case, here is my DMA channel setup: > > buswidth (or wordsize) = 4 bytes > burstsize = 4 words (32 bytes) > blocksize = 128 bytes > > Here, you can see that blocksize = 4 * burstsize, and again, burstsize > and blocksize are not encoding the same thing. So, assuming we use > ->src/dst_burstsize to encode the blocksize in our case, how should we > deduce the real burstsize (which still needs to be configured in the > engine). Oh, i was somehow under the impression they are same! Then we can't use blocksize here, pls pass burst and width properly. How is block size calculated?
On Fri, Mar 11, 2016 at 04:48:26PM +0530, Vinod Koul wrote: > > > But this nees to be property for clients and not driver. Client can then > > > program these > > > > Yes, totally. The question here is how the clients give that > > information to the driver. > > For this part am not worried. If we can generalize this then we add to > dma_slave_config. Otherwise an exported symbol from driver should be fine. It's actually what we would like to avoid. We have two potential provider driver that would need such an interface, and we have customer drivers that would be able to use any of these two, depending on which SoCs we're talking about. Maintaining some logic in each and every driver in that case to know which one of this symbol is to be called seems counterproductive and painful. Maxime
On Mon, Mar 14, 2016 at 12:46:41PM +0100, Maxime Ripard wrote: > On Fri, Mar 11, 2016 at 04:48:26PM +0530, Vinod Koul wrote: > > > > But this nees to be property for clients and not driver. Client can then > > > > program these > > > > > > Yes, totally. The question here is how the clients give that > > > information to the driver. > > > > For this part am not worried. If we can generalize this then we add to > > dma_slave_config. Otherwise an exported symbol from driver should be fine. > > It's actually what we would like to avoid. > > We have two potential provider driver that would need such an > interface, and we have customer drivers that would be able to use any > of these two, depending on which SoCs we're talking about. > > Maintaining some logic in each and every driver in that case to know > which one of this symbol is to be called seems counterproductive and > painful. You didn't specify which one you want to avoid, and my guess is latter choice and not former :) As I said, if it's something we can use in few examples and describe generically I do not mind adding to dma_slave_config
diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c index 1661d518..e48f537 100644 --- a/drivers/dma/sun4i-dma.c +++ b/drivers/dma/sun4i-dma.c @@ -12,6 +12,7 @@ #include <linux/bitops.h> #include <linux/clk.h> #include <linux/dmaengine.h> +#include <linux/dma/sun4i-dma.h> #include <linux/dmapool.h> #include <linux/interrupt.h> #include <linux/module.h> @@ -138,6 +139,7 @@ struct sun4i_dma_pchan { struct sun4i_dma_vchan { struct virt_dma_chan vc; struct dma_slave_config cfg; + struct sun4i_dma_chan_config scfg; struct sun4i_dma_pchan *pchan; struct sun4i_dma_promise *processing; struct sun4i_dma_contract *contract; @@ -779,7 +781,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, u8 ram_type, io_mode, linear_mode; struct scatterlist *sg; dma_addr_t srcaddr, dstaddr; - u32 endpoints, para; + u32 endpoints; int i; if (!sgl) @@ -825,17 +827,6 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, dstaddr = sg_dma_address(sg); } - /* - * These are the magic DMA engine timings that keep SPI going. - * I haven't seen any interface on DMAEngine to configure - * timings, and so far they seem to work for everything we - * support, so I've kept them here. I don't know if other - * devices need different timings because, as usual, we only - * have the "para" bitfield meanings, but no comment on what - * the values should be when doing a certain operation :| - */ - para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS; - /* And make a suitable promise */ if (vchan->is_dedicated) promise = generate_ddma_promise(chan, srcaddr, dstaddr, @@ -850,7 +841,7 @@ sun4i_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, return NULL; /* TODO: should we free everything? */ promise->cfg |= endpoints; - promise->para = para; + promise->para = vchan->scfg.para; /* Then add it to the contract */ list_add_tail(&promise->list, &contract->demands); @@ -908,6 +899,21 @@ static int sun4i_dma_config(struct dma_chan *chan, return 0; } +int sun4i_dma_set_chan_config(struct dma_chan *dchan, + const struct sun4i_dma_chan_config *cfg) +{ + struct sun4i_dma_vchan *vchan = to_sun4i_dma_vchan(dchan); + + if (!vchan->is_dedicated) + return -ENOTSUPP; + + /* TODO: control cfg value */ + vchan->scfg = *cfg; + + return 0; +} +EXPORT_SYMBOL_GPL(sun4i_dma_set_chan_config); + static struct dma_chan *sun4i_dma_of_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma) { @@ -1206,6 +1212,18 @@ static int sun4i_dma_probe(struct platform_device *pdev) spin_lock_init(&vchan->vc.lock); vchan->vc.desc_free = sun4i_dma_free_contract; vchan_init(&vchan->vc, &priv->slave); + + /* + * These are the magic DMA engine timings that keep SPI going. + * I haven't seen any interface on DMAEngine to configure + * timings, and so far they seem to work for everything we + * support, so I've kept them here. I don't know if other + * devices need different timings because, as usual, we only + * have the "para" bitfield meanings, but no comment on what + * the values should be when doing a certain operation :| + */ + vchan->scfg.para = SUN4I_DDMA_MAGIC_SPI_PARAMETERS; + } ret = clk_prepare_enable(priv->clk); diff --git a/include/linux/dma/sun4i-dma.h b/include/linux/dma/sun4i-dma.h new file mode 100644 index 0000000..f643539 --- /dev/null +++ b/include/linux/dma/sun4i-dma.h @@ -0,0 +1,38 @@ +/* + * Sun4i DMA Engine drivers support header file + * + * Copyright (C) 2016 Free Electrons. All rights reserved. + * + * This is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _SUN4I_DMA_H +#define _SUN4I_DMA_H + +#include <linux/dma-mapping.h> +#include <linux/dmaengine.h> + +/* Dedicated DMA parameter register layout */ +#define SUN4I_DDMA_PARA_DST_DATA_BLK_SIZE(n) (((n) - 1) << 24) +#define SUN4I_DDMA_PARA_DST_WAIT_CYCLES(n) (((n) - 1) << 16) +#define SUN4I_DDMA_PARA_SRC_DATA_BLK_SIZE(n) (((n) - 1) << 8) +#define SUN4I_DDMA_PARA_SRC_WAIT_CYCLES(n) (((n) - 1) << 0) + +/** + * struct sun4i_dma_chan_config - DMA channel config + * + * @para: contains information about block size and time before checking + * DRQ line. This is device specific and only applicable to dedicated + * DMA channels + */ +struct sun4i_dma_chan_config { + u32 para; +}; + +int sun4i_dma_set_chan_config(struct dma_chan *dchan, + const struct sun4i_dma_chan_config *cfg); + +#endif /* _SUN4I_DMA_H */
Some drivers might need to tweak the block size and wait cycles values to get better performances. Create and export the sun4i_dma_set_chan_config() to do that. Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com> --- drivers/dma/sun4i-dma.c | 44 ++++++++++++++++++++++++++++++------------- include/linux/dma/sun4i-dma.h | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 13 deletions(-) create mode 100644 include/linux/dma/sun4i-dma.h