diff mbox series

[v3,3/3] spi: spi-qcom-qspi: Add DMA mode support

Message ID 1681481153-24036-4-git-send-email-quic_vnivarth@quicinc.com (mailing list archive)
State Superseded, archived
Headers show
Series spi: Add DMA mode support to spi-qcom-qspi | expand

Commit Message

Vijaya Krishna Nivarthi April 14, 2023, 2:05 p.m. UTC
Current driver supports only PIO mode.

HW supports DMA, so add DMA mode support to the driver
for better performance for larger xfers.

Signed-off-by: Vijaya Krishna Nivarthi <quic_vnivarth@quicinc.com>
---
v2 -> v3:
- dropped Reported-by tag
- unset fragment bit only for last cmd_desc of last xfer

v1 -> v2:
- modified commit message
- addressed kernel test robot error
- correct placement of header file includes
- added more comments
- coding style related changes
- renamed variables
- used u32/u8 instead of uint32_t/8_t
- removed unnecessary casting
- retain same MSTR_CONFIG as PIO for DMA
---
 drivers/spi/spi-qcom-qspi.c | 434 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 403 insertions(+), 31 deletions(-)

Comments

Douglas Anderson April 14, 2023, 10:05 p.m. UTC | #1
Hi,

On Fri, Apr 14, 2023 at 7:06 AM Vijaya Krishna Nivarthi
<quic_vnivarth@quicinc.com> wrote:
>
> @@ -14,7 +16,6 @@
>  #include <linux/spi/spi.h>
>  #include <linux/spi/spi-mem.h>
>
> -

Drop unrelated whitespace change.


> @@ -108,6 +110,10 @@
>  #define RD_FIFO_RESET          0x0030
>  #define RESET_FIFO             BIT(0)
>
> +#define NEXT_DMA_DESC_ADDR             0x0040
> +#define CURRENT_DMA_DESC_ADDR  0x0044
> +#define CURRENT_MEM_ADDR               0x0048

Looking at the above with a correctly configured editor (tab size=8)
the numbers don't line up. The first and 3rd have one too many tabs.


> @@ -120,6 +126,27 @@ enum qspi_dir {
>         QSPI_WRITE,
>  };
>
> +struct qspi_cmd_desc {
> +       u32 data_address;
> +       u32 next_descriptor;
> +       u32 direction:1;
> +       u32 multi_io_mode:3;
> +       u32 reserved1:4;
> +       u32 fragment:1;
> +       u32 reserved2:7;
> +       u32 length:16;
> +       /*
> +        * This marks end of HW cmd descriptor
> +        * Fields down below are for SW usage to
> +        * copy data from DMA buffer to rx buffer
> +        */
> +       u8 *bounce_src;
> +       u8 *bounce_dst;
> +       u32 bounce_length;
> +};
> +
> +#define QSPI_MAX_NUM_DESC 5

Nothing uses QSPI_MAX_NUM_DESC. Drop it.


> @@ -137,11 +164,36 @@ enum qspi_clocks {
>         QSPI_NUM_CLKS
>  };
>
> +enum qspi_xfer_mode {
> +       QSPI_INVALID,
> +       QSPI_FIFO,
> +       QSPI_DMA
> +};

Why bother with INVALID? It's either FIFO or DMA, right?


> +/* number of entries in sgt returned from spi framework that we can support */
> +#define QSPI_MAX_SG 5

Is the above a hardware limitation, or just because you are statically
allocating arrays? Please clarify in the comment. Would it make sense
to just dynamically allocate the arrays and remove the need for this
limitation?


> +/* 3 descriptors for head, aligned part and tail */
> +#define QSPI_NUM_CMD_DESC 3
> +
> +/* 2 descriptors for head, tail */
> +#define QSPI_NUM_DAT_DESC 2
> +
>  struct qcom_qspi {
>         void __iomem *base;
>         struct device *dev;
>         struct clk_bulk_data *clks;
>         struct qspi_xfer xfer;
> +       struct dma_pool *dma_cmd_pool;
> +       struct dma_pool *dma_data_pool;
> +       dma_addr_t dma_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
> +       dma_addr_t dma_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
> +       void *virt_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
> +       void *virt_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
> +       unsigned int n_cmd_desc;
> +       unsigned int n_data_desc;
> +       int xfer_mode;

Instead of "int", shouldn't xfer_mode be "enum qspi_xfer_mode"?
Although below I'm proposing that xfer_mode can just be completely
dropped from this structure.


> @@ -151,18 +203,25 @@ struct qcom_qspi {
>  static u32 qspi_buswidth_to_iomode(struct qcom_qspi *ctrl,
>                                    unsigned int buswidth)
>  {
> +       u32 ret;
> +
> +       /* for DMA we don't write to PIO_XFER_CFG register, so don't shift */
>         switch (buswidth) {
>         case 1:
> -               return SDR_1BIT << MULTI_IO_MODE_SHFT;
> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
> +               break;
>         case 2:
> -               return SDR_2BIT << MULTI_IO_MODE_SHFT;
> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_2BIT : SDR_2BIT << MULTI_IO_MODE_SHFT);
> +               break;
>         case 4:
> -               return SDR_4BIT << MULTI_IO_MODE_SHFT;
> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_4BIT : SDR_4BIT << MULTI_IO_MODE_SHFT);
> +               break;
>         default:
>                 dev_warn_once(ctrl->dev,
>                                 "Unexpected bus width: %u\n", buswidth);
> -               return SDR_1BIT << MULTI_IO_MODE_SHFT;
> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
>         }
> +       return ret;

Wouldn't it be easier to do the test once at the end? In other words,
in the switch statement "ret" never contains the shift and then at the
end:

if (ctrl->xfer_mode != QSPI_DMA)
  ret <<= MULTI_IO_MODE_SHFT;
return ret;

...or, even better, just always return the unshifted mode and do the
shifting unconditionally in qcom_qspi_pio_xfer_cfg(). Then you never
need to look at xfer_mode to decide.


> @@ -241,12 +316,16 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>                 return ret;
>         }
>
> +       avg_bw_cpu = Bps_to_icc(speed_hz);
>         /*
> -        * Set BW quota for CPU as driver supports FIFO mode only.
> -        * We don't have explicit peak requirement so keep it equal to avg_bw.
> +        * Set BW quota for CPU for FIFO to avg_bw
> +        * as we don't have explicit peak requirement.
> +        * TBD TBD TBD - change this as required for DMA.
> +        * As of now same peak requirement seems to be working.
>          */
> -       avg_bw_cpu = Bps_to_icc(speed_hz);
> -       ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, avg_bw_cpu);
> +       peak_bw_cpu = ctrl->xfer_mode == QSPI_FIFO ? avg_bw_cpu : avg_bw_cpu;
> +
> +       ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, peak_bw_cpu);

The change to this function is completely a no-op, right? You check
the mode against QSPI_FIFO but you set the "peak_bw_cpu" to the same
thing in both modes. ...and the thing you set it to is exactly the
same as the function set it to before your patch.

...so drop all the changes you made to this function.


> @@ -258,6 +337,190 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>         return 0;
>  }
>
> +/* aligned to 32 bytes, not to cross 1KB boundary */
> +#define QSPI_ALIGN_REQ         32
> +#define QSPI_BOUNDARY_REQ      1024
> +
> +static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
> +                       dma_addr_t dma_ptr, uint32_t n_bytes)

Why is "n_bytes" "uint32_t" instead of just "u32"? Please just use
"u32" consistently in this file.


> +{
> +       struct qspi_cmd_desc *virt_cmd_desc, *prev;
> +       uint8_t *virt_data_desc;
> +       dma_addr_t dma_cmd_desc, dma_data_desc;
> +
> +       if (virt_ptr && n_bytes >= QSPI_ALIGN_REQ) {
> +               dev_err(ctrl->dev,
> +                       "Exiting to avert memory overwrite, n_bytes-%d\n", n_bytes);
> +               return -ENOMEM;
> +       }
> +
> +       /* allocate for dma cmd descriptor */
> +       virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
> +               GFP_KERNEL, &dma_cmd_desc);
> +       if (!virt_cmd_desc) {
> +               dev_err(ctrl->dev,
> +                       "Could not allocate for cmd_desc\n");
> +               return -ENOMEM;
> +       }
> +       ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
> +       ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
> +       ctrl->n_cmd_desc++;
> +
> +       /* allocate for dma data descriptor if unaligned else use pre-aligned */
> +       if (virt_ptr) {
> +               virt_data_desc = (uint8_t *)dma_pool_zalloc(ctrl->dma_data_pool,
> +                       GFP_KERNEL, &dma_data_desc);
> +               if (!virt_data_desc) {
> +                       dev_err(ctrl->dev,
> +                               "Could not allocate for data_desc\n");
> +                       return -ENOMEM;
> +               }
> +               ctrl->virt_data_desc[ctrl->n_data_desc] = virt_data_desc;
> +               ctrl->dma_data_desc[ctrl->n_data_desc] = dma_data_desc;
> +               ctrl->n_data_desc++;
> +
> +               /*
> +                * for tx copy xfer data into allocated buffer
> +                * for rx setup bounce info to copy after xfer
> +                */
> +               if (ctrl->xfer.dir == QSPI_WRITE) {
> +                       memcpy(virt_data_desc, virt_ptr, n_bytes);
> +               } else {
> +                       virt_cmd_desc->bounce_src = virt_data_desc;
> +                       virt_cmd_desc->bounce_dst = virt_ptr;
> +                       virt_cmd_desc->bounce_length = n_bytes;
> +               }
> +       } else {
> +               dma_data_desc = dma_ptr;
> +       }
> +
> +       /* setup cmd descriptor */
> +       virt_cmd_desc->data_address = dma_data_desc;
> +       virt_cmd_desc->next_descriptor = 0;
> +       virt_cmd_desc->direction = ctrl->xfer.dir;
> +       virt_cmd_desc->multi_io_mode = ctrl->iomode;
> +       virt_cmd_desc->reserved1 = 0;
> +       virt_cmd_desc->fragment = ctrl->xfer.is_last ? 0 : 1;
> +       virt_cmd_desc->reserved2 = 0;
> +       virt_cmd_desc->length = n_bytes;
> +
> +       /* update previous descriptor */
> +       if (ctrl->n_cmd_desc >= 2) {
> +               prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
> +               prev->next_descriptor = dma_cmd_desc;
> +               prev->fragment = 1;
> +       }
> +
> +       return 0;
> +}
> +
> +static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
> +                               struct spi_transfer *xfer)
> +{
> +       int ret;
> +       struct sg_table *sgt;
> +       unsigned int sg_total_len = 0;
> +       dma_addr_t dma_ptr_sg;
> +       unsigned int dma_len_sg;
> +       uint32_t prolog_bytes, aligned_bytes, epilog_bytes;
> +       dma_addr_t aligned_addr;
> +       int i;
> +       uint8_t *byte_ptr;
> +
> +       if (ctrl->n_cmd_desc || ctrl->n_data_desc) {
> +               dev_err(ctrl->dev, "Remnant dma buffers cmd-%d, data-%d\n",
> +                       ctrl->n_cmd_desc, ctrl->n_data_desc);
> +               return -EIO;
> +       }
> +
> +       sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
> +       if (!sgt->nents || sgt->nents > QSPI_MAX_SG) {
> +               dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
> +               return -EINVAL;
> +       }
> +
> +       for (i = 0; i < sgt->nents; i++)
> +               sg_total_len += sg_dma_len(sgt->sgl + i);
> +       if (sg_total_len != xfer->len) {
> +               dev_err(ctrl->dev, "Data lengths mismatch\n");
> +               return -EINVAL;
> +       }
> +
> +       if (ctrl->xfer.dir == QSPI_READ)
> +               byte_ptr = xfer->rx_buf;
> +       else
> +               byte_ptr = (uint8_t *)xfer->tx_buf;
> +
> +       for (i = 0; i < sgt->nents; i++) {
> +               dma_ptr_sg = sg_dma_address(sgt->sgl + i);
> +               dma_len_sg = sg_dma_len(sgt->sgl + i);
> +
> +               aligned_addr = PTR_ALIGN(dma_ptr_sg, QSPI_ALIGN_REQ);
> +
> +               prolog_bytes = min(dma_len_sg, (unsigned int)(aligned_addr - dma_ptr_sg));
> +               if (prolog_bytes) {
> +                       ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, prolog_bytes);
> +                       if (ret)
> +                               goto cleanup;
> +                       byte_ptr += prolog_bytes;
> +               }
> +
> +               aligned_bytes = PTR_ALIGN_DOWN(dma_len_sg - prolog_bytes, QSPI_ALIGN_REQ);
> +               if (aligned_bytes) {
> +                       ret = qcom_qspi_alloc_desc(ctrl, 0, aligned_addr, aligned_bytes);
> +                       if (ret)
> +                               goto cleanup;
> +                       byte_ptr += aligned_bytes;
> +               }
> +
> +               epilog_bytes = dma_len_sg - prolog_bytes - aligned_bytes;
> +               if (epilog_bytes) {
> +                       ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, epilog_bytes);
> +                       if (ret)
> +                               goto cleanup;
> +                       byte_ptr += epilog_bytes;

While I won't claim to be an expert on DMA, the above smells wrong to
me. It looks as if you're doing a whole lot of work here that doesn't
really belong in your driver but should be in the SPI core. If I
understand correctly, the issue is that this SPI controller needs DMA
buffers to start aligned on a 32-byte boundary. To handle that, you're
doing a whole lot of manual work to copy/bounce the bits that aren't
aligned.

Having alignment requirements like this doesn't seem like it should be
that unusual, though, and that's why it feels like the logic belongs
in the SPI core. In fact, it seems like this is _supposed_ to be
handled in the SPI core, but it isn't? In "spi.h" I see
"dma_alignment" that claims to be exactly what you need. As far as I
can tell, though, the core doesn't use this? ...so I'm kinda confused.
As far as I can tell this doesn't do anything and thus anyone setting
it today is broken?

Mark can tell me if I'm giving bad advice here, but IMO you should:

1. Set your controller's "dma_alignment" property to 32.

2. Add support to the core (in spi_map_buf() and spi_unmap_buf(), I
think) to handle the bouncing.


Other notes around this:

* In your code, I see "QSPI_BOUNDARY_REQ as 1K" which confuses me. As
far as I can tell you can do DMA that crosses 1K boundaries since (I
think) "aligned_bytes" above can be up to 64K, right?

* I haven't yet figured out why exactly you need the "epilog". Are you
sure you really do, or could this be combined with the "aligned"
chunk? I think the only requirement is that the start of the transfer
is aligned, right?

* If this is done in the core in spi_map_buf(), I think you only need
a single bounce buffer that is "dma_alignment" bytes big that you can
allocate once and store with the controller, right?


> +               }
> +       }
> +       return 0;
> +
> +cleanup:
> +       dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");
> +       for (i = 0; i < ctrl->n_data_desc; i++)
> +               dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
> +                                 ctrl->dma_data_desc[i]);
> +       ctrl->n_data_desc = 0;
> +
> +       for (i = 0; i < ctrl->n_cmd_desc; i++)
> +               dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
> +                                 ctrl->dma_cmd_desc[i]);
> +       ctrl->n_cmd_desc = 0;
> +       return ret;
> +}
> +
> +static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
> +{
> +       /* Ack any previous interrupts that might be hanging around */
> +       writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_STATUS);

Do we really need the above? Maybe we can drop it and (in a separate
patch) drop the similar statement in qcom_qspi_pio_xfer()?

If this is truly needed for some reason, then it seems like in both
cases you should ack _all_ interrupts (the FIFO plus the DMA ones)
since we might be switching back and forth between the two modes and
thus any old interrupts that are "hanging around" could be from
either. ...but I think you can just drop it. If there are really any
interrupts "hanging around" we're in pretty bad shape.


> +       /* Setup new interrupts */
> +       writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_EN);
> +
> +       /* flush all writes */
> +       wmb();

Why do you need this explicit wmb()? I'm fairly sure that this is
handled automatically by the fact that you're using writel() and not
writel_relaxed(). writel() documents that it is "ordered relative to
any prior Normal memory access" and certainly it's ordered relative to
IO writes to the same device.


> +
> +       /* kick off transfer */
> +       writel((uint32_t)(uintptr_t)(ctrl->dma_cmd_desc)[0], ctrl->base + NEXT_DMA_DESC_ADDR);

It feels like there's one too many casts here. Shouldn't this just be
"(u32)(ctrl->dma_cmd_desc[0])"?


> +}
> +
> +/* Switch to DMA if transfer length exceeds this */
> +#define QSPI_MAX_BYTES_FIFO 64
> +
> +static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
> +                        struct spi_device *slv, struct spi_transfer *xfer)
> +{
> +       return xfer->len > QSPI_MAX_BYTES_FIFO ? true : false;

No need for the "? true : false". Just:

return xfer->len > QSPI_MAX_BYTES_FIFO;


> @@ -290,7 +555,25 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
>         ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
>                                           &master->cur_msg->transfers);
>         ctrl->xfer.rem_bytes = xfer->len;
> -       qcom_qspi_pio_xfer(ctrl);
> +
> +       if (qcom_qspi_can_dma(master, slv, xfer)) {

Maybe it would be better to check if either "xfer->rx_sg.nents" or
"xfer->tx_sg.nents" is non-zero. That indicates that the SPI framework
is expecting you to do DMA.


> +               ctrl->xfer_mode = QSPI_DMA;
> +               ctrl->iomode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);

Don't store iomode in the "ctrl" structure (remove it from that
struct). Just make it a local variable in qcom_qspi_setup_dma_desc()
and then pass it in to the one place that needs it:
qcom_qspi_alloc_desc()


> +               mstr_cfg |= DMA_ENABLE;
> +               writel(mstr_cfg, ctrl->base + MSTR_CONFIG);

nit: I seem to remember IO writes to the controller taking a
non-trivial amount of time. Maybe worth it to do?

if (!(mstr_cfg & DMA_ENABLE)) {
  mstr_cfg |= DMA_ENABLE;
  writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
}

...similar for the "else" case below.


> @@ -328,6 +611,40 @@ static int qcom_qspi_prepare_message(struct spi_master *master,
>         return 0;
>  }
>
> +static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
> +{
> +       /* allocate for cmd descriptors pool */
> +       ctrl->dma_cmd_pool = dma_pool_create("qspi cmd desc pool",
> +               ctrl->dev, sizeof(struct qspi_cmd_desc), 0, 0);

Instead of dma_pool_create(), use dmam_pool_create(). That looks to be
the (oddly named) devm version of the function. Then you can fully get
rid of qcom_qspi_free_dma() and also the dma_pool_destroy() in your
error handling below.

It also seems really weird that the "data" pool has such strict
alignment requirements and you do a whole ton of work to meet those
requirements, but the "cmd" pool has no alignment requirements at all.
Is this really correct?


> +       if (!ctrl->dma_cmd_pool) {
> +               dev_err(ctrl->dev, "Could not create dma pool for cmd_desc\n");

nit: no need for an error message here. You can assume that allocation
failures will already print a warning splat and you don't need another
one for this incredibly unlikely event.


> @@ -426,27 +743,63 @@ static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
>         int_status = readl(ctrl->base + MSTR_INT_STATUS);
>         writel(int_status, ctrl->base + MSTR_INT_STATUS);
>
> -       if (ctrl->xfer.dir == QSPI_WRITE) {
> -               if (int_status & WR_FIFO_EMPTY)
> -                       ret = pio_write(ctrl);
> -       } else {
> -               if (int_status & RESP_FIFO_RDY)
> -                       ret = pio_read(ctrl);
> -       }
> -
> -       if (int_status & QSPI_ERR_IRQS) {
> -               if (int_status & RESP_FIFO_UNDERRUN)
> -                       dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
> -               if (int_status & WR_FIFO_OVERRUN)
> -                       dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
> -               if (int_status & HRESP_FROM_NOC_ERR)
> -                       dev_err(ctrl->dev, "IRQ error: NOC response error\n");
> -               ret = IRQ_HANDLED;
> -       }
> -
> -       if (!ctrl->xfer.rem_bytes) {
> -               writel(0, ctrl->base + MSTR_INT_EN);
> -               spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
> +       switch (ctrl->xfer_mode) {
> +       case QSPI_FIFO:

I don't think you really need to check xfer_mode here, do you? If
xfer_mode is FIFO then only the FIFO-related interrupts are enabled.
If xfer_mode is DMA then only the DMA-related interrupts are enabled.

In fact, I think you can fully get rid of the "xfer_mode" structure
member completely. It's completely redundant.


> @@ -487,6 +840,9 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>         if (ret)
>                 return ret;
>
> +       /* set default mode to FIFO */
> +       ctrl->xfer_mode = QSPI_FIFO;

Get rid of this initialization. Above I'm suggesting getting rid of
"xfter" mode altogether, but in any case, we should be setting this
before each transfer so the initialization doesn't do anything useful.


> @@ -556,10 +923,15 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>  static void qcom_qspi_remove(struct platform_device *pdev)
>  {
>         struct spi_master *master = platform_get_drvdata(pdev);
> +       struct qcom_qspi *ctrl;
> +
> +       ctrl = spi_master_get_devdata(master);
>
>         /* Unregister _before_ disabling pm_runtime() so we stop transfers */
>         spi_unregister_master(master);
>
> +       qcom_qspi_free_dma(ctrl);
> +
>         pm_runtime_disable(&pdev->dev);

To make this the reverse order of probe the qcom_qspi_free_dma() call
should be _after_ the pm_runtime_disable(), although above I'm
suggesting fully getting rid of qcom_qspi_free_dma() so maybe the
point is moot.
Mark Brown April 17, 2023, 12:12 p.m. UTC | #2
On Fri, Apr 14, 2023 at 03:05:58PM -0700, Doug Anderson wrote:

> Having alignment requirements like this doesn't seem like it should be
> that unusual, though, and that's why it feels like the logic belongs
> in the SPI core. In fact, it seems like this is _supposed_ to be
> handled in the SPI core, but it isn't? In "spi.h" I see
> "dma_alignment" that claims to be exactly what you need. As far as I
> can tell, though, the core doesn't use this? ...so I'm kinda confused.
> As far as I can tell this doesn't do anything and thus anyone setting
> it today is broken?

SPI consumers should only be providing dmaable buffers.
Douglas Anderson April 17, 2023, 2:07 p.m. UTC | #3
Hi,

On Mon, Apr 17, 2023 at 5:12 AM Mark Brown <broonie@kernel.org> wrote:
>
> On Fri, Apr 14, 2023 at 03:05:58PM -0700, Doug Anderson wrote:
>
> > Having alignment requirements like this doesn't seem like it should be
> > that unusual, though, and that's why it feels like the logic belongs
> > in the SPI core. In fact, it seems like this is _supposed_ to be
> > handled in the SPI core, but it isn't? In "spi.h" I see
> > "dma_alignment" that claims to be exactly what you need. As far as I
> > can tell, though, the core doesn't use this? ...so I'm kinda confused.
> > As far as I can tell this doesn't do anything and thus anyone setting
> > it today is broken?
>
> SPI consumers should only be providing dmaable buffers.

Ah, I think I see.

1. In "struct spi_transfer" the @tx_buf and @rx_buf are documented to
have "dma-safe memory".

2. On ARM64 anyway, I see "ARCH_DMA_MINALIGN" is 128.

So there is no reason to do any special rules to force alignment to
32-bytes because that's already guaranteed. Presumably that means you
can drop a whole pile of code and things will still work fine.

-Doug
Vijaya Krishna Nivarthi April 17, 2023, 3:57 p.m. UTC | #4
Hi,


On 4/17/2023 7:37 PM, Doug Anderson wrote:
> Hi,
>
> On Mon, Apr 17, 2023 at 5:12 AM Mark Brown <broonie@kernel.org> wrote:
>> On Fri, Apr 14, 2023 at 03:05:58PM -0700, Doug Anderson wrote:
>>
>>> Having alignment requirements like this doesn't seem like it should be
>>> that unusual, though, and that's why it feels like the logic belongs
>>> in the SPI core. In fact, it seems like this is _supposed_ to be
>>> handled in the SPI core, but it isn't? In "spi.h" I see
>>> "dma_alignment" that claims to be exactly what you need. As far as I
>>> can tell, though, the core doesn't use this? ...so I'm kinda confused.
>>> As far as I can tell this doesn't do anything and thus anyone setting
>>> it today is broken?
>> SPI consumers should only be providing dmaable buffers.
> Ah, I think I see.
>
> 1. In "struct spi_transfer" the @tx_buf and @rx_buf are documented to
> have "dma-safe memory".
>
> 2. On ARM64 anyway, I see "ARCH_DMA_MINALIGN" is 128.
>
> So there is no reason to do any special rules to force alignment to
> 32-bytes because that's already guaranteed. Presumably that means you
> can drop a whole pile of code and things will still work fine.
>
> -Doug


Thank you very much Mark and Doug for review and inputs.


spi_map_buf is taking into consideration max_dma_len (in spi.h) when it 
is set.

For example if set to 1024 instead of 65536(the actual max_dma_len of 
HW), 4 entries are created in the sg_table for a buffer of size 4096.

However, Like Doug pointed, dma_alignment is not being used by core.

Some drivers seem to be setting both of these in probe() but 
dma_alignment is probably not taking effect.

Is it up to the SPI consumers to read this and ensure they are providing 
dmaable buffers of required alignment?


The dma_addresses coming from core are aligned for larger sized buffers 
but for small ones like 1 and 3 bytes they are not aligned.

Hence if the code handing the alignment part is not present, smaller 
transfers fail.

For example, Below debug patch does

a) all transfers in DMA

b) prints DMA addresses and lengths

====== patch start ======

diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
index 60c4f554..8d24022 100644
--- a/drivers/spi/spi-qcom-qspi.c
+++ b/drivers/spi/spi-qcom-qspi.c
@@ -438,8 +438,14 @@ static int qcom_qspi_setup_dma_desc(struct 
qcom_qspi *ctrl,
                 return -EINVAL;
         }

-       for (i = 0; i < sgt->nents; i++)
+       for (i = 0; i < sgt->nents; i++) {
+               dma_addr_t temp_addr;
                 sg_total_len += sg_dma_len(sgt->sgl + i);
+
+               temp_addr = sg_dma_address(sgt->sgl + i);
+               dev_err_ratelimited(ctrl->dev, "%s pilli-20230417 i-%d, 
nents-%d, len-%d, dma_address-%pad\n",
+                               __func__, i, sgt->nents, 
sg_dma_len(sgt->sgl + i), &temp_addr);
+       }
         if (sg_total_len != xfer->len) {
                 dev_err(ctrl->dev, "Data lengths mismatch\n");
                 return -EINVAL;
@@ -517,6 +523,7 @@ static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
  static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
                          struct spi_device *slv, struct spi_transfer *xfer)
  {
+       return true;
         return xfer->len > QSPI_MAX_BYTES_FIFO ? true : false;
  }

====== patch end =======

and below is sample outcome...

[   23.620397] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-1, dma_address-0x00000000fff3e000
[   23.638392] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-3, dma_address-0x00000000fff3f001
[   23.650238] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-1, dma_address-0x00000000fff40004
[   23.662039] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-8, dma_address-0x00000000fff44080
[   23.673965] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-1, dma_address-0x00000000fff44000
[   23.685749] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-3, dma_address-0x00000000fff40001
[   23.697630] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-1, dma_address-0x00000000fff3f004
[   23.709552] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-8, dma_address-0x00000000fff3e600
[   23.721460] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-1, dma_address-0x00000000fff3e000
[   23.733257] qcom_qspi 88dc000.spi: qcom_qspi_setup_dma_desc 
pilli-20230417 i-0, nents-1, len-3, dma_address-0x00000000fff3f001

If we use the dma_address from sg table as is, transfers fail.

I have not checked the spi-nor driver, but is it the consumer driver's 
job to ensure required alignment in all cases?

Can you Please comment?

Thank you,

-Vijay/
Mark Brown April 17, 2023, 4:39 p.m. UTC | #5
On Mon, Apr 17, 2023 at 09:27:16PM +0530, Vijaya Krishna Nivarthi wrote:

> However, Like Doug pointed, dma_alignment is not being used by core.

The core will use kmalloc() for any new buffers it allocates, this is
guaranteed to satisfy DMA constraints.

> Is it up to the SPI consumers to read this and ensure they are providing
> dmaable buffers of required alignment?

If they're doing anything fun for allocation.  Or they can just use
kmalloc() themselves.

> The dma_addresses coming from core are aligned for larger sized buffers but
> for small ones like 1 and 3 bytes they are not aligned.

In theory even buffers that small should be DMA safe, in practice that
rarely matters since it's vanishingly rare that it's sensible to DMA
such tiny buffers rather than PIOing them so drivers will tend to never
actually try to do so and I'd expect bugs.  It is likely worth checking
that DMA makes sense for this hardware.

> I have not checked the spi-nor driver, but is it the consumer driver's job
> to ensure required alignment in all cases?

Yes.
Vijaya Krishna Nivarthi April 20, 2023, 1:11 p.m. UTC | #6
Hi,

Thank you very much for the review...

Uploaded v4 with below...


On 4/15/2023 3:35 AM, Doug Anderson wrote:
> Hi,
>
> On Fri, Apr 14, 2023 at 7:06 AM Vijaya Krishna Nivarthi
> <quic_vnivarth@quicinc.com> wrote:
>> @@ -14,7 +16,6 @@
>>   #include <linux/spi/spi.h>
>>   #include <linux/spi/spi-mem.h>
>>
>> -
> Drop unrelated whitespace change.


Done

>
>
>> @@ -108,6 +110,10 @@
>>   #define RD_FIFO_RESET          0x0030
>>   #define RESET_FIFO             BIT(0)
>>
>> +#define NEXT_DMA_DESC_ADDR             0x0040
>> +#define CURRENT_DMA_DESC_ADDR  0x0044
>> +#define CURRENT_MEM_ADDR               0x0048
> Looking at the above with a correctly configured editor (tab size=8)
> the numbers don't line up. The first and 3rd have one too many tabs.
>

Corrected

>> @@ -120,6 +126,27 @@ enum qspi_dir {
>>          QSPI_WRITE,
>>   };
>>
>> +struct qspi_cmd_desc {
>> +       u32 data_address;
>> +       u32 next_descriptor;
>> +       u32 direction:1;
>> +       u32 multi_io_mode:3;
>> +       u32 reserved1:4;
>> +       u32 fragment:1;
>> +       u32 reserved2:7;
>> +       u32 length:16;
>> +       /*
>> +        * This marks end of HW cmd descriptor
>> +        * Fields down below are for SW usage to
>> +        * copy data from DMA buffer to rx buffer
>> +        */
>> +       u8 *bounce_src;
>> +       u8 *bounce_dst;
>> +       u32 bounce_length;
>> +};
>> +
>> +#define QSPI_MAX_NUM_DESC 5
> Nothing uses QSPI_MAX_NUM_DESC. Drop it.


Done

>
>
>> @@ -137,11 +164,36 @@ enum qspi_clocks {
>>          QSPI_NUM_CLKS
>>   };
>>
>> +enum qspi_xfer_mode {
>> +       QSPI_INVALID,
>> +       QSPI_FIFO,
>> +       QSPI_DMA
>> +};
> Why bother with INVALID? It's either FIFO or DMA, right?


Dropped

>
>
>> +/* number of entries in sgt returned from spi framework that we can support */
>> +#define QSPI_MAX_SG 5
> Is the above a hardware limitation, or just because you are statically
> allocating arrays? Please clarify in the comment. Would it make sense
> to just dynamically allocate the arrays and remove the need for this
> limitation?


Added comment.

Dynamic allocation may not be required here as we dont expect too many 
cmd descriptors to be necessary.

>
>> +/* 3 descriptors for head, aligned part and tail */
>> +#define QSPI_NUM_CMD_DESC 3
>> +
>> +/* 2 descriptors for head, tail */
>> +#define QSPI_NUM_DAT_DESC 2
>> +
>>   struct qcom_qspi {
>>          void __iomem *base;
>>          struct device *dev;
>>          struct clk_bulk_data *clks;
>>          struct qspi_xfer xfer;
>> +       struct dma_pool *dma_cmd_pool;
>> +       struct dma_pool *dma_data_pool;
>> +       dma_addr_t dma_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
>> +       dma_addr_t dma_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
>> +       void *virt_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
>> +       void *virt_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
>> +       unsigned int n_cmd_desc;
>> +       unsigned int n_data_desc;
>> +       int xfer_mode;
> Instead of "int", shouldn't xfer_mode be "enum qspi_xfer_mode"?
> Although below I'm proposing that xfer_mode can just be completely
> dropped from this structure.
>

Done

>> @@ -151,18 +203,25 @@ struct qcom_qspi {
>>   static u32 qspi_buswidth_to_iomode(struct qcom_qspi *ctrl,
>>                                     unsigned int buswidth)
>>   {
>> +       u32 ret;
>> +
>> +       /* for DMA we don't write to PIO_XFER_CFG register, so don't shift */
>>          switch (buswidth) {
>>          case 1:
>> -               return SDR_1BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
>> +               break;
>>          case 2:
>> -               return SDR_2BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_2BIT : SDR_2BIT << MULTI_IO_MODE_SHFT);
>> +               break;
>>          case 4:
>> -               return SDR_4BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_4BIT : SDR_4BIT << MULTI_IO_MODE_SHFT);
>> +               break;
>>          default:
>>                  dev_warn_once(ctrl->dev,
>>                                  "Unexpected bus width: %u\n", buswidth);
>> -               return SDR_1BIT << MULTI_IO_MODE_SHFT;
>> +               ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
>>          }
>> +       return ret;
> Wouldn't it be easier to do the test once at the end? In other words,
> in the switch statement "ret" never contains the shift and then at the
> end:
>
> if (ctrl->xfer_mode != QSPI_DMA)
>    ret <<= MULTI_IO_MODE_SHFT;
> return ret;
>
> ...or, even better, just always return the unshifted mode and do the
> shifting unconditionally in qcom_qspi_pio_xfer_cfg(). Then you never
> need to look at xfer_mode to decide.
>

Agree, Done

>> @@ -241,12 +316,16 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>>                  return ret;
>>          }
>>
>> +       avg_bw_cpu = Bps_to_icc(speed_hz);
>>          /*
>> -        * Set BW quota for CPU as driver supports FIFO mode only.
>> -        * We don't have explicit peak requirement so keep it equal to avg_bw.
>> +        * Set BW quota for CPU for FIFO to avg_bw
>> +        * as we don't have explicit peak requirement.
>> +        * TBD TBD TBD - change this as required for DMA.
>> +        * As of now same peak requirement seems to be working.
>>           */
>> -       avg_bw_cpu = Bps_to_icc(speed_hz);
>> -       ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, avg_bw_cpu);
>> +       peak_bw_cpu = ctrl->xfer_mode == QSPI_FIFO ? avg_bw_cpu : avg_bw_cpu;
>> +
>> +       ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, peak_bw_cpu);
> The change to this function is completely a no-op, right? You check
> the mode against QSPI_FIFO but you set the "peak_bw_cpu" to the same
> thing in both modes. ...and the thing you set it to is exactly the
> same as the function set it to before your patch.
>
> ...so drop all the changes you made to this function.
>

Dropped

>> @@ -258,6 +337,190 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>>          return 0;
>>   }
>>
>> +/* aligned to 32 bytes, not to cross 1KB boundary */
>> +#define QSPI_ALIGN_REQ         32
>> +#define QSPI_BOUNDARY_REQ      1024
>> +
>> +static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
>> +                       dma_addr_t dma_ptr, uint32_t n_bytes)
> Why is "n_bytes" "uint32_t" instead of just "u32"? Please just use
> "u32" consistently in this file.
>

Done

>
>> +               }
>> +       }
>> +       return 0;
>> +
>> +cleanup:
>> +       dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");
>> +       for (i = 0; i < ctrl->n_data_desc; i++)
>> +               dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
>> +                                 ctrl->dma_data_desc[i]);
>> +       ctrl->n_data_desc = 0;
>> +
>> +       for (i = 0; i < ctrl->n_cmd_desc; i++)
>> +               dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
>> +                                 ctrl->dma_cmd_desc[i]);
>> +       ctrl->n_cmd_desc = 0;
>> +       return ret;
>> +}
>> +
>> +static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
>> +{
>> +       /* Ack any previous interrupts that might be hanging around */
>> +       writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_STATUS);
> Do we really need the above? Maybe we can drop it and (in a separate
> patch) drop the similar statement in qcom_qspi_pio_xfer()?
>
> If this is truly needed for some reason, then it seems like in both
> cases you should ack _all_ interrupts (the FIFO plus the DMA ones)
> since we might be switching back and forth between the two modes and
> thus any old interrupts that are "hanging around" could be from
> either. ...but I think you can just drop it. If there are really any
> interrupts "hanging around" we're in pretty bad shape.
>

Dropped

>> +       /* Setup new interrupts */
>> +       writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_EN);
>> +
>> +       /* flush all writes */
>> +       wmb();
> Why do you need this explicit wmb()? I'm fairly sure that this is
> handled automatically by the fact that you're using writel() and not
> writel_relaxed(). writel() documents that it is "ordered relative to
> any prior Normal memory access" and certainly it's ordered relative to
> IO writes to the same device.
>

Dropped

>> +
>> +       /* kick off transfer */
>> +       writel((uint32_t)(uintptr_t)(ctrl->dma_cmd_desc)[0], ctrl->base + NEXT_DMA_DESC_ADDR);
> It feels like there's one too many casts here. Shouldn't this just be
> "(u32)(ctrl->dma_cmd_desc[0])"?


Changed

>
>> +}
>> +
>> +/* Switch to DMA if transfer length exceeds this */
>> +#define QSPI_MAX_BYTES_FIFO 64
>> +
>> +static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
>> +                        struct spi_device *slv, struct spi_transfer *xfer)
>> +{
>> +       return xfer->len > QSPI_MAX_BYTES_FIFO ? true : false;
> No need for the "? true : false". Just:
>
> return xfer->len > QSPI_MAX_BYTES_FIFO;
>

Correct. Done.

>> @@ -290,7 +555,25 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
>>          ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
>>                                            &master->cur_msg->transfers);
>>          ctrl->xfer.rem_bytes = xfer->len;
>> -       qcom_qspi_pio_xfer(ctrl);
>> +
>> +       if (qcom_qspi_can_dma(master, slv, xfer)) {
> Maybe it would be better to check if either "xfer->rx_sg.nents" or
> "xfer->tx_sg.nents" is non-zero. That indicates that the SPI framework
> is expecting you to do DMA.
>

Yes, changed.

>> +               ctrl->xfer_mode = QSPI_DMA;
>> +               ctrl->iomode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);
> Don't store iomode in the "ctrl" structure (remove it from that
> struct). Just make it a local variable in qcom_qspi_setup_dma_desc()
> and then pass it in to the one place that needs it:
> qcom_qspi_alloc_desc()
>

Done

>> +               mstr_cfg |= DMA_ENABLE;
>> +               writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
> nit: I seem to remember IO writes to the controller taking a
> non-trivial amount of time. Maybe worth it to do?
>
> if (!(mstr_cfg & DMA_ENABLE)) {
>    mstr_cfg |= DMA_ENABLE;
>    writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
> }
>
> ...similar for the "else" case below.
>

Done

>> @@ -328,6 +611,40 @@ static int qcom_qspi_prepare_message(struct spi_master *master,
>>          return 0;
>>   }
>>
>> +static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
>> +{
>> +       /* allocate for cmd descriptors pool */
>> +       ctrl->dma_cmd_pool = dma_pool_create("qspi cmd desc pool",
>> +               ctrl->dev, sizeof(struct qspi_cmd_desc), 0, 0);
> Instead of dma_pool_create(), use dmam_pool_create(). That looks to be
> the (oddly named) devm version of the function. Then you can fully get
> rid of qcom_qspi_free_dma() and also the dma_pool_destroy() in your
> error handling below.
>
> It also seems really weird that the "data" pool has such strict
> alignment requirements and you do a whole ton of work to meet those
> requirements, but the "cmd" pool has no alignment requirements at all.
> Is this really correct?
>

Used dmam version.

Yes only the data addresses need to be aligned; there is no constraint 
for cmd descriptors.

>> +       if (!ctrl->dma_cmd_pool) {
>> +               dev_err(ctrl->dev, "Could not create dma pool for cmd_desc\n");
> nit: no need for an error message here. You can assume that allocation
> failures will already print a warning splat and you don't need another
> one for this incredibly unlikely event.
>

Dropped

>> @@ -426,27 +743,63 @@ static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
>>          int_status = readl(ctrl->base + MSTR_INT_STATUS);
>>          writel(int_status, ctrl->base + MSTR_INT_STATUS);
>>
>> -       if (ctrl->xfer.dir == QSPI_WRITE) {
>> -               if (int_status & WR_FIFO_EMPTY)
>> -                       ret = pio_write(ctrl);
>> -       } else {
>> -               if (int_status & RESP_FIFO_RDY)
>> -                       ret = pio_read(ctrl);
>> -       }
>> -
>> -       if (int_status & QSPI_ERR_IRQS) {
>> -               if (int_status & RESP_FIFO_UNDERRUN)
>> -                       dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
>> -               if (int_status & WR_FIFO_OVERRUN)
>> -                       dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
>> -               if (int_status & HRESP_FROM_NOC_ERR)
>> -                       dev_err(ctrl->dev, "IRQ error: NOC response error\n");
>> -               ret = IRQ_HANDLED;
>> -       }
>> -
>> -       if (!ctrl->xfer.rem_bytes) {
>> -               writel(0, ctrl->base + MSTR_INT_EN);
>> -               spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
>> +       switch (ctrl->xfer_mode) {
>> +       case QSPI_FIFO:
> I don't think you really need to check xfer_mode here, do you? If
> xfer_mode is FIFO then only the FIFO-related interrupts are enabled.
> If xfer_mode is DMA then only the DMA-related interrupts are enabled.
>
> In fact, I think you can fully get rid of the "xfer_mode" structure
> member completely. It's completely redundant.
>

I think the xfer_mode is required to conditionally cleanup in 
handle_err() and isr()

I have retained it for now; it also seems to help keep isr() more readable.

>> @@ -487,6 +840,9 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>>          if (ret)
>>                  return ret;
>>
>> +       /* set default mode to FIFO */
>> +       ctrl->xfer_mode = QSPI_FIFO;
> Get rid of this initialization. Above I'm suggesting getting rid of
> "xfter" mode altogether, but in any case, we should be setting this
> before each transfer so the initialization doesn't do anything useful.
>

Done

>> @@ -556,10 +923,15 @@ static int qcom_qspi_probe(struct platform_device *pdev)
>>   static void qcom_qspi_remove(struct platform_device *pdev)
>>   {
>>          struct spi_master *master = platform_get_drvdata(pdev);
>> +       struct qcom_qspi *ctrl;
>> +
>> +       ctrl = spi_master_get_devdata(master);
>>
>>          /* Unregister _before_ disabling pm_runtime() so we stop transfers */
>>          spi_unregister_master(master);
>>
>> +       qcom_qspi_free_dma(ctrl);
>> +
>>          pm_runtime_disable(&pdev->dev);
> To make this the reverse order of probe the qcom_qspi_free_dma() call
> should be _after_ the pm_runtime_disable(), although above I'm
> suggesting fully getting rid of qcom_qspi_free_dma() so maybe the
> point is moot.
diff mbox series

Patch

diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
index fab1553..cfd92a3 100644
--- a/drivers/spi/spi-qcom-qspi.c
+++ b/drivers/spi/spi-qcom-qspi.c
@@ -2,6 +2,8 @@ 
 // Copyright (c) 2017-2018, The Linux foundation. All rights reserved.
 
 #include <linux/clk.h>
+#include <linux/dmapool.h>
+#include <linux/dma-mapping.h>
 #include <linux/interconnect.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
@@ -14,7 +16,6 @@ 
 #include <linux/spi/spi.h>
 #include <linux/spi/spi-mem.h>
 
-
 #define QSPI_NUM_CS		2
 #define QSPI_BYTES_PER_WORD	4
 
@@ -62,6 +63,7 @@ 
 #define WR_FIFO_FULL		BIT(10)
 #define WR_FIFO_OVERRUN		BIT(11)
 #define TRANSACTION_DONE	BIT(16)
+#define DMA_CHAIN_DONE		BIT(31)
 #define QSPI_ERR_IRQS		(RESP_FIFO_UNDERRUN | HRESP_FROM_NOC_ERR | \
 				 WR_FIFO_OVERRUN)
 #define QSPI_ALL_IRQS		(QSPI_ERR_IRQS | RESP_FIFO_RDY | \
@@ -108,6 +110,10 @@ 
 #define RD_FIFO_RESET		0x0030
 #define RESET_FIFO		BIT(0)
 
+#define NEXT_DMA_DESC_ADDR		0x0040
+#define CURRENT_DMA_DESC_ADDR	0x0044
+#define CURRENT_MEM_ADDR		0x0048
+
 #define CUR_MEM_ADDR		0x0048
 #define HW_VERSION		0x004c
 #define RD_FIFO			0x0050
@@ -120,6 +126,27 @@  enum qspi_dir {
 	QSPI_WRITE,
 };
 
+struct qspi_cmd_desc {
+	u32 data_address;
+	u32 next_descriptor;
+	u32 direction:1;
+	u32 multi_io_mode:3;
+	u32 reserved1:4;
+	u32 fragment:1;
+	u32 reserved2:7;
+	u32 length:16;
+	/*
+	 * This marks end of HW cmd descriptor
+	 * Fields down below are for SW usage to
+	 * copy data from DMA buffer to rx buffer
+	 */
+	u8 *bounce_src;
+	u8 *bounce_dst;
+	u32 bounce_length;
+};
+
+#define QSPI_MAX_NUM_DESC 5
+
 struct qspi_xfer {
 	union {
 		const void *tx_buf;
@@ -137,11 +164,36 @@  enum qspi_clocks {
 	QSPI_NUM_CLKS
 };
 
+enum qspi_xfer_mode {
+	QSPI_INVALID,
+	QSPI_FIFO,
+	QSPI_DMA
+};
+
+/* number of entries in sgt returned from spi framework that we can support */
+#define QSPI_MAX_SG 5
+
+/* 3 descriptors for head, aligned part and tail */
+#define QSPI_NUM_CMD_DESC 3
+
+/* 2 descriptors for head, tail */
+#define QSPI_NUM_DAT_DESC 2
+
 struct qcom_qspi {
 	void __iomem *base;
 	struct device *dev;
 	struct clk_bulk_data *clks;
 	struct qspi_xfer xfer;
+	struct dma_pool *dma_cmd_pool;
+	struct dma_pool *dma_data_pool;
+	dma_addr_t dma_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
+	dma_addr_t dma_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
+	void *virt_cmd_desc[QSPI_NUM_CMD_DESC*QSPI_MAX_SG];
+	void *virt_data_desc[QSPI_NUM_DAT_DESC*QSPI_MAX_SG];
+	unsigned int n_cmd_desc;
+	unsigned int n_data_desc;
+	int xfer_mode;
+	u32 iomode;
 	struct icc_path *icc_path_cpu_to_qspi;
 	unsigned long last_speed;
 	/* Lock to protect data accessed by IRQs */
@@ -151,18 +203,25 @@  struct qcom_qspi {
 static u32 qspi_buswidth_to_iomode(struct qcom_qspi *ctrl,
 				   unsigned int buswidth)
 {
+	u32 ret;
+
+	/* for DMA we don't write to PIO_XFER_CFG register, so don't shift */
 	switch (buswidth) {
 	case 1:
-		return SDR_1BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
+		break;
 	case 2:
-		return SDR_2BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_2BIT : SDR_2BIT << MULTI_IO_MODE_SHFT);
+		break;
 	case 4:
-		return SDR_4BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_4BIT : SDR_4BIT << MULTI_IO_MODE_SHFT);
+		break;
 	default:
 		dev_warn_once(ctrl->dev,
 				"Unexpected bus width: %u\n", buswidth);
-		return SDR_1BIT << MULTI_IO_MODE_SHFT;
+		ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
 	}
+	return ret;
 }
 
 static void qcom_qspi_pio_xfer_cfg(struct qcom_qspi *ctrl)
@@ -223,6 +282,21 @@  static void qcom_qspi_handle_err(struct spi_master *master,
 	spin_lock_irqsave(&ctrl->lock, flags);
 	writel(0, ctrl->base + MSTR_INT_EN);
 	ctrl->xfer.rem_bytes = 0;
+
+	if (ctrl->xfer_mode == QSPI_DMA) {
+		int i;
+
+		/* free cmd and data descriptors */
+		for (i = 0; i < ctrl->n_data_desc; i++)
+			dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
+					  ctrl->dma_data_desc[i]);
+		ctrl->n_data_desc = 0;
+
+		for (i = 0; i < ctrl->n_cmd_desc; i++)
+			dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+					  ctrl->dma_cmd_desc[i]);
+		ctrl->n_cmd_desc = 0;
+	}
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 }
 
@@ -230,6 +304,7 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 {
 	int ret;
 	unsigned int avg_bw_cpu;
+	unsigned int peak_bw_cpu;
 
 	if (speed_hz == ctrl->last_speed)
 		return 0;
@@ -241,12 +316,16 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 		return ret;
 	}
 
+	avg_bw_cpu = Bps_to_icc(speed_hz);
 	/*
-	 * Set BW quota for CPU as driver supports FIFO mode only.
-	 * We don't have explicit peak requirement so keep it equal to avg_bw.
+	 * Set BW quota for CPU for FIFO to avg_bw
+	 * as we don't have explicit peak requirement.
+	 * TBD TBD TBD - change this as required for DMA.
+	 * As of now same peak requirement seems to be working.
 	 */
-	avg_bw_cpu = Bps_to_icc(speed_hz);
-	ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, avg_bw_cpu);
+	peak_bw_cpu = ctrl->xfer_mode == QSPI_FIFO ? avg_bw_cpu : avg_bw_cpu;
+
+	ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, peak_bw_cpu);
 	if (ret) {
 		dev_err(ctrl->dev, "%s: ICC BW voting failed for cpu: %d\n",
 			__func__, ret);
@@ -258,6 +337,190 @@  static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
 	return 0;
 }
 
+/* aligned to 32 bytes, not to cross 1KB boundary */
+#define QSPI_ALIGN_REQ		32
+#define QSPI_BOUNDARY_REQ	1024
+
+static int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
+			dma_addr_t dma_ptr, uint32_t n_bytes)
+{
+	struct qspi_cmd_desc *virt_cmd_desc, *prev;
+	uint8_t *virt_data_desc;
+	dma_addr_t dma_cmd_desc, dma_data_desc;
+
+	if (virt_ptr && n_bytes >= QSPI_ALIGN_REQ) {
+		dev_err(ctrl->dev,
+			"Exiting to avert memory overwrite, n_bytes-%d\n", n_bytes);
+		return -ENOMEM;
+	}
+
+	/* allocate for dma cmd descriptor */
+	virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
+		GFP_KERNEL, &dma_cmd_desc);
+	if (!virt_cmd_desc) {
+		dev_err(ctrl->dev,
+			"Could not allocate for cmd_desc\n");
+		return -ENOMEM;
+	}
+	ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
+	ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
+	ctrl->n_cmd_desc++;
+
+	/* allocate for dma data descriptor if unaligned else use pre-aligned */
+	if (virt_ptr) {
+		virt_data_desc = (uint8_t *)dma_pool_zalloc(ctrl->dma_data_pool,
+			GFP_KERNEL, &dma_data_desc);
+		if (!virt_data_desc) {
+			dev_err(ctrl->dev,
+				"Could not allocate for data_desc\n");
+			return -ENOMEM;
+		}
+		ctrl->virt_data_desc[ctrl->n_data_desc] = virt_data_desc;
+		ctrl->dma_data_desc[ctrl->n_data_desc] = dma_data_desc;
+		ctrl->n_data_desc++;
+
+		/*
+		 * for tx copy xfer data into allocated buffer
+		 * for rx setup bounce info to copy after xfer
+		 */
+		if (ctrl->xfer.dir == QSPI_WRITE) {
+			memcpy(virt_data_desc, virt_ptr, n_bytes);
+		} else {
+			virt_cmd_desc->bounce_src = virt_data_desc;
+			virt_cmd_desc->bounce_dst = virt_ptr;
+			virt_cmd_desc->bounce_length = n_bytes;
+		}
+	} else {
+		dma_data_desc = dma_ptr;
+	}
+
+	/* setup cmd descriptor */
+	virt_cmd_desc->data_address = dma_data_desc;
+	virt_cmd_desc->next_descriptor = 0;
+	virt_cmd_desc->direction = ctrl->xfer.dir;
+	virt_cmd_desc->multi_io_mode = ctrl->iomode;
+	virt_cmd_desc->reserved1 = 0;
+	virt_cmd_desc->fragment = ctrl->xfer.is_last ? 0 : 1;
+	virt_cmd_desc->reserved2 = 0;
+	virt_cmd_desc->length = n_bytes;
+
+	/* update previous descriptor */
+	if (ctrl->n_cmd_desc >= 2) {
+		prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
+		prev->next_descriptor = dma_cmd_desc;
+		prev->fragment = 1;
+	}
+
+	return 0;
+}
+
+static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
+				struct spi_transfer *xfer)
+{
+	int ret;
+	struct sg_table *sgt;
+	unsigned int sg_total_len = 0;
+	dma_addr_t dma_ptr_sg;
+	unsigned int dma_len_sg;
+	uint32_t prolog_bytes, aligned_bytes, epilog_bytes;
+	dma_addr_t aligned_addr;
+	int i;
+	uint8_t *byte_ptr;
+
+	if (ctrl->n_cmd_desc || ctrl->n_data_desc) {
+		dev_err(ctrl->dev, "Remnant dma buffers cmd-%d, data-%d\n",
+			ctrl->n_cmd_desc, ctrl->n_data_desc);
+		return -EIO;
+	}
+
+	sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
+	if (!sgt->nents || sgt->nents > QSPI_MAX_SG) {
+		dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < sgt->nents; i++)
+		sg_total_len += sg_dma_len(sgt->sgl + i);
+	if (sg_total_len != xfer->len) {
+		dev_err(ctrl->dev, "Data lengths mismatch\n");
+		return -EINVAL;
+	}
+
+	if (ctrl->xfer.dir == QSPI_READ)
+		byte_ptr = xfer->rx_buf;
+	else
+		byte_ptr = (uint8_t *)xfer->tx_buf;
+
+	for (i = 0; i < sgt->nents; i++) {
+		dma_ptr_sg = sg_dma_address(sgt->sgl + i);
+		dma_len_sg = sg_dma_len(sgt->sgl + i);
+
+		aligned_addr = PTR_ALIGN(dma_ptr_sg, QSPI_ALIGN_REQ);
+
+		prolog_bytes = min(dma_len_sg, (unsigned int)(aligned_addr - dma_ptr_sg));
+		if (prolog_bytes) {
+			ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, prolog_bytes);
+			if (ret)
+				goto cleanup;
+			byte_ptr += prolog_bytes;
+		}
+
+		aligned_bytes = PTR_ALIGN_DOWN(dma_len_sg - prolog_bytes, QSPI_ALIGN_REQ);
+		if (aligned_bytes) {
+			ret = qcom_qspi_alloc_desc(ctrl, 0, aligned_addr, aligned_bytes);
+			if (ret)
+				goto cleanup;
+			byte_ptr += aligned_bytes;
+		}
+
+		epilog_bytes = dma_len_sg - prolog_bytes - aligned_bytes;
+		if (epilog_bytes) {
+			ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, epilog_bytes);
+			if (ret)
+				goto cleanup;
+			byte_ptr += epilog_bytes;
+		}
+	}
+	return 0;
+
+cleanup:
+	dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");
+	for (i = 0; i < ctrl->n_data_desc; i++)
+		dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
+				  ctrl->dma_data_desc[i]);
+	ctrl->n_data_desc = 0;
+
+	for (i = 0; i < ctrl->n_cmd_desc; i++)
+		dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+				  ctrl->dma_cmd_desc[i]);
+	ctrl->n_cmd_desc = 0;
+	return ret;
+}
+
+static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
+{
+	/* Ack any previous interrupts that might be hanging around */
+	writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_STATUS);
+
+	/* Setup new interrupts */
+	writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_EN);
+
+	/* flush all writes */
+	wmb();
+
+	/* kick off transfer */
+	writel((uint32_t)(uintptr_t)(ctrl->dma_cmd_desc)[0], ctrl->base + NEXT_DMA_DESC_ADDR);
+}
+
+/* Switch to DMA if transfer length exceeds this */
+#define QSPI_MAX_BYTES_FIFO 64
+
+static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
+			 struct spi_device *slv, struct spi_transfer *xfer)
+{
+	return xfer->len > QSPI_MAX_BYTES_FIFO ? true : false;
+}
+
 static int qcom_qspi_transfer_one(struct spi_master *master,
 				  struct spi_device *slv,
 				  struct spi_transfer *xfer)
@@ -266,6 +529,7 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 	int ret;
 	unsigned long speed_hz;
 	unsigned long flags;
+	u32 mstr_cfg;
 
 	speed_hz = slv->max_speed_hz;
 	if (xfer->speed_hz)
@@ -276,6 +540,7 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 		return ret;
 
 	spin_lock_irqsave(&ctrl->lock, flags);
+	mstr_cfg = readl(ctrl->base + MSTR_CONFIG);
 
 	/* We are half duplex, so either rx or tx will be set */
 	if (xfer->rx_buf) {
@@ -290,7 +555,25 @@  static int qcom_qspi_transfer_one(struct spi_master *master,
 	ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
 					  &master->cur_msg->transfers);
 	ctrl->xfer.rem_bytes = xfer->len;
-	qcom_qspi_pio_xfer(ctrl);
+
+	if (qcom_qspi_can_dma(master, slv, xfer)) {
+		ctrl->xfer_mode = QSPI_DMA;
+		ctrl->iomode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);
+		mstr_cfg |= DMA_ENABLE;
+		writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+
+		ret = qcom_qspi_setup_dma_desc(ctrl, xfer);
+		if (ret) {
+			spin_unlock_irqrestore(&ctrl->lock, flags);
+			return ret;
+		}
+		qcom_qspi_dma_xfer(ctrl);
+	} else {
+		ctrl->xfer_mode = QSPI_FIFO;
+		mstr_cfg &= ~DMA_ENABLE;
+		writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+		qcom_qspi_pio_xfer(ctrl);
+	}
 
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 
@@ -328,6 +611,40 @@  static int qcom_qspi_prepare_message(struct spi_master *master,
 	return 0;
 }
 
+static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
+{
+	/* allocate for cmd descriptors pool */
+	ctrl->dma_cmd_pool = dma_pool_create("qspi cmd desc pool",
+		ctrl->dev, sizeof(struct qspi_cmd_desc), 0, 0);
+	if (!ctrl->dma_cmd_pool) {
+		dev_err(ctrl->dev, "Could not create dma pool for cmd_desc\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * allocate for data descriptors pool as per alignment
+	 * and boundary requirements
+	 */
+	ctrl->dma_data_pool = dma_pool_create("qspi data desc pool",
+		ctrl->dev, QSPI_ALIGN_REQ, QSPI_ALIGN_REQ, QSPI_BOUNDARY_REQ);
+	if (!ctrl->dma_data_pool) {
+		dev_err(ctrl->dev, "Could not create dma pool for data desc\n");
+		dma_pool_destroy(ctrl->dma_cmd_pool);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void qcom_qspi_free_dma(struct qcom_qspi *ctrl)
+{
+	/* free pool buffers */
+	dma_pool_destroy(ctrl->dma_data_pool);
+
+	/* free pool */
+	dma_pool_destroy(ctrl->dma_cmd_pool);
+}
+
 static irqreturn_t pio_read(struct qcom_qspi *ctrl)
 {
 	u32 rd_fifo_status;
@@ -426,27 +743,63 @@  static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
 	int_status = readl(ctrl->base + MSTR_INT_STATUS);
 	writel(int_status, ctrl->base + MSTR_INT_STATUS);
 
-	if (ctrl->xfer.dir == QSPI_WRITE) {
-		if (int_status & WR_FIFO_EMPTY)
-			ret = pio_write(ctrl);
-	} else {
-		if (int_status & RESP_FIFO_RDY)
-			ret = pio_read(ctrl);
-	}
-
-	if (int_status & QSPI_ERR_IRQS) {
-		if (int_status & RESP_FIFO_UNDERRUN)
-			dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
-		if (int_status & WR_FIFO_OVERRUN)
-			dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
-		if (int_status & HRESP_FROM_NOC_ERR)
-			dev_err(ctrl->dev, "IRQ error: NOC response error\n");
-		ret = IRQ_HANDLED;
-	}
-
-	if (!ctrl->xfer.rem_bytes) {
-		writel(0, ctrl->base + MSTR_INT_EN);
-		spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+	switch (ctrl->xfer_mode) {
+	case QSPI_FIFO:
+		if (ctrl->xfer.dir == QSPI_WRITE) {
+			if (int_status & WR_FIFO_EMPTY)
+				ret = pio_write(ctrl);
+		} else {
+			if (int_status & RESP_FIFO_RDY)
+				ret = pio_read(ctrl);
+		}
+
+		if (int_status & QSPI_ERR_IRQS) {
+			if (int_status & RESP_FIFO_UNDERRUN)
+				dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
+			if (int_status & WR_FIFO_OVERRUN)
+				dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
+			if (int_status & HRESP_FROM_NOC_ERR)
+				dev_err(ctrl->dev, "IRQ error: NOC response error\n");
+			ret = IRQ_HANDLED;
+		}
+
+		if (!ctrl->xfer.rem_bytes) {
+			writel(0, ctrl->base + MSTR_INT_EN);
+			spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+		}
+		break;
+	case QSPI_DMA:
+		if (int_status & DMA_CHAIN_DONE) {
+			int i;
+
+			writel(0, ctrl->base + MSTR_INT_EN);
+
+			if (ctrl->xfer.dir == QSPI_READ) {
+				struct qspi_cmd_desc *cmd_desc;
+
+				for (i = 0; i < ctrl->n_cmd_desc; i++) {
+					cmd_desc = (struct qspi_cmd_desc *)ctrl->virt_cmd_desc[i];
+					memcpy(cmd_desc->bounce_dst,
+						cmd_desc->bounce_src, cmd_desc->bounce_length);
+				}
+			}
+
+			for (i = 0; i < ctrl->n_data_desc; i++)
+				dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[i],
+						  ctrl->dma_data_desc[i]);
+			ctrl->n_data_desc = 0;
+
+			for (i = 0; i < ctrl->n_cmd_desc; i++)
+				dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[i],
+						  ctrl->dma_cmd_desc[i]);
+			ctrl->n_cmd_desc = 0;
+
+			ret = IRQ_HANDLED;
+			spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+		}
+		break;
+	default:
+		dev_err(ctrl->dev, "Unknown xfer mode:%d", ctrl->xfer_mode);
 	}
 
 	spin_unlock(&ctrl->lock);
@@ -487,6 +840,9 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	/* set default mode to FIFO */
+	ctrl->xfer_mode = QSPI_FIFO;
+
 	ctrl->icc_path_cpu_to_qspi = devm_of_icc_get(dev, "qspi-config");
 	if (IS_ERR(ctrl->icc_path_cpu_to_qspi))
 		return dev_err_probe(dev, PTR_ERR(ctrl->icc_path_cpu_to_qspi),
@@ -517,7 +873,12 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+	if (ret)
+		return dev_err_probe(dev, ret, "could not set DMA mask\n");
+
 	master->max_speed_hz = 300000000;
+	master->max_dma_len = 65536; /* as per HPG */
 	master->num_chipselect = QSPI_NUM_CS;
 	master->bus_num = -1;
 	master->dev.of_node = pdev->dev.of_node;
@@ -528,6 +889,7 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 	master->prepare_message = qcom_qspi_prepare_message;
 	master->transfer_one = qcom_qspi_transfer_one;
 	master->handle_err = qcom_qspi_handle_err;
+	master->can_dma = qcom_qspi_can_dma;
 	master->auto_runtime_pm = true;
 
 	ret = devm_pm_opp_set_clkname(&pdev->dev, "core");
@@ -540,6 +902,11 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	/* allocate for DMA descriptor pools */
+	ret = qcom_qspi_alloc_dma(ctrl);
+	if (ret)
+		return ret;
+
 	pm_runtime_use_autosuspend(dev);
 	pm_runtime_set_autosuspend_delay(dev, 250);
 	pm_runtime_enable(dev);
@@ -556,10 +923,15 @@  static int qcom_qspi_probe(struct platform_device *pdev)
 static void qcom_qspi_remove(struct platform_device *pdev)
 {
 	struct spi_master *master = platform_get_drvdata(pdev);
+	struct qcom_qspi *ctrl;
+
+	ctrl = spi_master_get_devdata(master);
 
 	/* Unregister _before_ disabling pm_runtime() so we stop transfers */
 	spi_unregister_master(master);
 
+	qcom_qspi_free_dma(ctrl);
+
 	pm_runtime_disable(&pdev->dev);
 }