Message ID | 20191206160007.331801-4-jean.pihet@newoldbits.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | TI QSPI: Add support for large flash devices | expand |
On Fri, Dec 06, 2019 at 05:00:07PM +0100, Jean Pihet wrote: > By reading the 32 bits data register and copy the contents to the > receive buffer, according to the single/dual/quad read mode and > the data length to read. > > The speed improvement is 3.5x using quad read. > --- This is missing a Signed-off-by.
On 06/12/19 9:30 pm, Jean Pihet wrote: > By reading the 32 bits data register and copy the contents to the > receive buffer, according to the single/dual/quad read mode and > the data length to read. > > The speed improvement is 3.5x using quad read. > --- > drivers/spi/spi-ti-qspi.c | 48 ++++++++++++++++++++++++++------------- > 1 file changed, 32 insertions(+), 16 deletions(-) > > diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c > index 13916232a959..65ec3bcb25ae 100644 > --- a/drivers/spi/spi-ti-qspi.c > +++ b/drivers/spi/spi-ti-qspi.c > @@ -313,24 +313,25 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, > static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > int count) > { > - int wlen; > unsigned int cmd; > + u32 rx; > u8 *rxbuf; > + u8 xfer_len; > > rxbuf = t->rx_buf; > cmd = qspi->cmd; > + /* Optimize the transfers for dual and quad read */ > switch (t->rx_nbits) { > - case SPI_NBITS_DUAL: > - cmd |= QSPI_RD_DUAL; > - break; > case SPI_NBITS_QUAD: > - cmd |= QSPI_RD_QUAD; > + cmd |= QSPI_RD_QUAD | QSPI_WLEN(32); Why does QUAD mode mean 32 bit words? IO mode and word size are independent of each other. If you want to optimize for speed, why not set FLEN field to max and WLEN to max and use ti_qspi_adjust_op_size to clamp max read size to 4096 words when required. This should work irrespective of IO modes. Driver already does something similar to this in the write path. > + break; > + case SPI_NBITS_DUAL: > + cmd |= QSPI_RD_DUAL | QSPI_WLEN(16); > break; > default: > - cmd |= QSPI_RD_SNGL; > + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); > break; > } > - wlen = t->bits_per_word >> 3; /* in bytes */ > > while (count) { > dev_dbg(qspi->dev, "rx cmd %08x dc %08x\n", cmd, qspi->dc); > @@ -342,19 +343,34 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > dev_err(qspi->dev, "read timed out\n"); > return -ETIMEDOUT; > } > - switch (wlen) { > - case 1: > - *rxbuf = readb(qspi->base + QSPI_SPI_DATA_REG); > + > + /* Optimize the transfers for dual and quad read */ > + rx = readl(qspi->base + QSPI_SPI_DATA_REG); > + switch (t->rx_nbits) { > + case SPI_NBITS_QUAD: > + if (count >= 1) > + *rxbuf++ = rx >> 24; > + if (count >= 2) > + *rxbuf++ = rx >> 16; > + if (count >= 3) > + *rxbuf++ = rx >> 8; > + if (count >= 4) > + *rxbuf++ = rx; > + xfer_len = min(count, 4); > break; > - case 2: > - *((u16 *)rxbuf) = readw(qspi->base + QSPI_SPI_DATA_REG); > + case SPI_NBITS_DUAL: > + if (count >= 1) > + *rxbuf++ = rx >> 8; > + if (count >= 2) > + *rxbuf++ = rx; > + xfer_len = min(count, 2); > break; > - case 4: > - *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); > + default: > + *rxbuf++ = rx; > + xfer_len = 1; > break; This will fail in case of t->rx_nbits = 1 and t->bits_per_word = 32 (1 bit SPI bus bit slave with 32-bit addressable registers) bits_per_word indicate the address granularity within SPI Slave (represented by WLEN field in TI QSPI) and rx_nbits tells about buswidth (bits received per bus clock) > } > - rxbuf += wlen; > - count -= wlen; > + count -= xfer_len; > } > > return 0; >
Hi Vignesh, On Tue, Dec 10, 2019 at 9:40 AM Vignesh Raghavendra <vigneshr@ti.com> wrote: > > > > On 06/12/19 9:30 pm, Jean Pihet wrote: > > By reading the 32 bits data register and copy the contents to the > > receive buffer, according to the single/dual/quad read mode and > > the data length to read. > > > > The speed improvement is 3.5x using quad read. > > --- > > drivers/spi/spi-ti-qspi.c | 48 ++++++++++++++++++++++++++------------- > > 1 file changed, 32 insertions(+), 16 deletions(-) > > > > diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c > > index 13916232a959..65ec3bcb25ae 100644 > > --- a/drivers/spi/spi-ti-qspi.c > > +++ b/drivers/spi/spi-ti-qspi.c > > @@ -313,24 +313,25 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, > > static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > > int count) > > { > > - int wlen; > > unsigned int cmd; > > + u32 rx; > > u8 *rxbuf; > > + u8 xfer_len; > > > > rxbuf = t->rx_buf; > > cmd = qspi->cmd; > > + /* Optimize the transfers for dual and quad read */ > > switch (t->rx_nbits) { > > - case SPI_NBITS_DUAL: > > - cmd |= QSPI_RD_DUAL; > > - break; > > case SPI_NBITS_QUAD: > > - cmd |= QSPI_RD_QUAD; > > + cmd |= QSPI_RD_QUAD | QSPI_WLEN(32); > > Why does QUAD mode mean 32 bit words? This is based on the assumption that every transfer is multiple of 8 clock cycles. For SPI flash devices where bits_per_word is 8, the original code always reads data by 1 byte, which can be optimized. > > IO mode and word size are independent of each other. If you want to > optimize for speed, why not set FLEN field to max and WLEN to max and > use ti_qspi_adjust_op_size to clamp max read size to 4096 words when > required. This should work irrespective of IO modes. > Driver already does something similar to this in the write path. Ok! A new patch series is coming. > > > + break; > > + case SPI_NBITS_DUAL: > > + cmd |= QSPI_RD_DUAL | QSPI_WLEN(16); > > break; > > default: > > - cmd |= QSPI_RD_SNGL; > > + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); > > break; > > } > > - wlen = t->bits_per_word >> 3; /* in bytes */ > > > > while (count) { > > dev_dbg(qspi->dev, "rx cmd %08x dc %08x\n", cmd, qspi->dc); > > @@ -342,19 +343,34 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > > dev_err(qspi->dev, "read timed out\n"); > > return -ETIMEDOUT; > > } > > - switch (wlen) { > > - case 1: > > - *rxbuf = readb(qspi->base + QSPI_SPI_DATA_REG); > > + > > + /* Optimize the transfers for dual and quad read */ > > + rx = readl(qspi->base + QSPI_SPI_DATA_REG); > > + switch (t->rx_nbits) { > > + case SPI_NBITS_QUAD: > > + if (count >= 1) > > + *rxbuf++ = rx >> 24; > > + if (count >= 2) > > + *rxbuf++ = rx >> 16; > > + if (count >= 3) > > + *rxbuf++ = rx >> 8; > > + if (count >= 4) > > + *rxbuf++ = rx; > > + xfer_len = min(count, 4); > > break; > > - case 2: > > - *((u16 *)rxbuf) = readw(qspi->base + QSPI_SPI_DATA_REG); > > + case SPI_NBITS_DUAL: > > + if (count >= 1) > > + *rxbuf++ = rx >> 8; > > + if (count >= 2) > > + *rxbuf++ = rx; > > + xfer_len = min(count, 2); > > break; > > - case 4: > > - *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); > > + default: > > + *rxbuf++ = rx; > > + xfer_len = 1; > > break; > > This will fail in case of t->rx_nbits = 1 and t->bits_per_word = 32 > (1 bit SPI bus bit slave with 32-bit addressable registers) I do not see why this would fail. If bits_per_word is 32, count (in bytes) is a multiple of 4 and 1 byte will be read every time the loop runs. Is that correct? Can you elaborate more on why this would fail? Thanks for reviewing, regards, Jean > > bits_per_word indicate the address granularity within SPI Slave > (represented by WLEN field in TI QSPI) and rx_nbits tells about buswidth > (bits received per bus clock) > > > } > > - rxbuf += wlen; > > - count -= wlen; > > + count -= xfer_len; > > } > > > > return 0; > > > > -- > Regards > Vignesh
On 10/12/19 5:55 pm, Jean Pihet wrote: > Hi Vignesh, > > On Tue, Dec 10, 2019 at 9:40 AM Vignesh Raghavendra <vigneshr@ti.com> wrote: >> >> >> >> On 06/12/19 9:30 pm, Jean Pihet wrote: >>> By reading the 32 bits data register and copy the contents to the >>> receive buffer, according to the single/dual/quad read mode and >>> the data length to read. >>> >>> The speed improvement is 3.5x using quad read. >>> --- >>> drivers/spi/spi-ti-qspi.c | 48 ++++++++++++++++++++++++++------------- >>> 1 file changed, 32 insertions(+), 16 deletions(-) >>> >>> diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c >>> index 13916232a959..65ec3bcb25ae 100644 >>> --- a/drivers/spi/spi-ti-qspi.c >>> +++ b/drivers/spi/spi-ti-qspi.c >>> @@ -313,24 +313,25 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, >>> static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, >>> int count) >>> { >>> - int wlen; >>> unsigned int cmd; >>> + u32 rx; >>> u8 *rxbuf; >>> + u8 xfer_len; >>> >>> rxbuf = t->rx_buf; >>> cmd = qspi->cmd; >>> + /* Optimize the transfers for dual and quad read */ >>> switch (t->rx_nbits) { >>> - case SPI_NBITS_DUAL: >>> - cmd |= QSPI_RD_DUAL; >>> - break; >>> case SPI_NBITS_QUAD: >>> - cmd |= QSPI_RD_QUAD; >>> + cmd |= QSPI_RD_QUAD | QSPI_WLEN(32); >> >> Why does QUAD mode mean 32 bit words? > This is based on the assumption that every transfer is multiple of 8 > clock cycles. > For SPI flash devices where bits_per_word is 8, the original code > always reads data by 1 byte, which > can be optimized. > >> >> IO mode and word size are independent of each other. If you want to >> optimize for speed, why not set FLEN field to max and WLEN to max and >> use ti_qspi_adjust_op_size to clamp max read size to 4096 words when >> required. This should work irrespective of IO modes. >> Driver already does something similar to this in the write path. > > Ok! A new patch series is coming. > >> >>> + break; >>> + case SPI_NBITS_DUAL: >>> + cmd |= QSPI_RD_DUAL | QSPI_WLEN(16); >>> break; >>> default: >>> - cmd |= QSPI_RD_SNGL; >>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); >>> break; >>> } >>> - wlen = t->bits_per_word >> 3; /* in bytes */ >>> >>> while (count) { >>> dev_dbg(qspi->dev, "rx cmd %08x dc %08x\n", cmd, qspi->dc); >>> @@ -342,19 +343,34 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, >>> dev_err(qspi->dev, "read timed out\n"); >>> return -ETIMEDOUT; >>> } >>> - switch (wlen) { >>> - case 1: >>> - *rxbuf = readb(qspi->base + QSPI_SPI_DATA_REG); >>> + >>> + /* Optimize the transfers for dual and quad read */ >>> + rx = readl(qspi->base + QSPI_SPI_DATA_REG); >>> + switch (t->rx_nbits) { >>> + case SPI_NBITS_QUAD: >>> + if (count >= 1) >>> + *rxbuf++ = rx >> 24; >>> + if (count >= 2) >>> + *rxbuf++ = rx >> 16; >>> + if (count >= 3) >>> + *rxbuf++ = rx >> 8; >>> + if (count >= 4) >>> + *rxbuf++ = rx; >>> + xfer_len = min(count, 4); >>> break; >>> - case 2: >>> - *((u16 *)rxbuf) = readw(qspi->base + QSPI_SPI_DATA_REG); >>> + case SPI_NBITS_DUAL: >>> + if (count >= 1) >>> + *rxbuf++ = rx >> 8; >>> + if (count >= 2) >>> + *rxbuf++ = rx; >>> + xfer_len = min(count, 2); >>> break; >>> - case 4: >>> - *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); >>> + default: >>> + *rxbuf++ = rx; >>> + xfer_len = 1; >>> break; >> >> This will fail in case of t->rx_nbits = 1 and t->bits_per_word = 32 >> (1 bit SPI bus bit slave with 32-bit addressable registers) > I do not see why this would fail. If bits_per_word is 32, count (in > bytes) is a multiple of 4 and 1 byte will be read every time the loop > runs. > Is that correct? Can you elaborate more on why this would fail? > With t->rx_nbits = 1 and t->bits_per_word = 32, controller should always read in 4 byte chunks on the SPI bus, but we have: >>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); which sets word size to 8 bits and thus breaks the transaction to 1 byte chunks on the bus, which is bad. Regards Vignesh > Thanks for reviewing, regards, > Jean > >> >> bits_per_word indicate the address granularity within SPI Slave >> (represented by WLEN field in TI QSPI) and rx_nbits tells about buswidth >> (bits received per bus clock) >> >>> } >>> - rxbuf += wlen; >>> - count -= wlen; >>> + count -= xfer_len; >>> } >>> >>> return 0; >>> >> >> -- >> Regards >> Vignesh
On Tue, Dec 10, 2019 at 1:55 PM Vignesh Raghavendra <vigneshr@ti.com> wrote: > > > > On 10/12/19 5:55 pm, Jean Pihet wrote: > > Hi Vignesh, > > > > On Tue, Dec 10, 2019 at 9:40 AM Vignesh Raghavendra <vigneshr@ti.com> wrote: > >> > >> > >> > >> On 06/12/19 9:30 pm, Jean Pihet wrote: > >>> By reading the 32 bits data register and copy the contents to the > >>> receive buffer, according to the single/dual/quad read mode and > >>> the data length to read. > >>> > >>> The speed improvement is 3.5x using quad read. > >>> --- > >>> drivers/spi/spi-ti-qspi.c | 48 ++++++++++++++++++++++++++------------- > >>> 1 file changed, 32 insertions(+), 16 deletions(-) > >>> > >>> diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c > >>> index 13916232a959..65ec3bcb25ae 100644 > >>> --- a/drivers/spi/spi-ti-qspi.c > >>> +++ b/drivers/spi/spi-ti-qspi.c > >>> @@ -313,24 +313,25 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, > >>> static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > >>> int count) > >>> { > >>> - int wlen; > >>> unsigned int cmd; > >>> + u32 rx; > >>> u8 *rxbuf; > >>> + u8 xfer_len; > >>> > >>> rxbuf = t->rx_buf; > >>> cmd = qspi->cmd; > >>> + /* Optimize the transfers for dual and quad read */ > >>> switch (t->rx_nbits) { > >>> - case SPI_NBITS_DUAL: > >>> - cmd |= QSPI_RD_DUAL; > >>> - break; > >>> case SPI_NBITS_QUAD: > >>> - cmd |= QSPI_RD_QUAD; > >>> + cmd |= QSPI_RD_QUAD | QSPI_WLEN(32); > >> > >> Why does QUAD mode mean 32 bit words? > > This is based on the assumption that every transfer is multiple of 8 > > clock cycles. > > For SPI flash devices where bits_per_word is 8, the original code > > always reads data by 1 byte, which > > can be optimized. > > > >> > >> IO mode and word size are independent of each other. If you want to > >> optimize for speed, why not set FLEN field to max and WLEN to max and > >> use ti_qspi_adjust_op_size to clamp max read size to 4096 words when > >> required. This should work irrespective of IO modes. > >> Driver already does something similar to this in the write path. > > > > Ok! A new patch series is coming. > > > >> > >>> + break; > >>> + case SPI_NBITS_DUAL: > >>> + cmd |= QSPI_RD_DUAL | QSPI_WLEN(16); > >>> break; > >>> default: > >>> - cmd |= QSPI_RD_SNGL; > >>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); > >>> break; > >>> } > >>> - wlen = t->bits_per_word >> 3; /* in bytes */ > >>> > >>> while (count) { > >>> dev_dbg(qspi->dev, "rx cmd %08x dc %08x\n", cmd, qspi->dc); > >>> @@ -342,19 +343,34 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > >>> dev_err(qspi->dev, "read timed out\n"); > >>> return -ETIMEDOUT; > >>> } > >>> - switch (wlen) { > >>> - case 1: > >>> - *rxbuf = readb(qspi->base + QSPI_SPI_DATA_REG); > >>> + > >>> + /* Optimize the transfers for dual and quad read */ > >>> + rx = readl(qspi->base + QSPI_SPI_DATA_REG); > >>> + switch (t->rx_nbits) { > >>> + case SPI_NBITS_QUAD: > >>> + if (count >= 1) > >>> + *rxbuf++ = rx >> 24; > >>> + if (count >= 2) > >>> + *rxbuf++ = rx >> 16; > >>> + if (count >= 3) > >>> + *rxbuf++ = rx >> 8; > >>> + if (count >= 4) > >>> + *rxbuf++ = rx; > >>> + xfer_len = min(count, 4); > >>> break; > >>> - case 2: > >>> - *((u16 *)rxbuf) = readw(qspi->base + QSPI_SPI_DATA_REG); > >>> + case SPI_NBITS_DUAL: > >>> + if (count >= 1) > >>> + *rxbuf++ = rx >> 8; > >>> + if (count >= 2) > >>> + *rxbuf++ = rx; > >>> + xfer_len = min(count, 2); > >>> break; > >>> - case 4: > >>> - *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); > >>> + default: > >>> + *rxbuf++ = rx; > >>> + xfer_len = 1; > >>> break; > >> > >> This will fail in case of t->rx_nbits = 1 and t->bits_per_word = 32 > >> (1 bit SPI bus bit slave with 32-bit addressable registers) > > I do not see why this would fail. If bits_per_word is 32, count (in > > bytes) is a multiple of 4 and 1 byte will be read every time the loop > > runs. > > Is that correct? Can you elaborate more on why this would fail? > > > > With t->rx_nbits = 1 and t->bits_per_word = 32, controller should always > read in 4 byte chunks on the SPI bus, but we have: > > >>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); > > which sets word size to 8 bits and thus breaks the transaction to 1 byte > chunks on the bus, which is bad. In that case there are 4 1-byte reads on the bus, which succeeds. The trace on the logic analyzer shows 4 times 8 clock cycles. > > Regards > Vignesh > > > Thanks for reviewing, regards, > > Jean > > > >> > >> bits_per_word indicate the address granularity within SPI Slave > >> (represented by WLEN field in TI QSPI) and rx_nbits tells about buswidth > >> (bits received per bus clock) > >> > >>> } > >>> - rxbuf += wlen; > >>> - count -= wlen; > >>> + count -= xfer_len; > >>> } > >>> > >>> return 0; > >>> > >> > >> -- > >> Regards > >> Vignesh > > -- > Regards > Vignesh
On 10/12/19 6:31 pm, Jean Pihet wrote: > On Tue, Dec 10, 2019 at 1:55 PM Vignesh Raghavendra <vigneshr@ti.com> wrote: >> >> >> >> On 10/12/19 5:55 pm, Jean Pihet wrote: >>> Hi Vignesh, >>> >>> On Tue, Dec 10, 2019 at 9:40 AM Vignesh Raghavendra <vigneshr@ti.com> wrote: >>>> >>>> >>>> >>>> On 06/12/19 9:30 pm, Jean Pihet wrote: >>>>> By reading the 32 bits data register and copy the contents to the >>>>> receive buffer, according to the single/dual/quad read mode and >>>>> the data length to read. >>>>> >>>>> The speed improvement is 3.5x using quad read. >>>>> --- >>>>> drivers/spi/spi-ti-qspi.c | 48 ++++++++++++++++++++++++++------------- >>>>> 1 file changed, 32 insertions(+), 16 deletions(-) >>>>> >>>>> diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c >>>>> index 13916232a959..65ec3bcb25ae 100644 >>>>> --- a/drivers/spi/spi-ti-qspi.c >>>>> +++ b/drivers/spi/spi-ti-qspi.c >>>>> @@ -313,24 +313,25 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, >>>>> static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, >>>>> int count) >>>>> { >>>>> - int wlen; >>>>> unsigned int cmd; >>>>> + u32 rx; >>>>> u8 *rxbuf; >>>>> + u8 xfer_len; >>>>> >>>>> rxbuf = t->rx_buf; >>>>> cmd = qspi->cmd; >>>>> + /* Optimize the transfers for dual and quad read */ >>>>> switch (t->rx_nbits) { >>>>> - case SPI_NBITS_DUAL: >>>>> - cmd |= QSPI_RD_DUAL; >>>>> - break; >>>>> case SPI_NBITS_QUAD: >>>>> - cmd |= QSPI_RD_QUAD; >>>>> + cmd |= QSPI_RD_QUAD | QSPI_WLEN(32); >>>> >>>> Why does QUAD mode mean 32 bit words? >>> This is based on the assumption that every transfer is multiple of 8 >>> clock cycles. >>> For SPI flash devices where bits_per_word is 8, the original code >>> always reads data by 1 byte, which >>> can be optimized. >>> >>>> >>>> IO mode and word size are independent of each other. If you want to >>>> optimize for speed, why not set FLEN field to max and WLEN to max and >>>> use ti_qspi_adjust_op_size to clamp max read size to 4096 words when >>>> required. This should work irrespective of IO modes. >>>> Driver already does something similar to this in the write path. >>> >>> Ok! A new patch series is coming. >>> >>>> >>>>> + break; >>>>> + case SPI_NBITS_DUAL: >>>>> + cmd |= QSPI_RD_DUAL | QSPI_WLEN(16); >>>>> break; >>>>> default: >>>>> - cmd |= QSPI_RD_SNGL; >>>>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); >>>>> break; >>>>> } >>>>> - wlen = t->bits_per_word >> 3; /* in bytes */ >>>>> >>>>> while (count) { >>>>> dev_dbg(qspi->dev, "rx cmd %08x dc %08x\n", cmd, qspi->dc); >>>>> @@ -342,19 +343,34 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, >>>>> dev_err(qspi->dev, "read timed out\n"); >>>>> return -ETIMEDOUT; >>>>> } >>>>> - switch (wlen) { >>>>> - case 1: >>>>> - *rxbuf = readb(qspi->base + QSPI_SPI_DATA_REG); >>>>> + >>>>> + /* Optimize the transfers for dual and quad read */ >>>>> + rx = readl(qspi->base + QSPI_SPI_DATA_REG); >>>>> + switch (t->rx_nbits) { >>>>> + case SPI_NBITS_QUAD: >>>>> + if (count >= 1) >>>>> + *rxbuf++ = rx >> 24; >>>>> + if (count >= 2) >>>>> + *rxbuf++ = rx >> 16; >>>>> + if (count >= 3) >>>>> + *rxbuf++ = rx >> 8; >>>>> + if (count >= 4) >>>>> + *rxbuf++ = rx; >>>>> + xfer_len = min(count, 4); >>>>> break; >>>>> - case 2: >>>>> - *((u16 *)rxbuf) = readw(qspi->base + QSPI_SPI_DATA_REG); >>>>> + case SPI_NBITS_DUAL: >>>>> + if (count >= 1) >>>>> + *rxbuf++ = rx >> 8; >>>>> + if (count >= 2) >>>>> + *rxbuf++ = rx; >>>>> + xfer_len = min(count, 2); >>>>> break; >>>>> - case 4: >>>>> - *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); >>>>> + default: >>>>> + *rxbuf++ = rx; >>>>> + xfer_len = 1; >>>>> break; >>>> >>>> This will fail in case of t->rx_nbits = 1 and t->bits_per_word = 32 >>>> (1 bit SPI bus bit slave with 32-bit addressable registers) >>> I do not see why this would fail. If bits_per_word is 32, count (in >>> bytes) is a multiple of 4 and 1 byte will be read every time the loop >>> runs. >>> Is that correct? Can you elaborate more on why this would fail? >>> >> >> With t->rx_nbits = 1 and t->bits_per_word = 32, controller should always >> read in 4 byte chunks on the SPI bus, but we have: >> >>>>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); >> >> which sets word size to 8 bits and thus breaks the transaction to 1 byte >> chunks on the bus, which is bad. > > In that case there are 4 1-byte reads on the bus, which succeeds. The > trace on the logic analyzer shows 4 times 8 clock cycles. > Ah, sorry, there is no CS toggle, so this case will work. Although its less efficient as you could have set WLEN to 32 and read entire QSPI_SPI_DATA_REG in one transaction. But this patch definitely changes the behvior when t->rx_nbits = 4 and t->bits_per_word = 32. Previous code did: *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); This patch does: + rx = readl(qspi->base + QSPI_SPI_DATA_REG); [...] + case SPI_NBITS_QUAD: + if (count >= 1) + *rxbuf++ = rx >> 24; + if (count >= 2) + *rxbuf++ = rx >> 16; + if (count >= 3) + *rxbuf++ = rx >> 8; + if (count >= 4) + *rxbuf++ = rx; So there is change in the endianess...
Vignesh, On Tue, Dec 10, 2019 at 2:17 PM Vignesh Raghavendra <vigneshr@ti.com> wrote: > > > > On 10/12/19 6:31 pm, Jean Pihet wrote: > > On Tue, Dec 10, 2019 at 1:55 PM Vignesh Raghavendra <vigneshr@ti.com> wrote: > >> > >> > >> > >> On 10/12/19 5:55 pm, Jean Pihet wrote: > >>> Hi Vignesh, > >>> > >>> On Tue, Dec 10, 2019 at 9:40 AM Vignesh Raghavendra <vigneshr@ti.com> wrote: > >>>> > >>>> > >>>> > >>>> On 06/12/19 9:30 pm, Jean Pihet wrote: > >>>>> By reading the 32 bits data register and copy the contents to the > >>>>> receive buffer, according to the single/dual/quad read mode and > >>>>> the data length to read. > >>>>> > >>>>> The speed improvement is 3.5x using quad read. > >>>>> --- > >>>>> drivers/spi/spi-ti-qspi.c | 48 ++++++++++++++++++++++++++------------- > >>>>> 1 file changed, 32 insertions(+), 16 deletions(-) > >>>>> > >>>>> diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c > >>>>> index 13916232a959..65ec3bcb25ae 100644 > >>>>> --- a/drivers/spi/spi-ti-qspi.c > >>>>> +++ b/drivers/spi/spi-ti-qspi.c > >>>>> @@ -313,24 +313,25 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, > >>>>> static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > >>>>> int count) > >>>>> { > >>>>> - int wlen; > >>>>> unsigned int cmd; > >>>>> + u32 rx; > >>>>> u8 *rxbuf; > >>>>> + u8 xfer_len; > >>>>> > >>>>> rxbuf = t->rx_buf; > >>>>> cmd = qspi->cmd; > >>>>> + /* Optimize the transfers for dual and quad read */ > >>>>> switch (t->rx_nbits) { > >>>>> - case SPI_NBITS_DUAL: > >>>>> - cmd |= QSPI_RD_DUAL; > >>>>> - break; > >>>>> case SPI_NBITS_QUAD: > >>>>> - cmd |= QSPI_RD_QUAD; > >>>>> + cmd |= QSPI_RD_QUAD | QSPI_WLEN(32); > >>>> > >>>> Why does QUAD mode mean 32 bit words? > >>> This is based on the assumption that every transfer is multiple of 8 > >>> clock cycles. > >>> For SPI flash devices where bits_per_word is 8, the original code > >>> always reads data by 1 byte, which > >>> can be optimized. > >>> > >>>> > >>>> IO mode and word size are independent of each other. If you want to > >>>> optimize for speed, why not set FLEN field to max and WLEN to max and > >>>> use ti_qspi_adjust_op_size to clamp max read size to 4096 words when > >>>> required. This should work irrespective of IO modes. > >>>> Driver already does something similar to this in the write path. > >>> > >>> Ok! A new patch series is coming. > >>> > >>>> > >>>>> + break; > >>>>> + case SPI_NBITS_DUAL: > >>>>> + cmd |= QSPI_RD_DUAL | QSPI_WLEN(16); > >>>>> break; > >>>>> default: > >>>>> - cmd |= QSPI_RD_SNGL; > >>>>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); > >>>>> break; > >>>>> } > >>>>> - wlen = t->bits_per_word >> 3; /* in bytes */ > >>>>> > >>>>> while (count) { > >>>>> dev_dbg(qspi->dev, "rx cmd %08x dc %08x\n", cmd, qspi->dc); > >>>>> @@ -342,19 +343,34 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, > >>>>> dev_err(qspi->dev, "read timed out\n"); > >>>>> return -ETIMEDOUT; > >>>>> } > >>>>> - switch (wlen) { > >>>>> - case 1: > >>>>> - *rxbuf = readb(qspi->base + QSPI_SPI_DATA_REG); > >>>>> + > >>>>> + /* Optimize the transfers for dual and quad read */ > >>>>> + rx = readl(qspi->base + QSPI_SPI_DATA_REG); > >>>>> + switch (t->rx_nbits) { > >>>>> + case SPI_NBITS_QUAD: > >>>>> + if (count >= 1) > >>>>> + *rxbuf++ = rx >> 24; > >>>>> + if (count >= 2) > >>>>> + *rxbuf++ = rx >> 16; > >>>>> + if (count >= 3) > >>>>> + *rxbuf++ = rx >> 8; > >>>>> + if (count >= 4) > >>>>> + *rxbuf++ = rx; > >>>>> + xfer_len = min(count, 4); > >>>>> break; > >>>>> - case 2: > >>>>> - *((u16 *)rxbuf) = readw(qspi->base + QSPI_SPI_DATA_REG); > >>>>> + case SPI_NBITS_DUAL: > >>>>> + if (count >= 1) > >>>>> + *rxbuf++ = rx >> 8; > >>>>> + if (count >= 2) > >>>>> + *rxbuf++ = rx; > >>>>> + xfer_len = min(count, 2); > >>>>> break; > >>>>> - case 4: > >>>>> - *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); > >>>>> + default: > >>>>> + *rxbuf++ = rx; > >>>>> + xfer_len = 1; > >>>>> break; > >>>> > >>>> This will fail in case of t->rx_nbits = 1 and t->bits_per_word = 32 > >>>> (1 bit SPI bus bit slave with 32-bit addressable registers) > >>> I do not see why this would fail. If bits_per_word is 32, count (in > >>> bytes) is a multiple of 4 and 1 byte will be read every time the loop > >>> runs. > >>> Is that correct? Can you elaborate more on why this would fail? > >>> > >> > >> With t->rx_nbits = 1 and t->bits_per_word = 32, controller should always > >> read in 4 byte chunks on the SPI bus, but we have: > >> > >>>>> + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); > >> > >> which sets word size to 8 bits and thus breaks the transaction to 1 byte > >> chunks on the bus, which is bad. > > > > In that case there are 4 1-byte reads on the bus, which succeeds. The > > trace on the logic analyzer shows 4 times 8 clock cycles. > > > > Ah, sorry, there is no CS toggle, so this case will work. Although its > less efficient as you could have set WLEN to 32 and read entire > QSPI_SPI_DATA_REG in one transaction. > > But this patch definitely changes the behvior when t->rx_nbits = 4 and > t->bits_per_word = 32. Previous code did: > > *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); > > This patch does: > > + rx = readl(qspi->base + QSPI_SPI_DATA_REG); > [...] > + case SPI_NBITS_QUAD: > + if (count >= 1) > + *rxbuf++ = rx >> 24; > + if (count >= 2) > + *rxbuf++ = rx >> 16; > + if (count >= 3) > + *rxbuf++ = rx >> 8; > + if (count >= 4) > + *rxbuf++ = rx; > > > So there is change in the endianess... Oh! The cases where bits_per_word is different than 8 definitely needs more thinking. I have tested the patches with SPI flash only. How to test it with 32 bits per word? Thanks & regards; Jean > > -- > Regards > Vignesh
On 12/10/2019 8:38 PM, Jean Pihet wrote: > Vignesh, > > On Tue, Dec 10, 2019 at 2:17 PM Vignesh Raghavendra <vigneshr@ti.com> wrote: [...] >> Ah, sorry, there is no CS toggle, so this case will work. Although its >> less efficient as you could have set WLEN to 32 and read entire >> QSPI_SPI_DATA_REG in one transaction. >> >> But this patch definitely changes the behvior when t->rx_nbits = 4 and >> t->bits_per_word = 32. Previous code did: >> >> *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); >> >> This patch does: >> >> + rx = readl(qspi->base + QSPI_SPI_DATA_REG); >> [...] >> + case SPI_NBITS_QUAD: >> + if (count >= 1) >> + *rxbuf++ = rx >> 24; >> + if (count >= 2) >> + *rxbuf++ = rx >> 16; >> + if (count >= 3) >> + *rxbuf++ = rx >> 8; >> + if (count >= 4) >> + *rxbuf++ = rx; >> >> >> So there is change in the endianess... > Oh! The cases where bits_per_word is different than 8 definitely needs > more thinking. I have tested the patches > with SPI flash only. How to test it with 32 bits per word? > My suggestion would be to restrict optimizations to just address SPI Flash usecase, i.e t->bits_per_word == 8. And not touch any other word sizes. See qspi_write_msg() on how this can be done for Single bit mode, same can be extended for other modes. Regards Vignesh
diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c index 13916232a959..65ec3bcb25ae 100644 --- a/drivers/spi/spi-ti-qspi.c +++ b/drivers/spi/spi-ti-qspi.c @@ -313,24 +313,25 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, int count) { - int wlen; unsigned int cmd; + u32 rx; u8 *rxbuf; + u8 xfer_len; rxbuf = t->rx_buf; cmd = qspi->cmd; + /* Optimize the transfers for dual and quad read */ switch (t->rx_nbits) { - case SPI_NBITS_DUAL: - cmd |= QSPI_RD_DUAL; - break; case SPI_NBITS_QUAD: - cmd |= QSPI_RD_QUAD; + cmd |= QSPI_RD_QUAD | QSPI_WLEN(32); + break; + case SPI_NBITS_DUAL: + cmd |= QSPI_RD_DUAL | QSPI_WLEN(16); break; default: - cmd |= QSPI_RD_SNGL; + cmd |= QSPI_RD_SNGL | QSPI_WLEN(8); break; } - wlen = t->bits_per_word >> 3; /* in bytes */ while (count) { dev_dbg(qspi->dev, "rx cmd %08x dc %08x\n", cmd, qspi->dc); @@ -342,19 +343,34 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, dev_err(qspi->dev, "read timed out\n"); return -ETIMEDOUT; } - switch (wlen) { - case 1: - *rxbuf = readb(qspi->base + QSPI_SPI_DATA_REG); + + /* Optimize the transfers for dual and quad read */ + rx = readl(qspi->base + QSPI_SPI_DATA_REG); + switch (t->rx_nbits) { + case SPI_NBITS_QUAD: + if (count >= 1) + *rxbuf++ = rx >> 24; + if (count >= 2) + *rxbuf++ = rx >> 16; + if (count >= 3) + *rxbuf++ = rx >> 8; + if (count >= 4) + *rxbuf++ = rx; + xfer_len = min(count, 4); break; - case 2: - *((u16 *)rxbuf) = readw(qspi->base + QSPI_SPI_DATA_REG); + case SPI_NBITS_DUAL: + if (count >= 1) + *rxbuf++ = rx >> 8; + if (count >= 2) + *rxbuf++ = rx; + xfer_len = min(count, 2); break; - case 4: - *((u32 *)rxbuf) = readl(qspi->base + QSPI_SPI_DATA_REG); + default: + *rxbuf++ = rx; + xfer_len = 1; break; } - rxbuf += wlen; - count -= wlen; + count -= xfer_len; } return 0;