diff mbox

[v2] mmc: mxcmmc: fix bug that may block a data transfer forever.

Message ID 1347014617-16238-1-git-send-email-javier.martin@vista-silicon.com (mailing list archive)
State Accepted, archived
Headers show

Commit Message

Javier Martin Sept. 7, 2012, 10:43 a.m. UTC
The problem can be easily reproduced using a script that loops
copying a file in an SD card to another place in the same SD card
and its related to read transfers. This only happens with DMA enabled.

This is related to the fact that, when reading, an MMC irq signals
the fact that all data from the SD card has been copied to the
internal buffers. However, it doesn't signal whether the DMA transfer
that is in charge of moving data from these internal buffers to RAM
has finished or not. Thus, calling dmaengine_terminate_all() in the
MMC irq routine can cancel an ongoing DMA transfer leaving some data
in the internal buffers that produces an accumulative effect which,
in the end, blocks a read data transfer forever.

The following patch watches DMA irq for reading and MMC irqs for
writing transfers. The 'dangerous' usage of dmaengine_terminate_all()
is removed and a timeout of 10 seconds is added so that the MMC won't
block forever anymore.

Signed-off-by: Javier Martin <javier.martin@vista-silicon.com>
---
Changes since v1:
 - Use DMA callbacks for reading and MMC irqs for writing instead
 of monitoring both irqs for each transfer.
 - Increase timeout to 10 seconds to give enough time to slow cards.

---
 drivers/mmc/host/mxcmmc.c |   76 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 70 insertions(+), 6 deletions(-)

Comments

Sascha Hauer Sept. 10, 2012, 7:47 a.m. UTC | #1
On Fri, Sep 07, 2012 at 12:43:37PM +0200, Javier Martin wrote:
> The problem can be easily reproduced using a script that loops
> copying a file in an SD card to another place in the same SD card
> and its related to read transfers. This only happens with DMA enabled.
> 
> This is related to the fact that, when reading, an MMC irq signals
> the fact that all data from the SD card has been copied to the
> internal buffers. However, it doesn't signal whether the DMA transfer
> that is in charge of moving data from these internal buffers to RAM
> has finished or not. Thus, calling dmaengine_terminate_all() in the
> MMC irq routine can cancel an ongoing DMA transfer leaving some data
> in the internal buffers that produces an accumulative effect which,
> in the end, blocks a read data transfer forever.
> 
> The following patch watches DMA irq for reading and MMC irqs for
> writing transfers. The 'dangerous' usage of dmaengine_terminate_all()
> is removed and a timeout of 10 seconds is added so that the MMC won't
> block forever anymore.
> 
> Signed-off-by: Javier Martin <javier.martin@vista-silicon.com>
> ---
> Changes since v1:
>  - Use DMA callbacks for reading and MMC irqs for writing instead
>  of monitoring both irqs for each transfer.
>  - Increase timeout to 10 seconds to give enough time to slow cards.
> 

Looks good now, thanks.

Reviewed-by: Sascha Hauer <s.hauer@pengutronix.de>


> ---
>  drivers/mmc/host/mxcmmc.c |   76 +++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 70 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c
> index 28ed52d..fc42a2e 100644
> --- a/drivers/mmc/host/mxcmmc.c
> +++ b/drivers/mmc/host/mxcmmc.c
> @@ -44,6 +44,7 @@
>  #include <mach/hardware.h>
>  
>  #define DRIVER_NAME "mxc-mmc"
> +#define MXCMCI_TIMEOUT_MS 10000
>  
>  #define MMC_REG_STR_STP_CLK		0x00
>  #define MMC_REG_STATUS			0x04
> @@ -150,6 +151,8 @@ struct mxcmci_host {
>  	int			dmareq;
>  	struct dma_slave_config dma_slave_config;
>  	struct imx_dma_data	dma_data;
> +
> +	struct timer_list	watchdog;
>  };
>  
>  static void mxcmci_set_clk_rate(struct mxcmci_host *host, unsigned int clk_ios);
> @@ -271,9 +274,32 @@ static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
>  	dmaengine_submit(host->desc);
>  	dma_async_issue_pending(host->dma);
>  
> +	mod_timer(&host->watchdog, jiffies + msecs_to_jiffies(MXCMCI_TIMEOUT_MS));
> +
>  	return 0;
>  }
>  
> +static void mxcmci_cmd_done(struct mxcmci_host *host, unsigned int stat);
> +static void mxcmci_data_done(struct mxcmci_host *host, unsigned int stat);
> +
> +static void mxcmci_dma_callback(void *data)
> +{
> +	struct mxcmci_host *host = data;
> +	u32 stat;
> +
> +	del_timer(&host->watchdog);
> +
> +	stat = readl(host->base + MMC_REG_STATUS);
> +	writel(stat & ~STATUS_DATA_TRANS_DONE, host->base + MMC_REG_STATUS);
> +
> +	dev_dbg(mmc_dev(host->mmc), "%s: 0x%08x\n", __func__, stat);
> +
> +	if (stat & STATUS_READ_OP_DONE)
> +		writel(STATUS_READ_OP_DONE, host->base + MMC_REG_STATUS);
> +
> +	mxcmci_data_done(host, stat);
> +}
> +
>  static int mxcmci_start_cmd(struct mxcmci_host *host, struct mmc_command *cmd,
>  		unsigned int cmdat)
>  {
> @@ -305,8 +331,14 @@ static int mxcmci_start_cmd(struct mxcmci_host *host, struct mmc_command *cmd,
>  
>  	int_cntr = INT_END_CMD_RES_EN;
>  
> -	if (mxcmci_use_dma(host))
> -		int_cntr |= INT_READ_OP_EN | INT_WRITE_OP_DONE_EN;
> +	if (mxcmci_use_dma(host)) {
> +		if (host->dma_dir == DMA_FROM_DEVICE) {
> +			host->desc->callback = mxcmci_dma_callback;
> +			host->desc->callback_param = host;
> +		} else {
> +			int_cntr |= INT_WRITE_OP_DONE_EN;
> +		}
> +	}
>  
>  	spin_lock_irqsave(&host->lock, flags);
>  	if (host->use_sdio)
> @@ -345,11 +377,9 @@ static int mxcmci_finish_data(struct mxcmci_host *host, unsigned int stat)
>  	struct mmc_data *data = host->data;
>  	int data_error;
>  
> -	if (mxcmci_use_dma(host)) {
> -		dmaengine_terminate_all(host->dma);
> +	if (mxcmci_use_dma(host))
>  		dma_unmap_sg(host->dma->device->dev, data->sg, data->sg_len,
>  				host->dma_dir);
> -	}
>  
>  	if (stat & STATUS_ERR_MASK) {
>  		dev_dbg(mmc_dev(host->mmc), "request failed. status: 0x%08x\n",
> @@ -624,8 +654,10 @@ static irqreturn_t mxcmci_irq(int irq, void *devid)
>  		mxcmci_cmd_done(host, stat);
>  
>  	if (mxcmci_use_dma(host) &&
> -		  (stat & (STATUS_DATA_TRANS_DONE | STATUS_WRITE_OP_DONE)))
> +		  (stat & (STATUS_DATA_TRANS_DONE | STATUS_WRITE_OP_DONE))) {
> +		del_timer(&host->watchdog);
>  		mxcmci_data_done(host, stat);
> +	}
>  
>  	if (host->default_irq_mask &&
>  		  (stat & (STATUS_CARD_INSERTION | STATUS_CARD_REMOVAL)))
> @@ -836,6 +868,34 @@ static bool filter(struct dma_chan *chan, void *param)
>  	return true;
>  }
>  
> +static void mxcmci_watchdog(unsigned long data)
> +{
> +	struct mmc_host *mmc = (struct mmc_host *)data;
> +	struct mxcmci_host *host = mmc_priv(mmc);
> +	struct mmc_request *req = host->req;
> +	unsigned int stat = readl(host->base + MMC_REG_STATUS);
> +
> +	if (host->dma_dir == DMA_FROM_DEVICE) {
> +		dmaengine_terminate_all(host->dma);
> +		dev_err(mmc_dev(host->mmc),
> +			"%s: read time out (status = 0x%08x)\n",
> +			__func__, stat);
> +	} else {
> +		dev_err(mmc_dev(host->mmc),
> +			"%s: write time out (status = 0x%08x)\n",
> +			__func__, stat);
> +		mxcmci_softreset(host);
> +	}
> +
> +	/* Mark transfer as erroneus and inform the upper layers */
> +
> +	host->data->error = -ETIMEDOUT;
> +	host->req = NULL;
> +	host->cmd = NULL;
> +	host->data = NULL;
> +	mmc_request_done(host->mmc, req);
> +}
> +
>  static const struct mmc_host_ops mxcmci_ops = {
>  	.request		= mxcmci_request,
>  	.set_ios		= mxcmci_set_ios,
> @@ -968,6 +1028,10 @@ static int mxcmci_probe(struct platform_device *pdev)
>  
>  	mmc_add_host(mmc);
>  
> +	init_timer(&host->watchdog);
> +	host->watchdog.function = &mxcmci_watchdog;
> +	host->watchdog.data = (unsigned long)mmc;
> +
>  	return 0;
>  
>  out_free_irq:
> -- 
> 1.7.9.5
> 
>
Vinod Koul Sept. 14, 2012, 2:52 a.m. UTC | #2
On Fri, 2012-09-07 at 12:43 +0200, Javier Martin wrote:
> 
> The problem can be easily reproduced using a script that loops
> copying a file in an SD card to another place in the same SD card
> and its related to read transfers. This only happens with DMA enabled.
> 
> This is related to the fact that, when reading, an MMC irq signals
> the fact that all data from the SD card has been copied to the
> internal buffers. However, it doesn't signal whether the DMA transfer
> that is in charge of moving data from these internal buffers to RAM
> has finished or not. Thus, calling dmaengine_terminate_all() in the
> MMC irq routine can cancel an ongoing DMA transfer leaving some data
> in the internal buffers that produces an accumulative effect which,
> in the end, blocks a read data transfer forever.
> 
> The following patch watches DMA irq for reading and MMC irqs for
> writing transfers. The 'dangerous' usage of dmaengine_terminate_all()
> is removed and a timeout of 10 seconds is added so that the MMC won't
> block forever anymore. 
For a normal transactions why should you call dmaengine_terminate_all().
This should be called when you doing abort or cleanup in some erranous
situation.

why was it called in first place?
Javier Martin Sept. 14, 2012, 12:50 p.m. UTC | #3
Hi Vinod,

On 14 September 2012 04:52, Vinod Koul <vinod.koul@linux.intel.com> wrote:
> On Fri, 2012-09-07 at 12:43 +0200, Javier Martin wrote:
>>
>> The problem can be easily reproduced using a script that loops
>> copying a file in an SD card to another place in the same SD card
>> and its related to read transfers. This only happens with DMA enabled.
>>
>> This is related to the fact that, when reading, an MMC irq signals
>> the fact that all data from the SD card has been copied to the
>> internal buffers. However, it doesn't signal whether the DMA transfer
>> that is in charge of moving data from these internal buffers to RAM
>> has finished or not. Thus, calling dmaengine_terminate_all() in the
>> MMC irq routine can cancel an ongoing DMA transfer leaving some data
>> in the internal buffers that produces an accumulative effect which,
>> in the end, blocks a read data transfer forever.
>>
>> The following patch watches DMA irq for reading and MMC irqs for
>> writing transfers. The 'dangerous' usage of dmaengine_terminate_all()
>> is removed and a timeout of 10 seconds is added so that the MMC won't
>> block forever anymore.
> For a normal transactions why should you call dmaengine_terminate_all().

Agree. In fact, this patch fixes that.

> This should be called when you doing abort or cleanup in some erranous
> situation.

After this patch, dmaengine_terminate_all() will only be called if a
DMA transfer lasts more than 10 seconds, which is an explicit
error/abort that can happen sometimes (ie. when an SD card is worn
out).

> why was it called in first place?

I don't know, because I didn't write the original support but it's not
relevant because this patch removes the incorrect use of
dmaengine_terminate_all().

Since you took the time to review the patch, could you please give me
your ack too in case you find everything seems fine?

Regards.
Chris Ball Sept. 19, 2012, 5:52 a.m. UTC | #4
Hi,

On Mon, Sep 10 2012, Sascha Hauer wrote:
> On Fri, Sep 07, 2012 at 12:43:37PM +0200, Javier Martin wrote:
>> The problem can be easily reproduced using a script that loops
>> copying a file in an SD card to another place in the same SD card
>> and its related to read transfers. This only happens with DMA enabled.
>> 
>> This is related to the fact that, when reading, an MMC irq signals
>> the fact that all data from the SD card has been copied to the
>> internal buffers. However, it doesn't signal whether the DMA transfer
>> that is in charge of moving data from these internal buffers to RAM
>> has finished or not. Thus, calling dmaengine_terminate_all() in the
>> MMC irq routine can cancel an ongoing DMA transfer leaving some data
>> in the internal buffers that produces an accumulative effect which,
>> in the end, blocks a read data transfer forever.
>> 
>> The following patch watches DMA irq for reading and MMC irqs for
>> writing transfers. The 'dangerous' usage of dmaengine_terminate_all()
>> is removed and a timeout of 10 seconds is added so that the MMC won't
>> block forever anymore.
>> 
>> Signed-off-by: Javier Martin <javier.martin@vista-silicon.com>
>> ---
>> Changes since v1:
>>  - Use DMA callbacks for reading and MMC irqs for writing instead
>>  of monitoring both irqs for each transfer.
>>  - Increase timeout to 10 seconds to give enough time to slow cards.
>> 
>
> Looks good now, thanks.
>
> Reviewed-by: Sascha Hauer <s.hauer@pengutronix.de>

Thanks, queued in mmc-next for 3.7.  Vinod, would be good to add your
Reviewed-by if you're happy with this patch now.

- Chris.
Anatolij Gustschin Feb. 19, 2013, 2:14 p.m. UTC | #5
On Fri,  7 Sep 2012 12:43:37 +0200
Javier Martin <javier.martin@vista-silicon.com> wrote:
...
> +static void mxcmci_dma_callback(void *data)
> +{
> +	struct mxcmci_host *host = data;
> +	u32 stat;
> +
> +	del_timer(&host->watchdog);
> +
> +	stat = readl(host->base + MMC_REG_STATUS);
> +	writel(stat & ~STATUS_DATA_TRANS_DONE, host->base + MMC_REG_STATUS);
> +
> +	dev_dbg(mmc_dev(host->mmc), "%s: 0x%08x\n", __func__, stat);
> +
> +	if (stat & STATUS_READ_OP_DONE)
> +		writel(STATUS_READ_OP_DONE, host->base + MMC_REG_STATUS);
> +
> +	mxcmci_data_done(host, stat);

this change introduces a race condition for host->req (and maybe
for host->data) accesses. The callback is running in soft-irq context and can
be interrupted by the mxcmci_irq() interrupt which can finish the request and
set host->req to NULL. Then mxcmci_data_done() crashes with a null pointer
dereference. How extensively was it tested?

Anatolij
--
To unsubscribe from this list: send the line "unsubscribe linux-mmc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Javier Martin Feb. 21, 2013, 12:32 p.m. UTC | #6
Hi,

On 19 February 2013 15:14, Anatolij Gustschin <agust@denx.de> wrote:
> On Fri,  7 Sep 2012 12:43:37 +0200
> Javier Martin <javier.martin@vista-silicon.com> wrote:
> ...
>> +static void mxcmci_dma_callback(void *data)
>> +{
>> +     struct mxcmci_host *host = data;
>> +     u32 stat;
>> +
>> +     del_timer(&host->watchdog);
>> +
>> +     stat = readl(host->base + MMC_REG_STATUS);
>> +     writel(stat & ~STATUS_DATA_TRANS_DONE, host->base + MMC_REG_STATUS);
>> +
>> +     dev_dbg(mmc_dev(host->mmc), "%s: 0x%08x\n", __func__, stat);
>> +
>> +     if (stat & STATUS_READ_OP_DONE)
>> +             writel(STATUS_READ_OP_DONE, host->base + MMC_REG_STATUS);
>> +
>> +     mxcmci_data_done(host, stat);
>
> this change introduces a race condition for host->req (and maybe
> for host->data) accesses. The callback is running in soft-irq context and can
> be interrupted by the mxcmci_irq() interrupt which can finish the request and
> set host->req to NULL. Then mxcmci_data_done() crashes with a null pointer
> dereference. How extensively was it tested?

It was tested executing a loop with bonnie++ for a week.
I didn't notice the race condition. Have you had real issues or just
found  it by analysing the code?

Regards.
Anatolij Gustschin Feb. 21, 2013, 12:57 p.m. UTC | #7
Hi,

On Thu, 21 Feb 2013 13:32:08 +0100
javier Martin <javier.martin@vista-silicon.com> wrote:
...
> It was tested executing a loop with bonnie++ for a week.
> I didn't notice the race condition. Have you had real issues or just
> found  it by analysing the code?

I've had real issues, but my setup is different. I'm using this mxcmmc
driver with extensions on mpc5121e (with different DMA controller/driver).
It is quite difficult to reproduce the issue on my test system. I've also
an i.mx31 based pcm037 board but the SD-Card slot is broken on it, so I
can not test the driver on i.mx31.

Thanks,

Anatolij
--
To unsubscribe from this list: send the line "unsubscribe linux-mmc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c
index 28ed52d..fc42a2e 100644
--- a/drivers/mmc/host/mxcmmc.c
+++ b/drivers/mmc/host/mxcmmc.c
@@ -44,6 +44,7 @@ 
 #include <mach/hardware.h>
 
 #define DRIVER_NAME "mxc-mmc"
+#define MXCMCI_TIMEOUT_MS 10000
 
 #define MMC_REG_STR_STP_CLK		0x00
 #define MMC_REG_STATUS			0x04
@@ -150,6 +151,8 @@  struct mxcmci_host {
 	int			dmareq;
 	struct dma_slave_config dma_slave_config;
 	struct imx_dma_data	dma_data;
+
+	struct timer_list	watchdog;
 };
 
 static void mxcmci_set_clk_rate(struct mxcmci_host *host, unsigned int clk_ios);
@@ -271,9 +274,32 @@  static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
 	dmaengine_submit(host->desc);
 	dma_async_issue_pending(host->dma);
 
+	mod_timer(&host->watchdog, jiffies + msecs_to_jiffies(MXCMCI_TIMEOUT_MS));
+
 	return 0;
 }
 
+static void mxcmci_cmd_done(struct mxcmci_host *host, unsigned int stat);
+static void mxcmci_data_done(struct mxcmci_host *host, unsigned int stat);
+
+static void mxcmci_dma_callback(void *data)
+{
+	struct mxcmci_host *host = data;
+	u32 stat;
+
+	del_timer(&host->watchdog);
+
+	stat = readl(host->base + MMC_REG_STATUS);
+	writel(stat & ~STATUS_DATA_TRANS_DONE, host->base + MMC_REG_STATUS);
+
+	dev_dbg(mmc_dev(host->mmc), "%s: 0x%08x\n", __func__, stat);
+
+	if (stat & STATUS_READ_OP_DONE)
+		writel(STATUS_READ_OP_DONE, host->base + MMC_REG_STATUS);
+
+	mxcmci_data_done(host, stat);
+}
+
 static int mxcmci_start_cmd(struct mxcmci_host *host, struct mmc_command *cmd,
 		unsigned int cmdat)
 {
@@ -305,8 +331,14 @@  static int mxcmci_start_cmd(struct mxcmci_host *host, struct mmc_command *cmd,
 
 	int_cntr = INT_END_CMD_RES_EN;
 
-	if (mxcmci_use_dma(host))
-		int_cntr |= INT_READ_OP_EN | INT_WRITE_OP_DONE_EN;
+	if (mxcmci_use_dma(host)) {
+		if (host->dma_dir == DMA_FROM_DEVICE) {
+			host->desc->callback = mxcmci_dma_callback;
+			host->desc->callback_param = host;
+		} else {
+			int_cntr |= INT_WRITE_OP_DONE_EN;
+		}
+	}
 
 	spin_lock_irqsave(&host->lock, flags);
 	if (host->use_sdio)
@@ -345,11 +377,9 @@  static int mxcmci_finish_data(struct mxcmci_host *host, unsigned int stat)
 	struct mmc_data *data = host->data;
 	int data_error;
 
-	if (mxcmci_use_dma(host)) {
-		dmaengine_terminate_all(host->dma);
+	if (mxcmci_use_dma(host))
 		dma_unmap_sg(host->dma->device->dev, data->sg, data->sg_len,
 				host->dma_dir);
-	}
 
 	if (stat & STATUS_ERR_MASK) {
 		dev_dbg(mmc_dev(host->mmc), "request failed. status: 0x%08x\n",
@@ -624,8 +654,10 @@  static irqreturn_t mxcmci_irq(int irq, void *devid)
 		mxcmci_cmd_done(host, stat);
 
 	if (mxcmci_use_dma(host) &&
-		  (stat & (STATUS_DATA_TRANS_DONE | STATUS_WRITE_OP_DONE)))
+		  (stat & (STATUS_DATA_TRANS_DONE | STATUS_WRITE_OP_DONE))) {
+		del_timer(&host->watchdog);
 		mxcmci_data_done(host, stat);
+	}
 
 	if (host->default_irq_mask &&
 		  (stat & (STATUS_CARD_INSERTION | STATUS_CARD_REMOVAL)))
@@ -836,6 +868,34 @@  static bool filter(struct dma_chan *chan, void *param)
 	return true;
 }
 
+static void mxcmci_watchdog(unsigned long data)
+{
+	struct mmc_host *mmc = (struct mmc_host *)data;
+	struct mxcmci_host *host = mmc_priv(mmc);
+	struct mmc_request *req = host->req;
+	unsigned int stat = readl(host->base + MMC_REG_STATUS);
+
+	if (host->dma_dir == DMA_FROM_DEVICE) {
+		dmaengine_terminate_all(host->dma);
+		dev_err(mmc_dev(host->mmc),
+			"%s: read time out (status = 0x%08x)\n",
+			__func__, stat);
+	} else {
+		dev_err(mmc_dev(host->mmc),
+			"%s: write time out (status = 0x%08x)\n",
+			__func__, stat);
+		mxcmci_softreset(host);
+	}
+
+	/* Mark transfer as erroneus and inform the upper layers */
+
+	host->data->error = -ETIMEDOUT;
+	host->req = NULL;
+	host->cmd = NULL;
+	host->data = NULL;
+	mmc_request_done(host->mmc, req);
+}
+
 static const struct mmc_host_ops mxcmci_ops = {
 	.request		= mxcmci_request,
 	.set_ios		= mxcmci_set_ios,
@@ -968,6 +1028,10 @@  static int mxcmci_probe(struct platform_device *pdev)
 
 	mmc_add_host(mmc);
 
+	init_timer(&host->watchdog);
+	host->watchdog.function = &mxcmci_watchdog;
+	host->watchdog.data = (unsigned long)mmc;
+
 	return 0;
 
 out_free_irq: