diff mbox series

[v4] dmaengine: tegra-apb: Support per-burst residue granularity

Message ID 20190703012836.16568-1-digetx@gmail.com (mailing list archive)
State Superseded
Headers show
Series [v4] dmaengine: tegra-apb: Support per-burst residue granularity | expand

Commit Message

Dmitry Osipenko July 3, 2019, 1:28 a.m. UTC
Tegra's APB DMA engine updates the words counter after each transferred burst
of data, hence it can report a transfer's residual with more fidelity, which
may be required in cases like audio playback. In particular, this fixes
audio stuttering during playback in the Chromium web browser. The patch is
based on the original work that was made by Ben Dooks and a patch from the
downstream kernel. It was tested on Tegra20 and Tegra30 devices.

Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---

Changelog:

v4: The words_xferred is now also reset on a new iteration of a cyclic
    transfer by ISR, so that dmaengine_tx_status() won't produce a
    misleading warning splat on TX status re-checking after a cycle
    completion when cyclic transfer consists of a single SG.

v3: Added workaround for a hardware design shortcoming that results
    in a words counter wraparound before end-of-transfer bit is set
    in a cyclic mode.

v2: Addressed review comments made by Jon Hunter to v1. We won't try
    to get words count if dma_desc is on free list as it will result
    in a NULL dereference because this case wasn't handled properly.

    The residual value is now updated properly, avoiding potential
    integer overflow by adding the "bytes" to the "bytes_transferred"
    instead of the subtraction.

 drivers/dma/tegra20-apb-dma.c | 72 +++++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 7 deletions(-)

Comments

Jon Hunter July 3, 2019, 4:37 p.m. UTC | #1
On 03/07/2019 02:28, Dmitry Osipenko wrote:
> Tegra's APB DMA engine updates words counter after each transferred burst
> of data, hence it can report transfer's residual with more fidelity which
> may be required in cases like audio playback. In particular this fixes
> audio stuttering during playback in a chromium web browser. The patch is
> based on the original work that was made by Ben Dooks and a patch from
> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
> 
> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
> Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
> 
> Changelog:
> 
> v4: The words_xferred is now also reset on a new iteration of a cyclic
>     transfer by ISR, so that dmaengine_tx_status() won't produce a
>     misleading warning splat on TX status re-checking after a cycle
>     completion when cyclic transfer consists of a single SG.
> 
> v3: Added workaround for a hardware design shortcoming that results
>     in a words counter wraparound before end-of-transfer bit is set
>     in a cyclic mode.
> 
> v2: Addressed review comments made by Jon Hunter to v1. We won't try
>     to get words count if dma_desc is on free list as it will result
>     in a NULL dereference because this case wasn't handled properly.
> 
>     The residual value is now updated properly, avoiding potential
>     integer overflow by adding the "bytes" to the "bytes_transferred"
>     instead of the subtraction.
> 
>  drivers/dma/tegra20-apb-dma.c | 72 +++++++++++++++++++++++++++++++----
>  1 file changed, 65 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
> index 79e9593815f1..148d136191d7 100644
> --- a/drivers/dma/tegra20-apb-dma.c
> +++ b/drivers/dma/tegra20-apb-dma.c
> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>  	bool				last_sg;
>  	struct list_head		node;
>  	struct tegra_dma_desc		*dma_desc;
> +	unsigned int			words_xferred;
>  };
>  
>  /*
> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>  	nsg_req->configured = true;
> +	nsg_req->words_xferred = 0;
>  
>  	tegra_dma_resume(tdc);
>  }
> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>  					typeof(*sg_req), node);
>  	tegra_dma_start(tdc, sg_req);
>  	sg_req->configured = true;
> +	sg_req->words_xferred = 0;
>  	tdc->busy = true;
>  }
>  
> @@ -638,6 +641,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>  		list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
>  	dma_desc->cb_count++;
>  
> +	sgreq->words_xferred = 0;
> +
>  	/* If not last req then put at end of pending list */
>  	if (!list_is_last(&sgreq->node, &tdc->pending_sg_req)) {
>  		list_move_tail(&sgreq->node, &tdc->pending_sg_req);
> @@ -797,6 +802,62 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>  	return 0;
>  }
>  
> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
> +					       struct tegra_dma_sg_req *sg_req)
> +{
> +	unsigned long status, wcount = 0;
> +
> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
> +		return 0;
> +
> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
> +
> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
> +
> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = status;
> +
> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
> +		return sg_req->req_len;
> +
> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
> +
> +	if (!wcount) {
> +		/*
> +		 * If wcount wasn't ever polled for this SG before, then
> +		 * simply assume that transfer hasn't started yet.
> +		 *
> +		 * Otherwise it's the end of the transfer.
> +		 *
> +		 * The alternative would be to poll the status register
> +		 * until EOC bit is set or wcount goes UP. That's so
> +		 * because EOC bit is getting set only after the last
> +		 * burst's completion and counter is less than the actual
> +		 * transfer size by 4 bytes. The counter value wraps around
> +		 * in a cyclic mode before EOC is set(!), so we can't easily
> +		 * distinguish start of transfer from its end.
> +		 */
> +		if (sg_req->words_xferred)
> +			wcount = sg_req->req_len - 4;
> +
> +	} else if (wcount < sg_req->words_xferred) {
> +		/*
> +		 * This case shall not ever happen because EOC bit
> +		 * must be set once next cyclic transfer is started.

Should this still be cyclic here?

Cheers
Jon
Dmitry Osipenko July 3, 2019, 5 p.m. UTC | #2
03.07.2019 19:37, Jon Hunter пишет:
> 
> On 03/07/2019 02:28, Dmitry Osipenko wrote:
>> Tegra's APB DMA engine updates words counter after each transferred burst
>> of data, hence it can report transfer's residual with more fidelity which
>> may be required in cases like audio playback. In particular this fixes
>> audio stuttering during playback in a chromium web browser. The patch is
>> based on the original work that was made by Ben Dooks and a patch from
>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>
>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>> Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>
>> Changelog:
>>
>> v4: The words_xferred is now also reset on a new iteration of a cyclic
>>     transfer by ISR, so that dmaengine_tx_status() won't produce a
>>     misleading warning splat on TX status re-checking after a cycle
>>     completion when cyclic transfer consists of a single SG.
>>
>> v3: Added workaround for a hardware design shortcoming that results
>>     in a words counter wraparound before end-of-transfer bit is set
>>     in a cyclic mode.
>>
>> v2: Addressed review comments made by Jon Hunter to v1. We won't try
>>     to get words count if dma_desc is on free list as it will result
>>     in a NULL dereference because this case wasn't handled properly.
>>
>>     The residual value is now updated properly, avoiding potential
>>     integer overflow by adding the "bytes" to the "bytes_transferred"
>>     instead of the subtraction.
>>
>>  drivers/dma/tegra20-apb-dma.c | 72 +++++++++++++++++++++++++++++++----
>>  1 file changed, 65 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>> index 79e9593815f1..148d136191d7 100644
>> --- a/drivers/dma/tegra20-apb-dma.c
>> +++ b/drivers/dma/tegra20-apb-dma.c
>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>  	bool				last_sg;
>>  	struct list_head		node;
>>  	struct tegra_dma_desc		*dma_desc;
>> +	unsigned int			words_xferred;
>>  };
>>  
>>  /*
>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>  	nsg_req->configured = true;
>> +	nsg_req->words_xferred = 0;
>>  
>>  	tegra_dma_resume(tdc);
>>  }
>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>  					typeof(*sg_req), node);
>>  	tegra_dma_start(tdc, sg_req);
>>  	sg_req->configured = true;
>> +	sg_req->words_xferred = 0;
>>  	tdc->busy = true;
>>  }
>>  
>> @@ -638,6 +641,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>>  		list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
>>  	dma_desc->cb_count++;
>>  
>> +	sgreq->words_xferred = 0;
>> +
>>  	/* If not last req then put at end of pending list */
>>  	if (!list_is_last(&sgreq->node, &tdc->pending_sg_req)) {
>>  		list_move_tail(&sgreq->node, &tdc->pending_sg_req);
>> @@ -797,6 +802,62 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>  	return 0;
>>  }
>>  
>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>> +					       struct tegra_dma_sg_req *sg_req)
>> +{
>> +	unsigned long status, wcount = 0;
>> +
>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>> +		return 0;
>> +
>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>> +
>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>> +
>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = status;
>> +
>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>> +		return sg_req->req_len;
>> +
>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>> +
>> +	if (!wcount) {
>> +		/*
>> +		 * If wcount wasn't ever polled for this SG before, then
>> +		 * simply assume that transfer hasn't started yet.
>> +		 *
>> +		 * Otherwise it's the end of the transfer.
>> +		 *
>> +		 * The alternative would be to poll the status register
>> +		 * until EOC bit is set or wcount goes UP. That's so
>> +		 * because EOC bit is getting set only after the last
>> +		 * burst's completion and counter is less than the actual
>> +		 * transfer size by 4 bytes. The counter value wraps around
>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>> +		 * distinguish start of transfer from its end.
>> +		 */
>> +		if (sg_req->words_xferred)
>> +			wcount = sg_req->req_len - 4;
>> +
>> +	} else if (wcount < sg_req->words_xferred) {
>> +		/*
>> +		 * This case shall not ever happen because EOC bit
>> +		 * must be set once next cyclic transfer is started.
> 
> Should this still be cyclic here?

Do you mean the "comment" by "here"?

It will be absolutely terrible if this case happens for a oneshot transfer; in
that case, assume the kernel/hardware is on fire.
Jon Hunter July 4, 2019, 7:10 a.m. UTC | #3
On 03/07/2019 18:00, Dmitry Osipenko wrote:
> 03.07.2019 19:37, Jon Hunter пишет:
>>
>> On 03/07/2019 02:28, Dmitry Osipenko wrote:
>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>> of data, hence it can report transfer's residual with more fidelity which
>>> may be required in cases like audio playback. In particular this fixes
>>> audio stuttering during playback in a chromium web browser. The patch is
>>> based on the original work that was made by Ben Dooks and a patch from
>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>
>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>> Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>> ---
>>>
>>> Changelog:
>>>
>>> v4: The words_xferred is now also reset on a new iteration of a cyclic
>>>     transfer by ISR, so that dmaengine_tx_status() won't produce a
>>>     misleading warning splat on TX status re-checking after a cycle
>>>     completion when cyclic transfer consists of a single SG.
>>>
>>> v3: Added workaround for a hardware design shortcoming that results
>>>     in a words counter wraparound before end-of-transfer bit is set
>>>     in a cyclic mode.
>>>
>>> v2: Addressed review comments made by Jon Hunter to v1. We won't try
>>>     to get words count if dma_desc is on free list as it will result
>>>     in a NULL dereference because this case wasn't handled properly.
>>>
>>>     The residual value is now updated properly, avoiding potential
>>>     integer overflow by adding the "bytes" to the "bytes_transferred"
>>>     instead of the subtraction.
>>>
>>>  drivers/dma/tegra20-apb-dma.c | 72 +++++++++++++++++++++++++++++++----
>>>  1 file changed, 65 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>> index 79e9593815f1..148d136191d7 100644
>>> --- a/drivers/dma/tegra20-apb-dma.c
>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>  	bool				last_sg;
>>>  	struct list_head		node;
>>>  	struct tegra_dma_desc		*dma_desc;
>>> +	unsigned int			words_xferred;
>>>  };
>>>  
>>>  /*
>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>  	nsg_req->configured = true;
>>> +	nsg_req->words_xferred = 0;
>>>  
>>>  	tegra_dma_resume(tdc);
>>>  }
>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>  					typeof(*sg_req), node);
>>>  	tegra_dma_start(tdc, sg_req);
>>>  	sg_req->configured = true;
>>> +	sg_req->words_xferred = 0;
>>>  	tdc->busy = true;
>>>  }
>>>  
>>> @@ -638,6 +641,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>>>  		list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
>>>  	dma_desc->cb_count++;
>>>  
>>> +	sgreq->words_xferred = 0;
>>> +
>>>  	/* If not last req then put at end of pending list */
>>>  	if (!list_is_last(&sgreq->node, &tdc->pending_sg_req)) {
>>>  		list_move_tail(&sgreq->node, &tdc->pending_sg_req);
>>> @@ -797,6 +802,62 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>  	return 0;
>>>  }
>>>  
>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>> +					       struct tegra_dma_sg_req *sg_req)
>>> +{
>>> +	unsigned long status, wcount = 0;
>>> +
>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>> +		return 0;
>>> +
>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>> +
>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>> +
>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = status;
>>> +
>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>> +		return sg_req->req_len;
>>> +
>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>> +
>>> +	if (!wcount) {
>>> +		/*
>>> +		 * If wcount wasn't ever polled for this SG before, then
>>> +		 * simply assume that transfer hasn't started yet.
>>> +		 *
>>> +		 * Otherwise it's the end of the transfer.
>>> +		 *
>>> +		 * The alternative would be to poll the status register
>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>> +		 * because EOC bit is getting set only after the last
>>> +		 * burst's completion and counter is less than the actual
>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>> +		 * distinguish start of transfer from its end.
>>> +		 */
>>> +		if (sg_req->words_xferred)
>>> +			wcount = sg_req->req_len - 4;
>>> +
>>> +	} else if (wcount < sg_req->words_xferred) {
>>> +		/*
>>> +		 * This case shall not ever happen because EOC bit
>>> +		 * must be set once next cyclic transfer is started.
>>
>> Should this still be cyclic here?
> 
> Do you mean the "comment" by "here"?
> 
> It will be absolutely terrible if this case happens for oneshot transfer, assume
> kernel/hardware is on fire.

Or more likely a SW bug :-)

Yes, it should never happen for either sg or cyclic, but there is no mention
of sg transfers. Maybe the sg case is more obvious, but in general this
case should never happen for any transfer.

Jon
Dmitry Osipenko July 4, 2019, 10:49 a.m. UTC | #4
04.07.2019 10:10, Jon Hunter пишет:
> 
> On 03/07/2019 18:00, Dmitry Osipenko wrote:
>> 03.07.2019 19:37, Jon Hunter пишет:
>>>
>>> On 03/07/2019 02:28, Dmitry Osipenko wrote:
>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>> of data, hence it can report transfer's residual with more fidelity which
>>>> may be required in cases like audio playback. In particular this fixes
>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>> based on the original work that was made by Ben Dooks and a patch from
>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>
>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>> Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
>>>>
>>>> Changelog:
>>>>
>>>> v4: The words_xferred is now also reset on a new iteration of a cyclic
>>>>     transfer by ISR, so that dmaengine_tx_status() won't produce a
>>>>     misleading warning splat on TX status re-checking after a cycle
>>>>     completion when cyclic transfer consists of a single SG.
>>>>
>>>> v3: Added workaround for a hardware design shortcoming that results
>>>>     in a words counter wraparound before end-of-transfer bit is set
>>>>     in a cyclic mode.
>>>>
>>>> v2: Addressed review comments made by Jon Hunter to v1. We won't try
>>>>     to get words count if dma_desc is on free list as it will result
>>>>     in a NULL dereference because this case wasn't handled properly.
>>>>
>>>>     The residual value is now updated properly, avoiding potential
>>>>     integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>     instead of the subtraction.
>>>>
>>>>  drivers/dma/tegra20-apb-dma.c | 72 +++++++++++++++++++++++++++++++----
>>>>  1 file changed, 65 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>> index 79e9593815f1..148d136191d7 100644
>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>  	bool				last_sg;
>>>>  	struct list_head		node;
>>>>  	struct tegra_dma_desc		*dma_desc;
>>>> +	unsigned int			words_xferred;
>>>>  };
>>>>  
>>>>  /*
>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>  	nsg_req->configured = true;
>>>> +	nsg_req->words_xferred = 0;
>>>>  
>>>>  	tegra_dma_resume(tdc);
>>>>  }
>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>  					typeof(*sg_req), node);
>>>>  	tegra_dma_start(tdc, sg_req);
>>>>  	sg_req->configured = true;
>>>> +	sg_req->words_xferred = 0;
>>>>  	tdc->busy = true;
>>>>  }
>>>>  
>>>> @@ -638,6 +641,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>>>>  		list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
>>>>  	dma_desc->cb_count++;
>>>>  
>>>> +	sgreq->words_xferred = 0;
>>>> +
>>>>  	/* If not last req then put at end of pending list */
>>>>  	if (!list_is_last(&sgreq->node, &tdc->pending_sg_req)) {
>>>>  		list_move_tail(&sgreq->node, &tdc->pending_sg_req);
>>>> @@ -797,6 +802,62 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>> +{
>>>> +	unsigned long status, wcount = 0;
>>>> +
>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>> +		return 0;
>>>> +
>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>> +
>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>> +
>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = status;
>>>> +
>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>> +		return sg_req->req_len;
>>>> +
>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>> +
>>>> +	if (!wcount) {
>>>> +		/*
>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>> +		 * simply assume that transfer hasn't started yet.
>>>> +		 *
>>>> +		 * Otherwise it's the end of the transfer.
>>>> +		 *
>>>> +		 * The alternative would be to poll the status register
>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>> +		 * because EOC bit is getting set only after the last
>>>> +		 * burst's completion and counter is less than the actual
>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>> +		 * distinguish start of transfer from its end.
>>>> +		 */
>>>> +		if (sg_req->words_xferred)
>>>> +			wcount = sg_req->req_len - 4;
>>>> +
>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>> +		/*
>>>> +		 * This case shall not ever happen because EOC bit
>>>> +		 * must be set once next cyclic transfer is started.
>>>
>>> Should this still be cyclic here?
>>
>> Do you mean the "comment" by "here"?
>>
>> It will be absolutely terrible if this case happens for oneshot transfer, assume
>> kernel/hardware is on fire.
> 
> Or more likely a SW bug :-)
> 
> Yes should never happen for either sg or cyclic, but there is no mention
> of sg transfers. Maybe the sg case is more obvious but in general this
> case should never happen for any transfer.

Alright, so what is the change you are proposing? Or is it fine now?

I can certainly change the comment's wording, just please tell me what you want it to be.

/*
 * This case shall not ever happen because EOC bit
 * must be set once transfer is actually finished.

Does this sound better?
Jon Hunter July 4, 2019, 12:08 p.m. UTC | #5
On 04/07/2019 11:49, Dmitry Osipenko wrote:
> 04.07.2019 10:10, Jon Hunter пишет:
>>
>> On 03/07/2019 18:00, Dmitry Osipenko wrote:
>>> 03.07.2019 19:37, Jon Hunter пишет:
>>>>
>>>> On 03/07/2019 02:28, Dmitry Osipenko wrote:
>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>> may be required in cases like audio playback. In particular this fixes
>>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>>> based on the original work that was made by Ben Dooks and a patch from
>>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>>
>>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>> Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>> ---
>>>>>
>>>>> Changelog:
>>>>>
>>>>> v4: The words_xferred is now also reset on a new iteration of a cyclic
>>>>>     transfer by ISR, so that dmaengine_tx_status() won't produce a
>>>>>     misleading warning splat on TX status re-checking after a cycle
>>>>>     completion when cyclic transfer consists of a single SG.
>>>>>
>>>>> v3: Added workaround for a hardware design shortcoming that results
>>>>>     in a words counter wraparound before end-of-transfer bit is set
>>>>>     in a cyclic mode.
>>>>>
>>>>> v2: Addressed review comments made by Jon Hunter to v1. We won't try
>>>>>     to get words count if dma_desc is on free list as it will result
>>>>>     in a NULL dereference because this case wasn't handled properly.
>>>>>
>>>>>     The residual value is now updated properly, avoiding potential
>>>>>     integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>>     instead of the subtraction.
>>>>>
>>>>>  drivers/dma/tegra20-apb-dma.c | 72 +++++++++++++++++++++++++++++++----
>>>>>  1 file changed, 65 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>> index 79e9593815f1..148d136191d7 100644
>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>>  	bool				last_sg;
>>>>>  	struct list_head		node;
>>>>>  	struct tegra_dma_desc		*dma_desc;
>>>>> +	unsigned int			words_xferred;
>>>>>  };
>>>>>  
>>>>>  /*
>>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>>  	nsg_req->configured = true;
>>>>> +	nsg_req->words_xferred = 0;
>>>>>  
>>>>>  	tegra_dma_resume(tdc);
>>>>>  }
>>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>>  					typeof(*sg_req), node);
>>>>>  	tegra_dma_start(tdc, sg_req);
>>>>>  	sg_req->configured = true;
>>>>> +	sg_req->words_xferred = 0;
>>>>>  	tdc->busy = true;
>>>>>  }
>>>>>  
>>>>> @@ -638,6 +641,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>>>>>  		list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
>>>>>  	dma_desc->cb_count++;
>>>>>  
>>>>> +	sgreq->words_xferred = 0;
>>>>> +
>>>>>  	/* If not last req then put at end of pending list */
>>>>>  	if (!list_is_last(&sgreq->node, &tdc->pending_sg_req)) {
>>>>>  		list_move_tail(&sgreq->node, &tdc->pending_sg_req);
>>>>> @@ -797,6 +802,62 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>  	return 0;
>>>>>  }
>>>>>  
>>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>>> +{
>>>>> +	unsigned long status, wcount = 0;
>>>>> +
>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>> +		return 0;
>>>>> +
>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>> +
>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>> +
>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = status;
>>>>> +
>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>> +		return sg_req->req_len;
>>>>> +
>>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>>> +
>>>>> +	if (!wcount) {
>>>>> +		/*
>>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>>> +		 * simply assume that transfer hasn't started yet.
>>>>> +		 *
>>>>> +		 * Otherwise it's the end of the transfer.
>>>>> +		 *
>>>>> +		 * The alternative would be to poll the status register
>>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>>> +		 * because EOC bit is getting set only after the last
>>>>> +		 * burst's completion and counter is less than the actual
>>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>>> +		 * distinguish start of transfer from its end.
>>>>> +		 */
>>>>> +		if (sg_req->words_xferred)
>>>>> +			wcount = sg_req->req_len - 4;
>>>>> +
>>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>>> +		/*
>>>>> +		 * This case shall not ever happen because EOC bit
>>>>> +		 * must be set once next cyclic transfer is started.
>>>>
>>>> Should this still be cyclic here?
>>>
>>> Do you mean the "comment" by "here"?
>>>
>>> It will be absolutely terrible if this case happens for oneshot transfer, assume
>>> kernel/hardware is on fire.
>>
>> Or more likely a SW bug :-)
>>
>> Yes should never happen for either sg or cyclic, but there is no mention
>> of sg transfers. Maybe the sg case is more obvious but in general this
>> case should never happen for any transfer.
> 
> Alright, so what the change you are proposing? Or is it fine now?
> 
> I can certainly change the comment's wording, just please tell me what you want it to be.
> 
> /*
>  * This case shall not ever happen because EOC bit
>  * must be set once transfer is actually finished.
> 
> Does this sound better?

I think it would be good to say ...

This case will never happen for a non-cyclic transfer. For a cyclic
transfer, although it is possible for the next transfer to have already
started (resetting the word count), this case should still not happen
because we should have detected that the EOC bit is set and hence the
transfer was completed.

At least that should remind me of what we are doing here in the future :-)

Jon
Dmitry Osipenko July 4, 2019, 12:35 p.m. UTC | #6
04.07.2019 15:08, Jon Hunter пишет:
> 
> On 04/07/2019 11:49, Dmitry Osipenko wrote:
>> 04.07.2019 10:10, Jon Hunter пишет:
>>>
>>> On 03/07/2019 18:00, Dmitry Osipenko wrote:
>>>> 03.07.2019 19:37, Jon Hunter пишет:
>>>>>
>>>>> On 03/07/2019 02:28, Dmitry Osipenko wrote:
>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>>>> based on the original work that was made by Ben Dooks and a patch from
>>>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>>>
>>>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>> Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>> ---
>>>>>>
>>>>>> Changelog:
>>>>>>
>>>>>> v4: The words_xferred is now also reset on a new iteration of a cyclic
>>>>>>     transfer by ISR, so that dmaengine_tx_status() won't produce a
>>>>>>     misleading warning splat on TX status re-checking after a cycle
>>>>>>     completion when cyclic transfer consists of a single SG.
>>>>>>
>>>>>> v3: Added workaround for a hardware design shortcoming that results
>>>>>>     in a words counter wraparound before end-of-transfer bit is set
>>>>>>     in a cyclic mode.
>>>>>>
>>>>>> v2: Addressed review comments made by Jon Hunter to v1. We won't try
>>>>>>     to get words count if dma_desc is on free list as it will result
>>>>>>     in a NULL dereference because this case wasn't handled properly.
>>>>>>
>>>>>>     The residual value is now updated properly, avoiding potential
>>>>>>     integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>>>     instead of the subtraction.
>>>>>>
>>>>>>  drivers/dma/tegra20-apb-dma.c | 72 +++++++++++++++++++++++++++++++----
>>>>>>  1 file changed, 65 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>> index 79e9593815f1..148d136191d7 100644
>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>>>  	bool				last_sg;
>>>>>>  	struct list_head		node;
>>>>>>  	struct tegra_dma_desc		*dma_desc;
>>>>>> +	unsigned int			words_xferred;
>>>>>>  };
>>>>>>  
>>>>>>  /*
>>>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>>>  	nsg_req->configured = true;
>>>>>> +	nsg_req->words_xferred = 0;
>>>>>>  
>>>>>>  	tegra_dma_resume(tdc);
>>>>>>  }
>>>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>>>  					typeof(*sg_req), node);
>>>>>>  	tegra_dma_start(tdc, sg_req);
>>>>>>  	sg_req->configured = true;
>>>>>> +	sg_req->words_xferred = 0;
>>>>>>  	tdc->busy = true;
>>>>>>  }
>>>>>>  
>>>>>> @@ -638,6 +641,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>>>>>>  		list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
>>>>>>  	dma_desc->cb_count++;
>>>>>>  
>>>>>> +	sgreq->words_xferred = 0;
>>>>>> +
>>>>>>  	/* If not last req then put at end of pending list */
>>>>>>  	if (!list_is_last(&sgreq->node, &tdc->pending_sg_req)) {
>>>>>>  		list_move_tail(&sgreq->node, &tdc->pending_sg_req);
>>>>>> @@ -797,6 +802,62 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>  	return 0;
>>>>>>  }
>>>>>>  
>>>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>>>> +{
>>>>>> +	unsigned long status, wcount = 0;
>>>>>> +
>>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>> +		return 0;
>>>>>> +
>>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>> +
>>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>> +
>>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = status;
>>>>>> +
>>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>> +		return sg_req->req_len;
>>>>>> +
>>>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>>>> +
>>>>>> +	if (!wcount) {
>>>>>> +		/*
>>>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>>>> +		 * simply assume that transfer hasn't started yet.
>>>>>> +		 *
>>>>>> +		 * Otherwise it's the end of the transfer.
>>>>>> +		 *
>>>>>> +		 * The alternative would be to poll the status register
>>>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>>>> +		 * because EOC bit is getting set only after the last
>>>>>> +		 * burst's completion and counter is less than the actual
>>>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>>>> +		 * distinguish start of transfer from its end.
>>>>>> +		 */
>>>>>> +		if (sg_req->words_xferred)
>>>>>> +			wcount = sg_req->req_len - 4;
>>>>>> +
>>>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>>>> +		/*
>>>>>> +		 * This case shall not ever happen because EOC bit
>>>>>> +		 * must be set once next cyclic transfer is started.
>>>>>
>>>>> Should this still be cyclic here?
>>>>
>>>> Do you mean the "comment" by "here"?
>>>>
>>>> It will be absolutely terrible if this case happens for oneshot transfer, assume
>>>> kernel/hardware is on fire.
>>>
>>> Or more likely a SW bug :-)
>>>
>>> Yes should never happen for either sg or cyclic, but there is no mention
>>> of sg transfers. Maybe the sg case is more obvious but in general this
>>> case should never happen for any transfer.
>>
>> Alright, so what is the change you are proposing? Or is it fine now?
>>
>> I can certainly change the comment's wording, just please tell me what you want it to be.
>>
>> /*
>>  * This case shall not ever happen because EOC bit
>>  * must be set once transfer is actually finished.
>>
>> Does this sound better?
> 
> I think it would be good to say ...
> 
> This case will never happen for a non-cyclic transfer. For a cyclic
> transfer, although it is possible for the next transfer to have already
> started (resetting the word count), this case should still not happen
> because we should have detected that the EOC bit is set and hence the
> transfer was completed.
> 
> At least that should remind me of what we are doing here in the future :-)

Okay, sounds good to me. Will make a v5!
diff mbox series

Patch

diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
index 79e9593815f1..148d136191d7 100644
--- a/drivers/dma/tegra20-apb-dma.c
+++ b/drivers/dma/tegra20-apb-dma.c
@@ -152,6 +152,7 @@  struct tegra_dma_sg_req {
 	bool				last_sg;
 	struct list_head		node;
 	struct tegra_dma_desc		*dma_desc;
+	unsigned int			words_xferred;
 };
 
 /*
@@ -496,6 +497,7 @@  static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
 	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
 				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
 	nsg_req->configured = true;
+	nsg_req->words_xferred = 0;
 
 	tegra_dma_resume(tdc);
 }
@@ -511,6 +513,7 @@  static void tdc_start_head_req(struct tegra_dma_channel *tdc)
 					typeof(*sg_req), node);
 	tegra_dma_start(tdc, sg_req);
 	sg_req->configured = true;
+	sg_req->words_xferred = 0;
 	tdc->busy = true;
 }
 
@@ -638,6 +641,8 @@  static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
 		list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
 	dma_desc->cb_count++;
 
+	sgreq->words_xferred = 0;
+
 	/* If not last req then put at end of pending list */
 	if (!list_is_last(&sgreq->node, &tdc->pending_sg_req)) {
 		list_move_tail(&sgreq->node, &tdc->pending_sg_req);
@@ -797,6 +802,62 @@  static int tegra_dma_terminate_all(struct dma_chan *dc)
 	return 0;
 }
 
+static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
+					       struct tegra_dma_sg_req *sg_req)
+{
+	unsigned long status, wcount = 0;
+
+	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
+		return 0;
+
+	if (tdc->tdma->chip_data->support_separate_wcount_reg)
+		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
+
+	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
+
+	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
+		wcount = status;
+
+	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
+		return sg_req->req_len;
+
+	wcount = get_current_xferred_count(tdc, sg_req, wcount);
+
+	if (!wcount) {
+		/*
+		 * If wcount wasn't ever polled for this SG before, then
+		 * simply assume that transfer hasn't started yet.
+		 *
+		 * Otherwise it's the end of the transfer.
+		 *
+		 * The alternative would be to poll the status register
+		 * until EOC bit is set or wcount goes UP. That's so
+		 * because EOC bit is getting set only after the last
+		 * burst's completion and counter is less than the actual
+		 * transfer size by 4 bytes. The counter value wraps around
+		 * in a cyclic mode before EOC is set(!), so we can't easily
+		 * distinguish start of transfer from its end.
+		 */
+		if (sg_req->words_xferred)
+			wcount = sg_req->req_len - 4;
+
+	} else if (wcount < sg_req->words_xferred) {
+		/*
+		 * This case shall not ever happen because EOC bit
+		 * must be set once next cyclic transfer is started.
+		 * Assume that hardware is malfunctioning or there is
+		 * a software bug.
+		 */
+		WARN_ON_ONCE(1);
+
+		wcount = sg_req->req_len - 4;
+	} else {
+		sg_req->words_xferred = wcount;
+	}
+
+	return wcount;
+}
+
 static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 	dma_cookie_t cookie, struct dma_tx_state *txstate)
 {
@@ -806,6 +867,7 @@  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 	enum dma_status ret;
 	unsigned long flags;
 	unsigned int residual;
+	unsigned int bytes = 0;
 
 	ret = dma_cookie_status(dc, cookie, txstate);
 	if (ret == DMA_COMPLETE)
@@ -825,6 +887,7 @@  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 	list_for_each_entry(sg_req, &tdc->pending_sg_req, node) {
 		dma_desc = sg_req->dma_desc;
 		if (dma_desc->txd.cookie == cookie) {
+			bytes = tegra_dma_sg_bytes_xferred(tdc, sg_req);
 			ret = dma_desc->dma_status;
 			goto found;
 		}
@@ -836,7 +899,7 @@  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 found:
 	if (dma_desc && txstate) {
 		residual = dma_desc->bytes_requested -
-			   (dma_desc->bytes_transferred %
+			   ((dma_desc->bytes_transferred + bytes) %
 			    dma_desc->bytes_requested);
 		dma_set_residue(txstate, residual);
 	}
@@ -1441,12 +1504,7 @@  static int tegra_dma_probe(struct platform_device *pdev)
 		BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) |
 		BIT(DMA_SLAVE_BUSWIDTH_8_BYTES);
 	tdma->dma_dev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
-	/*
-	 * XXX The hardware appears to support
-	 * DMA_RESIDUE_GRANULARITY_BURST-level reporting, but it's
-	 * only used by this driver during tegra_dma_terminate_all()
-	 */
-	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT;
+	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
 	tdma->dma_dev.device_config = tegra_dma_slave_config;
 	tdma->dma_dev.device_terminate_all = tegra_dma_terminate_all;
 	tdma->dma_dev.device_tx_status = tegra_dma_tx_status;