diff mbox series

[v1] dmaengine: tegra-apb: Support per-burst residue granularity

Message ID 20190613210849.10382-1-digetx@gmail.com (mailing list archive)
State Changes Requested
Headers show
Series [v1] dmaengine: tegra-apb: Support per-burst residue granularity | expand

Commit Message

Dmitry Osipenko June 13, 2019, 9:08 p.m. UTC
Tegra's APB DMA engine updates words counter after each transferred burst
of data, hence it can report transfer's residual with more fidelity which
may be required in cases like audio playback. In particular this fixes
audio stuttering during playback in a chromiuim web browser. The patch is
based on the original work that was made by Ben Dooks [1]. It was tested
on Tegra20 and Tegra30 devices.

[1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/

Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

Comments

Jon Hunter June 14, 2019, 3:21 p.m. UTC | #1
On 13/06/2019 22:08, Dmitry Osipenko wrote:
> Tegra's APB DMA engine updates words counter after each transferred burst
> of data, hence it can report transfer's residual with more fidelity which
> may be required in cases like audio playback. In particular this fixes
> audio stuttering during playback in a chromiuim web browser. The patch is
> based on the original work that was made by Ben Dooks [1]. It was tested
> on Tegra20 and Tegra30 devices.
> 
> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
> 
> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
>  drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>  1 file changed, 28 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
> index 79e9593815f1..c5af8f703548 100644
> --- a/drivers/dma/tegra20-apb-dma.c
> +++ b/drivers/dma/tegra20-apb-dma.c
> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>  	return 0;
>  }
>  
> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
> +					      struct tegra_dma_sg_req *sg_req,
> +					      struct tegra_dma_desc *dma_desc,
> +					      unsigned int residual)
> +{
> +	unsigned long status, wcount = 0;
> +
> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
> +		return residual;
> +
> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
> +
> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
> +
> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = status;
> +
> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
> +		return residual - sg_req->req_len;
> +
> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
> +}
> +
>  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>  	dma_cookie_t cookie, struct dma_tx_state *txstate)
>  {
>  	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
> +	struct tegra_dma_sg_req *sg_req = NULL;
>  	struct tegra_dma_desc *dma_desc;
> -	struct tegra_dma_sg_req *sg_req;
>  	enum dma_status ret;
>  	unsigned long flags;
>  	unsigned int residual;
> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>  		residual = dma_desc->bytes_requested -
>  			   (dma_desc->bytes_transferred %
>  			    dma_desc->bytes_requested);
> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
> +						     residual);

I had a quick look at this, I am not sure that we want to call
tegra_dma_update_residual() here for cases where the dma_desc is on the
free_dma_desc list. In fact, couldn't this be simplified a bit for case
where the dma_desc is on the free list? In that case I believe that the
residual should always be 0.

Cheers
Jon
Jon Hunter June 14, 2019, 3:24 p.m. UTC | #2
On 14/06/2019 16:21, Jon Hunter wrote:
> 
> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>> Tegra's APB DMA engine updates words counter after each transferred burst
>> of data, hence it can report transfer's residual with more fidelity which
>> may be required in cases like audio playback. In particular this fixes
>> audio stuttering during playback in a chromiuim web browser. The patch is
>> based on the original work that was made by Ben Dooks [1]. It was tested
>> on Tegra20 and Tegra30 devices.
>>
>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>
>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>  drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>  1 file changed, 28 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>> index 79e9593815f1..c5af8f703548 100644
>> --- a/drivers/dma/tegra20-apb-dma.c
>> +++ b/drivers/dma/tegra20-apb-dma.c
>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>  	return 0;
>>  }
>>  
>> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>> +					      struct tegra_dma_sg_req *sg_req,
>> +					      struct tegra_dma_desc *dma_desc,
>> +					      unsigned int residual)
>> +{
>> +	unsigned long status, wcount = 0;
>> +
>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>> +		return residual;
>> +
>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>> +
>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>> +
>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = status;
>> +
>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>> +		return residual - sg_req->req_len;
>> +
>> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
>> +}
>> +
>>  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>  	dma_cookie_t cookie, struct dma_tx_state *txstate)
>>  {
>>  	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
>> +	struct tegra_dma_sg_req *sg_req = NULL;
>>  	struct tegra_dma_desc *dma_desc;
>> -	struct tegra_dma_sg_req *sg_req;
>>  	enum dma_status ret;
>>  	unsigned long flags;
>>  	unsigned int residual;
>> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>  		residual = dma_desc->bytes_requested -
>>  			   (dma_desc->bytes_transferred %
>>  			    dma_desc->bytes_requested);
>> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
>> +						     residual);
> 
> I had a quick look at this, I am not sure that we want to call
> tegra_dma_update_residual() here for cases where the dma_desc is on the
> free_dma_desc list. In fact, couldn't this be simplified a bit for case
> where the dma_desc is on the free list? In that case I believe that the
> residual should always be 0.

Actually, no, it could be non-zero in the case the transfer is aborted.

Jon
Dmitry Osipenko June 14, 2019, 4:44 p.m. UTC | #3
14.06.2019 18:24, Jon Hunter пишет:
> 
> On 14/06/2019 16:21, Jon Hunter wrote:
>>
>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>> of data, hence it can report transfer's residual with more fidelity which
>>> may be required in cases like audio playback. In particular this fixes
>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>> on Tegra20 and Tegra30 devices.
>>>
>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>
>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>> ---
>>>  drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>  1 file changed, 28 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>> index 79e9593815f1..c5af8f703548 100644
>>> --- a/drivers/dma/tegra20-apb-dma.c
>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>  	return 0;
>>>  }
>>>  
>>> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>> +					      struct tegra_dma_sg_req *sg_req,
>>> +					      struct tegra_dma_desc *dma_desc,
>>> +					      unsigned int residual)
>>> +{
>>> +	unsigned long status, wcount = 0;
>>> +
>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>> +		return residual;
>>> +
>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>> +
>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>> +
>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = status;
>>> +
>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>> +		return residual - sg_req->req_len;
>>> +
>>> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>> +}
>>> +
>>>  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>  	dma_cookie_t cookie, struct dma_tx_state *txstate)
>>>  {
>>>  	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
>>> +	struct tegra_dma_sg_req *sg_req = NULL;
>>>  	struct tegra_dma_desc *dma_desc;
>>> -	struct tegra_dma_sg_req *sg_req;
>>>  	enum dma_status ret;
>>>  	unsigned long flags;
>>>  	unsigned int residual;
>>> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>  		residual = dma_desc->bytes_requested -
>>>  			   (dma_desc->bytes_transferred %
>>>  			    dma_desc->bytes_requested);
>>> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
>>> +						     residual);
>>
>> I had a quick look at this, I am not sure that we want to call
>> tegra_dma_update_residual() here for cases where the dma_desc is on the
>> free_dma_desc list. In fact, couldn't this be simplified a bit for case
>> where the dma_desc is on the free list? In that case I believe that the
>> residual should always be 0.
> 
> Actually, no, it could be non-zero in the case the transfer is aborted.

Looks like everything should be fine as-is.

BTW, it's a bit hard to believe that there is any real benefit from the
free_dma_desc list at all, maybe worth to just remove it?
Jon Hunter June 17, 2019, 10:57 a.m. UTC | #4
On 14/06/2019 17:44, Dmitry Osipenko wrote:
> 14.06.2019 18:24, Jon Hunter пишет:
>>
>> On 14/06/2019 16:21, Jon Hunter wrote:
>>>
>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>> of data, hence it can report transfer's residual with more fidelity which
>>>> may be required in cases like audio playback. In particular this fixes
>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>> on Tegra20 and Tegra30 devices.
>>>>
>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>
>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
>>>>  drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>  1 file changed, 28 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>> index 79e9593815f1..c5af8f703548 100644
>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>> +					      struct tegra_dma_sg_req *sg_req,
>>>> +					      struct tegra_dma_desc *dma_desc,
>>>> +					      unsigned int residual)
>>>> +{
>>>> +	unsigned long status, wcount = 0;
>>>> +
>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>> +		return residual;
>>>> +
>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>> +
>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>> +
>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = status;
>>>> +
>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>> +		return residual - sg_req->req_len;
>>>> +
>>>> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>> +}
>>>> +
>>>>  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>  	dma_cookie_t cookie, struct dma_tx_state *txstate)
>>>>  {
>>>>  	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
>>>> +	struct tegra_dma_sg_req *sg_req = NULL;
>>>>  	struct tegra_dma_desc *dma_desc;
>>>> -	struct tegra_dma_sg_req *sg_req;
>>>>  	enum dma_status ret;
>>>>  	unsigned long flags;
>>>>  	unsigned int residual;
>>>> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>  		residual = dma_desc->bytes_requested -
>>>>  			   (dma_desc->bytes_transferred %
>>>>  			    dma_desc->bytes_requested);
>>>> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
>>>> +						     residual);
>>>
>>> I had a quick look at this, I am not sure that we want to call
>>> tegra_dma_update_residual() here for cases where the dma_desc is on the
>>> free_dma_desc list. In fact, couldn't this be simplified a bit for case
>>> where the dma_desc is on the free list? In that case I believe that the
>>> residual should always be 0.
>>
>> Actually, no, it could be non-zero in the case the transfer is aborted.
> 
> Looks like everything should be fine as-is.

I am still not sure we want to call this for the case where dma_desc is
on the free list.

> BTW, it's a bit hard to believe that there is any real benefit from the
> free_dma_desc list at all, maybe worth to just remove it?

I think you need to elaborate a bit more here. I am not a massive fan of
this driver, but I am also not in the mood for changing unless there is
a good reason.

Cheers
Jon
Dmitry Osipenko June 17, 2019, 12:41 p.m. UTC | #5
17.06.2019 13:57, Jon Hunter пишет:
> 
> On 14/06/2019 17:44, Dmitry Osipenko wrote:
>> 14.06.2019 18:24, Jon Hunter пишет:
>>>
>>> On 14/06/2019 16:21, Jon Hunter wrote:
>>>>
>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>> may be required in cases like audio playback. In particular this fixes
>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>> on Tegra20 and Tegra30 devices.
>>>>>
>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>
>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>> ---
>>>>>  drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>  1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>  	return 0;
>>>>>  }
>>>>>  
>>>>> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>> +					      struct tegra_dma_sg_req *sg_req,
>>>>> +					      struct tegra_dma_desc *dma_desc,
>>>>> +					      unsigned int residual)
>>>>> +{
>>>>> +	unsigned long status, wcount = 0;
>>>>> +
>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>> +		return residual;
>>>>> +
>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>> +
>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>> +
>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = status;
>>>>> +
>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>> +		return residual - sg_req->req_len;
>>>>> +
>>>>> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>> +}
>>>>> +
>>>>>  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>>  	dma_cookie_t cookie, struct dma_tx_state *txstate)
>>>>>  {
>>>>>  	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
>>>>> +	struct tegra_dma_sg_req *sg_req = NULL;
>>>>>  	struct tegra_dma_desc *dma_desc;
>>>>> -	struct tegra_dma_sg_req *sg_req;
>>>>>  	enum dma_status ret;
>>>>>  	unsigned long flags;
>>>>>  	unsigned int residual;
>>>>> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>>  		residual = dma_desc->bytes_requested -
>>>>>  			   (dma_desc->bytes_transferred %
>>>>>  			    dma_desc->bytes_requested);
>>>>> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
>>>>> +						     residual);
>>>>
>>>> I had a quick look at this, I am not sure that we want to call
>>>> tegra_dma_update_residual() here for cases where the dma_desc is on the
>>>> free_dma_desc list. In fact, couldn't this be simplified a bit for case
>>>> where the dma_desc is on the free list? In that case I believe that the
>>>> residual should always be 0.
>>>
>>> Actually, no, it could be non-zero in the case the transfer is aborted.
>>
>> Looks like everything should be fine as-is.
> 
> I am still not sure we want to call this for the case where dma_desc is
> on the free list.

You're right! It's a bug there! The sg_req=NULL if dma_desc is on the free list, hence
it will result in a NULL dereference. I'll fix it in v2 and will avoid the offending
call, like you're suggesting.

>> BTW, it's a bit hard to believe that there is any real benefit from the
>> free_dma_desc list at all, maybe worth to just remove it?
> 
> I think you need to elaborate a bit more here. I am not a massive fan of
> this driver, but I am also not in the mood for changing unless there is
> a good reason.

It looks like the whole point of the free list is to have a cache of preallocated
dma_desc's, but dma_desc allocation and initialization doesn't cost anything in
comparison to the free list because memory is allocated from a SLAB cache and then the
initialization will happen on CPU's cache.

So the free list is quite pointless in terms of optimization. Moreover what if driver
allocates a lot of dma_desc's and uses them just once? Looks like it will be quite a
lot of wasted memory on the free list.
Jon Hunter June 18, 2019, 8:47 a.m. UTC | #6
On 17/06/2019 13:41, Dmitry Osipenko wrote:
> 17.06.2019 13:57, Jon Hunter пишет:
>>
>> On 14/06/2019 17:44, Dmitry Osipenko wrote:
>>> 14.06.2019 18:24, Jon Hunter пишет:
>>>>
>>>> On 14/06/2019 16:21, Jon Hunter wrote:
>>>>>
>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>
>>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>
>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>> ---
>>>>>>  drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>>  1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>  	return 0;
>>>>>>  }
>>>>>>  
>>>>>> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>>> +					      struct tegra_dma_sg_req *sg_req,
>>>>>> +					      struct tegra_dma_desc *dma_desc,
>>>>>> +					      unsigned int residual)
>>>>>> +{
>>>>>> +	unsigned long status, wcount = 0;
>>>>>> +
>>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>> +		return residual;
>>>>>> +
>>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>> +
>>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>> +
>>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = status;
>>>>>> +
>>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>> +		return residual - sg_req->req_len;
>>>>>> +
>>>>>> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>> +}
>>>>>> +
>>>>>>  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>>>  	dma_cookie_t cookie, struct dma_tx_state *txstate)
>>>>>>  {
>>>>>>  	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
>>>>>> +	struct tegra_dma_sg_req *sg_req = NULL;
>>>>>>  	struct tegra_dma_desc *dma_desc;
>>>>>> -	struct tegra_dma_sg_req *sg_req;
>>>>>>  	enum dma_status ret;
>>>>>>  	unsigned long flags;
>>>>>>  	unsigned int residual;
>>>>>> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>>>  		residual = dma_desc->bytes_requested -
>>>>>>  			   (dma_desc->bytes_transferred %
>>>>>>  			    dma_desc->bytes_requested);
>>>>>> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
>>>>>> +						     residual);
>>>>>
>>>>> I had a quick look at this, I am not sure that we want to call
>>>>> tegra_dma_update_residual() here for cases where the dma_desc is on the
>>>>> free_dma_desc list. In fact, couldn't this be simplified a bit for case
>>>>> where the dma_desc is on the free list? In that case I believe that the
>>>>> residual should always be 0.
>>>>
>>>> Actually, no, it could be non-zero in the case the transfer is aborted.
>>>
>>> Looks like everything should be fine as-is.
>>
>> I am still not sure we want to call this for the case where dma_desc is
>> on the free list.
> 
> You're right! It's a bug there! The sg_req=NULL if dma_desc is on the free list, hence
> it will result in a NULL dereference. I'll fix it in v2 and will avoid the offending
> call, like you're suggesting.
> 
>>> BTW, it's a bit hard to believe that there is any real benefit from the
>>> free_dma_desc list at all, maybe worth to just remove it?
>>
>> I think you need to elaborate a bit more here. I am not a massive fan of
>> this driver, but I am also not in the mood for changing unless there is
>> a good reason.
> 
> It looks like the whole point of the free list is to have a cache of preallocated
> dma_desc's, but dma_desc allocation and initialization doesn't cost anything in
> comparison to the free list because memory is allocated from a SLAB cache and then the
> initialization will happen on CPU's cache.
> 
> So the free list is quite pointless in terms of optimization. Moreover what if driver
> allocates a lot of dma_desc's and uses them just once? Looks like it will be quite a
> lot of wasted memory on the free list.

Yes indeed and for the ADMA we allocate and free on-demand as you are
suggesting. I don't know why it was done like this, but to make the
change it would be good to get some data about how much memory it is
consuming to see if it is actually worth it.

Cheers
Jon
Dmitry Osipenko June 18, 2019, 9:45 a.m. UTC | #7
18.06.2019 11:47, Jon Hunter пишет:
> 
> On 17/06/2019 13:41, Dmitry Osipenko wrote:
>> 17.06.2019 13:57, Jon Hunter пишет:
>>>
>>> On 14/06/2019 17:44, Dmitry Osipenko wrote:
>>>> 14.06.2019 18:24, Jon Hunter пишет:
>>>>>
>>>>> On 14/06/2019 16:21, Jon Hunter wrote:
>>>>>>
>>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>>
>>>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>>
>>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>>> ---
>>>>>>>  drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>>>  1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>>  	return 0;
>>>>>>>  }
>>>>>>>  
>>>>>>> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>>>> +					      struct tegra_dma_sg_req *sg_req,
>>>>>>> +					      struct tegra_dma_desc *dma_desc,
>>>>>>> +					      unsigned int residual)
>>>>>>> +{
>>>>>>> +	unsigned long status, wcount = 0;
>>>>>>> +
>>>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>>> +		return residual;
>>>>>>> +
>>>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>>> +
>>>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>>> +
>>>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +		wcount = status;
>>>>>>> +
>>>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>>> +		return residual - sg_req->req_len;
>>>>>>> +
>>>>>>> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>>> +}
>>>>>>> +
>>>>>>>  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>>>>  	dma_cookie_t cookie, struct dma_tx_state *txstate)
>>>>>>>  {
>>>>>>>  	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
>>>>>>> +	struct tegra_dma_sg_req *sg_req = NULL;
>>>>>>>  	struct tegra_dma_desc *dma_desc;
>>>>>>> -	struct tegra_dma_sg_req *sg_req;
>>>>>>>  	enum dma_status ret;
>>>>>>>  	unsigned long flags;
>>>>>>>  	unsigned int residual;
>>>>>>> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>>>>>>>  		residual = dma_desc->bytes_requested -
>>>>>>>  			   (dma_desc->bytes_transferred %
>>>>>>>  			    dma_desc->bytes_requested);
>>>>>>> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
>>>>>>> +						     residual);
>>>>>>
>>>>>> I had a quick look at this, I am not sure that we want to call
>>>>>> tegra_dma_update_residual() here for cases where the dma_desc is on the
>>>>>> free_dma_desc list. In fact, couldn't this be simplified a bit for case
>>>>>> where the dma_desc is on the free list? In that case I believe that the
>>>>>> residual should always be 0.
>>>>>
>>>>> Actually, no, it could be non-zero in the case the transfer is aborted.
>>>>
>>>> Looks like everything should be fine as-is.
>>>
>>> I am still not sure we want to call this for the case where dma_desc is
>>> on the free list.
>>
>> You're right! It's a bug there! The sg_req=NULL if dma_desc is on the free list, hence
>> it will result in a NULL dereference. I'll fix it in v2 and will avoid the offending
>> call, like you're suggesting.
>>
>>>> BTW, it's a bit hard to believe that there is any real benefit from the
>>>> free_dma_desc list at all, maybe worth to just remove it?
>>>
>>> I think you need to elaborate a bit more here. I am not a massive fan of
>>> this driver, but I am also not in the mood for changing unless there is
>>> a good reason.
>>
>> It looks like the whole point of the free list is to have a cache of preallocated
>> dma_desc's, but dma_desc allocation and initialization doesn't cost anything in
>> comparison to the free list because memory is allocated from a SLAB cache and then the
>> initialization will happen on CPU's cache.
>>
>> So the free list is quite pointless in terms of optimization. Moreover what if driver
>> allocates a lot of dma_desc's and uses them just once? Looks like it will be quite a
>> lot of wasted memory on the free list.
> 
> Yes indeed and for the ADMA we allocate and free on-demand as you are
> suggesting. I don't know why it was done like this, but to make the
> change it would be good to get some data about how much memory it is
> consuming to see if it is actually worth it.

Yeah, that's something to check in the future.
Ben Dooks June 18, 2019, 10:22 p.m. UTC | #8
On 13/06/2019 22:08, Dmitry Osipenko wrote:
> Tegra's APB DMA engine updates words counter after each transferred burst
> of data, hence it can report transfer's residual with more fidelity which
> may be required in cases like audio playback. In particular this fixes
> audio stuttering during playback in a chromiuim web browser. The patch is
> based on the original work that was made by Ben Dooks [1]. It was tested
> on Tegra20 and Tegra30 devices.
> 
> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
> 
> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>   1 file changed, 28 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
> index 79e9593815f1..c5af8f703548 100644
> --- a/drivers/dma/tegra20-apb-dma.c
> +++ b/drivers/dma/tegra20-apb-dma.c
> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>   	return 0;
>   }
>   
> +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
> +					      struct tegra_dma_sg_req *sg_req,
> +					      struct tegra_dma_desc *dma_desc,
> +					      unsigned int residual)
> +{
> +	unsigned long status, wcount = 0;
> +
> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
> +		return residual;
> +
> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
> +
> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
> +
> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = status;
> +
> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
> +		return residual - sg_req->req_len;
> +
> +	return residual - get_current_xferred_count(tdc, sg_req, wcount);
> +}

I am unfortunately nowhere near my notes, so can't completely
review this. I think the complexity of my patch series is due
to an issue with the count being updated before the EOC IRQ
is actually flagged (and most definetly before it gets to the
CPU IRQ handler).

The test system I was using, which i've not really got any
access to at the moment would show these internal inconsistent
states every few hours, however it was moving 48kHz 8ch 16bit
TDM data.

Thanks for looking into this, I am not sure if I am going to
get any time to look into this within the next couple of
months.

>   static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>   	dma_cookie_t cookie, struct dma_tx_state *txstate)
>   {
>   	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
> +	struct tegra_dma_sg_req *sg_req = NULL;
>   	struct tegra_dma_desc *dma_desc;
> -	struct tegra_dma_sg_req *sg_req;
>   	enum dma_status ret;
>   	unsigned long flags;
>   	unsigned int residual;
> @@ -838,6 +862,8 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
>   		residual = dma_desc->bytes_requested -
>   			   (dma_desc->bytes_transferred %
>   			    dma_desc->bytes_requested);
> +		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
> +						     residual);
>   		dma_set_residue(txstate, residual);
>   	}
>   
> @@ -1441,12 +1467,7 @@ static int tegra_dma_probe(struct platform_device *pdev)
>   		BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) |
>   		BIT(DMA_SLAVE_BUSWIDTH_8_BYTES);
>   	tdma->dma_dev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
> -	/*
> -	 * XXX The hardware appears to support
> -	 * DMA_RESIDUE_GRANULARITY_BURST-level reporting, but it's
> -	 * only used by this driver during tegra_dma_terminate_all()
> -	 */
> -	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT;
> +	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
>   	tdma->dma_dev.device_config = tegra_dma_slave_config;
>   	tdma->dma_dev.device_terminate_all = tegra_dma_terminate_all;
>   	tdma->dma_dev.device_tx_status = tegra_dma_tx_status;
>
Dmitry Osipenko June 18, 2019, 11:27 p.m. UTC | #9
19.06.2019 1:22, Ben Dooks пишет:
> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>> Tegra's APB DMA engine updates words counter after each transferred burst
>> of data, hence it can report transfer's residual with more fidelity which
>> may be required in cases like audio playback. In particular this fixes
>> audio stuttering during playback in a chromiuim web browser. The patch is
>> based on the original work that was made by Ben Dooks [1]. It was tested
>> on Tegra20 and Tegra30 devices.
>>
>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>
>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>> index 79e9593815f1..c5af8f703548 100644
>> --- a/drivers/dma/tegra20-apb-dma.c
>> +++ b/drivers/dma/tegra20-apb-dma.c
>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>       return 0;
>>   }
>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>> +                          struct tegra_dma_sg_req *sg_req,
>> +                          struct tegra_dma_desc *dma_desc,
>> +                          unsigned int residual)
>> +{
>> +    unsigned long status, wcount = 0;
>> +
>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>> +        return residual;
>> +
>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>> +
>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>> +
>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>> +        wcount = status;
>> +
>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>> +        return residual - sg_req->req_len;
>> +
>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>> +}
> 
> I am unfortunately nowhere near my notes, so can't completely
> review this. I think the complexity of my patch series is due
> to an issue with the count being updated before the EOC IRQ
> is actually flagged (and most definetly before it gets to the
> CPU IRQ handler).
> 
> The test system I was using, which i've not really got any
> access to at the moment would show these internal inconsistent
> states every few hours, however it was moving 48kHz 8ch 16bit
> TDM data.
> 
> Thanks for looking into this, I am not sure if I am going to
> get any time to look into this within the next couple of
> months.

I'll try to add some debug checks to try to catch the case where count is updated before EOC
is set. Thank you very much for the clarification of the problem. So far I haven't spotted
anything going wrong.

Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
vs EOC? Assuming the cyclic transfer mode.
Jon Hunter June 19, 2019, 10:04 a.m. UTC | #10
On 19/06/2019 00:27, Dmitry Osipenko wrote:
> 19.06.2019 1:22, Ben Dooks пишет:
>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>> of data, hence it can report transfer's residual with more fidelity which
>>> may be required in cases like audio playback. In particular this fixes
>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>> on Tegra20 and Tegra30 devices.
>>>
>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>
>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>> ---
>>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>> index 79e9593815f1..c5af8f703548 100644
>>> --- a/drivers/dma/tegra20-apb-dma.c
>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>       return 0;
>>>   }
>>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>> +                          struct tegra_dma_sg_req *sg_req,
>>> +                          struct tegra_dma_desc *dma_desc,
>>> +                          unsigned int residual)
>>> +{
>>> +    unsigned long status, wcount = 0;
>>> +
>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>> +        return residual;
>>> +
>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>> +
>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>> +
>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +        wcount = status;
>>> +
>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>> +        return residual - sg_req->req_len;
>>> +
>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>> +}
>>
>> I am unfortunately nowhere near my notes, so can't completely
>> review this. I think the complexity of my patch series is due
>> to an issue with the count being updated before the EOC IRQ
>> is actually flagged (and most definetly before it gets to the
>> CPU IRQ handler).
>>
>> The test system I was using, which i've not really got any
>> access to at the moment would show these internal inconsistent
>> states every few hours, however it was moving 48kHz 8ch 16bit
>> TDM data.
>>
>> Thanks for looking into this, I am not sure if I am going to
>> get any time to look into this within the next couple of
>> months.
> 
> I'll try to add some debug checks to try to catch the case where count is updated before EOC
> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
> anything going wrong.
> 
> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
> vs EOC? Assuming the cyclic transfer mode.

I can't say that I am. However, for the case of cyclic transfer, given
that the next transfer is always programmed into the registers before
the last one completes, I could see that by the time the interrupt is
serviced that the DMA has moved on to the next transfer (which I assume
would reset the count).

Interestingly, our downstream kernel implemented a change to avoid the
count appearing to move backwards. I am curious if this also works,
which would be a lot simpler that what Ben has implemented and may
mitigate that race condition that Ben is describing.

Cheers
Jon

[0]
https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
Ben Dooks June 19, 2019, 10:08 a.m. UTC | #11
On 19/06/2019 11:04, Jon Hunter wrote:
> 
> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>> 19.06.2019 1:22, Ben Dooks пишет:
>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>> of data, hence it can report transfer's residual with more fidelity which
>>>> may be required in cases like audio playback. In particular this fixes
>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>> on Tegra20 and Tegra30 devices.
>>>>
>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>
>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
>>>>    drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>    1 file changed, 28 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>> index 79e9593815f1..c5af8f703548 100644
>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>        return 0;
>>>>    }
>>>>    +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>> +                          struct tegra_dma_desc *dma_desc,
>>>> +                          unsigned int residual)
>>>> +{
>>>> +    unsigned long status, wcount = 0;
>>>> +
>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>> +        return residual;
>>>> +
>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>> +
>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>> +
>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +        wcount = status;
>>>> +
>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>> +        return residual - sg_req->req_len;
>>>> +
>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>> +}
>>>
>>> I am unfortunately nowhere near my notes, so can't completely
>>> review this. I think the complexity of my patch series is due
>>> to an issue with the count being updated before the EOC IRQ
>>> is actually flagged (and most definetly before it gets to the
>>> CPU IRQ handler).
>>>
>>> The test system I was using, which i've not really got any
>>> access to at the moment would show these internal inconsistent
>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>> TDM data.
>>>
>>> Thanks for looking into this, I am not sure if I am going to
>>> get any time to look into this within the next couple of
>>> months.
>>
>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>> anything going wrong.
>>
>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>> vs EOC? Assuming the cyclic transfer mode.
> 
> I can't say that I am. However, for the case of cyclic transfer, given
> that the next transfer is always programmed into the registers before
> the last one completes, I could see that by the time the interrupt is
> serviced that the DMA has moved on to the next transfer (which I assume
> would reset the count).
> 
> Interestingly, our downstream kernel implemented a change to avoid the
> count appearing to move backwards. I am curious if this also works,
> which would be a lot simpler that what Ben has implemented and may
> mitigate that race condition that Ben is describing.

That might be the same thing we saw. IIRC it looks like the DMA has
moved on, but the count gets re-set before the EOC? I can't see that
git site so can't comment.

The only way to prove this would be to spend time running this up
with tracing (which is how we found the issue) and analysing the
output (it's also why we adding the kernel tracing in earlier
patches)
Jon Hunter June 19, 2019, 10:13 a.m. UTC | #12
On 19/06/2019 11:08, Ben Dooks wrote:
> On 19/06/2019 11:04, Jon Hunter wrote:
>>
>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>> Tegra's APB DMA engine updates words counter after each transferred
>>>>> burst
>>>>> of data, hence it can report transfer's residual with more fidelity
>>>>> which
>>>>> may be required in cases like audio playback. In particular this fixes
>>>>> audio stuttering during playback in a chromiuim web browser. The
>>>>> patch is
>>>>> based on the original work that was made by Ben Dooks [1]. It was
>>>>> tested
>>>>> on Tegra20 and Tegra30 devices.
>>>>>
>>>>> [1]
>>>>> https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>
>>>>>
>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>> ---
>>>>>    drivers/dma/tegra20-apb-dma.c | 35
>>>>> ++++++++++++++++++++++++++++-------
>>>>>    1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c
>>>>> b/drivers/dma/tegra20-apb-dma.c
>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct
>>>>> dma_chan *dc)
>>>>>        return 0;
>>>>>    }
>>>>>    +static unsigned int tegra_dma_update_residual(struct
>>>>> tegra_dma_channel *tdc,
>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>> +                          unsigned int residual)
>>>>> +{
>>>>> +    unsigned long status, wcount = 0;
>>>>> +
>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>> +        return residual;
>>>>> +
>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>> +
>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>> +
>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +        wcount = status;
>>>>> +
>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>> +        return residual - sg_req->req_len;
>>>>> +
>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>> +}
>>>>
>>>> I am unfortunately nowhere near my notes, so can't completely
>>>> review this. I think the complexity of my patch series is due
>>>> to an issue with the count being updated before the EOC IRQ
>>>> is actually flagged (and most definetly before it gets to the
>>>> CPU IRQ handler).
>>>>
>>>> The test system I was using, which i've not really got any
>>>> access to at the moment would show these internal inconsistent
>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>> TDM data.
>>>>
>>>> Thanks for looking into this, I am not sure if I am going to
>>>> get any time to look into this within the next couple of
>>>> months.
>>>
>>> I'll try to add some debug checks to try to catch the case where
>>> count is updated before EOC
>>> is set. Thank you very much for the clarification of the problem. So
>>> far I haven't spotted
>>> anything going wrong.
>>>
>>> Jon / Laxman, are you aware about the possibility to get such
>>> inconsistency of words count
>>> vs EOC? Assuming the cyclic transfer mode.
>>
>> I can't say that I am. However, for the case of cyclic transfer, given
>> that the next transfer is always programmed into the registers before
>> the last one completes, I could see that by the time the interrupt is
>> serviced that the DMA has moved on to the next transfer (which I assume
>> would reset the count).
>>
>> Interestingly, our downstream kernel implemented a change to avoid the
>> count appearing to move backwards. I am curious if this also works,
>> which would be a lot simpler that what Ben has implemented and may
>> mitigate that race condition that Ben is describing.
> 
> That might be the same thing we saw. IIRC it looks like the DMA has
> moved on, but the count gets re-set before the EOC? I can't see that
> git site so can't comment.

That's odd, that should be a public site AFAIK. Does the following not
work ...

https://nv-tegra.nvidia.com/gitweb/

Jon
Ben Dooks June 19, 2019, 10:16 a.m. UTC | #13
On 19/06/2019 11:13, Jon Hunter wrote:
> 
> On 19/06/2019 11:08, Ben Dooks wrote:
>> On 19/06/2019 11:04, Jon Hunter wrote:
>>>
>>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>> Tegra's APB DMA engine updates words counter after each transferred
>>>>>> burst
>>>>>> of data, hence it can report transfer's residual with more fidelity
>>>>>> which
>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>> audio stuttering during playback in a chromiuim web browser. The
>>>>>> patch is
>>>>>> based on the original work that was made by Ben Dooks [1]. It was
>>>>>> tested
>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>
>>>>>> [1]
>>>>>> https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>
>>>>>>
>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>> ---
>>>>>>     drivers/dma/tegra20-apb-dma.c | 35
>>>>>> ++++++++++++++++++++++++++++-------
>>>>>>     1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c
>>>>>> b/drivers/dma/tegra20-apb-dma.c
>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct
>>>>>> dma_chan *dc)
>>>>>>         return 0;
>>>>>>     }
>>>>>>     +static unsigned int tegra_dma_update_residual(struct
>>>>>> tegra_dma_channel *tdc,
>>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>>> +                          unsigned int residual)
>>>>>> +{
>>>>>> +    unsigned long status, wcount = 0;
>>>>>> +
>>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>> +        return residual;
>>>>>> +
>>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>> +
>>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>> +
>>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +        wcount = status;
>>>>>> +
>>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>> +        return residual - sg_req->req_len;
>>>>>> +
>>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>> +}
>>>>>
>>>>> I am unfortunately nowhere near my notes, so can't completely
>>>>> review this. I think the complexity of my patch series is due
>>>>> to an issue with the count being updated before the EOC IRQ
>>>>> is actually flagged (and most definetly before it gets to the
>>>>> CPU IRQ handler).
>>>>>
>>>>> The test system I was using, which i've not really got any
>>>>> access to at the moment would show these internal inconsistent
>>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>>> TDM data.
>>>>>
>>>>> Thanks for looking into this, I am not sure if I am going to
>>>>> get any time to look into this within the next couple of
>>>>> months.
>>>>
>>>> I'll try to add some debug checks to try to catch the case where
>>>> count is updated before EOC
>>>> is set. Thank you very much for the clarification of the problem. So
>>>> far I haven't spotted
>>>> anything going wrong.
>>>>
>>>> Jon / Laxman, are you aware about the possibility to get such
>>>> inconsistency of words count
>>>> vs EOC? Assuming the cyclic transfer mode.
>>>
>>> I can't say that I am. However, for the case of cyclic transfer, given
>>> that the next transfer is always programmed into the registers before
>>> the last one completes, I could see that by the time the interrupt is
>>> serviced that the DMA has moved on to the next transfer (which I assume
>>> would reset the count).
>>>
>>> Interestingly, our downstream kernel implemented a change to avoid the
>>> count appearing to move backwards. I am curious if this also works,
>>> which would be a lot simpler that what Ben has implemented and may
>>> mitigate that race condition that Ben is describing.
>>
>> That might be the same thing we saw. IIRC it looks like the DMA has
>> moved on, but the count gets re-set before the EOC? I can't see that
>> git site so can't comment.
> 
> That's odd, that should be a public site AFAIK. Does the following not
> work ...
> 
> https://nv-tegra.nvidia.com/gitweb/
> 
> Jon
> 

Getting ssl errors which i think is causing a firewall issue here.

I can try again when back at the hotel instead of client site.
Dmitry Osipenko June 19, 2019, 10:27 a.m. UTC | #14
19.06.2019 13:04, Jon Hunter пишет:
> 
> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>> 19.06.2019 1:22, Ben Dooks пишет:
>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>> of data, hence it can report transfer's residual with more fidelity which
>>>> may be required in cases like audio playback. In particular this fixes
>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>> on Tegra20 and Tegra30 devices.
>>>>
>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>
>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
>>>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>> index 79e9593815f1..c5af8f703548 100644
>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>       return 0;
>>>>   }
>>>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>> +                          struct tegra_dma_desc *dma_desc,
>>>> +                          unsigned int residual)
>>>> +{
>>>> +    unsigned long status, wcount = 0;
>>>> +
>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>> +        return residual;
>>>> +
>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>> +
>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>> +
>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +        wcount = status;
>>>> +
>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>> +        return residual - sg_req->req_len;
>>>> +
>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>> +}
>>>
>>> I am unfortunately nowhere near my notes, so can't completely
>>> review this. I think the complexity of my patch series is due
>>> to an issue with the count being updated before the EOC IRQ
>>> is actually flagged (and most definetly before it gets to the
>>> CPU IRQ handler).
>>>
>>> The test system I was using, which i've not really got any
>>> access to at the moment would show these internal inconsistent
>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>> TDM data.
>>>
>>> Thanks for looking into this, I am not sure if I am going to
>>> get any time to look into this within the next couple of
>>> months.
>>
>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>> anything going wrong.
>>
>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>> vs EOC? Assuming the cyclic transfer mode.
> 
> I can't say that I am. However, for the case of cyclic transfer, given
> that the next transfer is always programmed into the registers before
> the last one completes, I could see that by the time the interrupt is
> serviced that the DMA has moved on to the next transfer (which I assume
> would reset the count).
> 
> Interestingly, our downstream kernel implemented a change to avoid the
> count appearing to move backwards. I am curious if this also works,
> which would be a lot simpler that what Ben has implemented and may
> mitigate that race condition that Ben is describing.
> 
> Cheers
> Jon
> 
> [0]
> https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
> 

The downstream patch doesn't check for EOC and has no comments about it, so it's hard to
tell if it's intentional. Secondly, looks like the downstream patch is mucked up because it
doesn't check whether the dma_desc is *the active* transfer and not a pending!

Jon, thanks for the pointer anyway! I'll try to catch the case that Ben is describing by
adding some extra debug checks, will come back with a report after thorough testing.
Jon Hunter June 19, 2019, 10:55 a.m. UTC | #15
On 19/06/2019 11:27, Dmitry Osipenko wrote:
> 19.06.2019 13:04, Jon Hunter пишет:
>>
>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>> may be required in cases like audio playback. In particular this fixes
>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>> on Tegra20 and Tegra30 devices.
>>>>>
>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>
>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>> ---
>>>>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>       return 0;
>>>>>   }
>>>>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>> +                          unsigned int residual)
>>>>> +{
>>>>> +    unsigned long status, wcount = 0;
>>>>> +
>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>> +        return residual;
>>>>> +
>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>> +
>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>> +
>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +        wcount = status;
>>>>> +
>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>> +        return residual - sg_req->req_len;
>>>>> +
>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>> +}
>>>>
>>>> I am unfortunately nowhere near my notes, so can't completely
>>>> review this. I think the complexity of my patch series is due
>>>> to an issue with the count being updated before the EOC IRQ
>>>> is actually flagged (and most definetly before it gets to the
>>>> CPU IRQ handler).
>>>>
>>>> The test system I was using, which i've not really got any
>>>> access to at the moment would show these internal inconsistent
>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>> TDM data.
>>>>
>>>> Thanks for looking into this, I am not sure if I am going to
>>>> get any time to look into this within the next couple of
>>>> months.
>>>
>>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>>> anything going wrong.
>>>
>>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>>> vs EOC? Assuming the cyclic transfer mode.
>>
>> I can't say that I am. However, for the case of cyclic transfer, given
>> that the next transfer is always programmed into the registers before
>> the last one completes, I could see that by the time the interrupt is
>> serviced that the DMA has moved on to the next transfer (which I assume
>> would reset the count).
>>
>> Interestingly, our downstream kernel implemented a change to avoid the
>> count appearing to move backwards. I am curious if this also works,
>> which would be a lot simpler that what Ben has implemented and may
>> mitigate that race condition that Ben is describing.
>>
>> Cheers
>> Jon
>>
>> [0]
>> https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>
> 
> The downstream patch doesn't check for EOC and has no comments about it, so it's hard to
> tell if it's intentional. Secondly, looks like the downstream patch is mucked up because it
> doesn't check whether the dma_desc is *the active* transfer and not a pending!

I agree that it should check to see if it is active. I assume that what
this patch is doing is not updating the dma position if it appears to
have gone backwards, implying we have moved on to the next buffer. Yes
this is still probably not as accurate as Ben's implementation because
most likely we have finished that transfer and this patch would report
that it is not quite finished.

If Ben's patch works for you then why not go with this?

Cheers
Jon
Dmitry Osipenko June 19, 2019, 11:10 a.m. UTC | #16
19.06.2019 13:55, Jon Hunter пишет:
> 
> On 19/06/2019 11:27, Dmitry Osipenko wrote:
>> 19.06.2019 13:04, Jon Hunter пишет:
>>>
>>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>
>>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>
>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>> ---
>>>>>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>       return 0;
>>>>>>   }
>>>>>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>>> +                          unsigned int residual)
>>>>>> +{
>>>>>> +    unsigned long status, wcount = 0;
>>>>>> +
>>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>> +        return residual;
>>>>>> +
>>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>> +
>>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>> +
>>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +        wcount = status;
>>>>>> +
>>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>> +        return residual - sg_req->req_len;
>>>>>> +
>>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>> +}
>>>>>
>>>>> I am unfortunately nowhere near my notes, so can't completely
>>>>> review this. I think the complexity of my patch series is due
>>>>> to an issue with the count being updated before the EOC IRQ
>>>>> is actually flagged (and most definetly before it gets to the
>>>>> CPU IRQ handler).
>>>>>
>>>>> The test system I was using, which i've not really got any
>>>>> access to at the moment would show these internal inconsistent
>>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>>> TDM data.
>>>>>
>>>>> Thanks for looking into this, I am not sure if I am going to
>>>>> get any time to look into this within the next couple of
>>>>> months.
>>>>
>>>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>>>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>>>> anything going wrong.
>>>>
>>>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>>>> vs EOC? Assuming the cyclic transfer mode.
>>>
>>> I can't say that I am. However, for the case of cyclic transfer, given
>>> that the next transfer is always programmed into the registers before
>>> the last one completes, I could see that by the time the interrupt is
>>> serviced that the DMA has moved on to the next transfer (which I assume
>>> would reset the count).
>>>
>>> Interestingly, our downstream kernel implemented a change to avoid the
>>> count appearing to move backwards. I am curious if this also works,
>>> which would be a lot simpler that what Ben has implemented and may
>>> mitigate that race condition that Ben is describing.
>>>
>>> Cheers
>>> Jon
>>>
>>> [0]
>>> https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>
>>
>> The downstream patch doesn't check for EOC and has no comments about it, so it's hard to
>> tell if it's intentional. Secondly, looks like the downstream patch is mucked up because it
>> doesn't check whether the dma_desc is *the active* transfer and not a pending!
> 
> I agree that it should check to see if it is active. I assume that what
> this patch is doing is not updating the dma position if it appears to
> have gone backwards, implying we have moved on to the next buffer. Yes
> this is still probably not as accurate as Ben's implementation because
> most likely we have finished that transfer and this patch would report
> that it is not quite finished.
> 
> If Ben's patch works for you then why not go with this?

Because I'm doubtful that it is really the case and not something else. It will be very odd
if hardware updates words count and sets EOC asynchronously, I'd call it as a faulty design
and thus a bug that need to worked around in software if that's really happening.
Jon Hunter June 19, 2019, 12:22 p.m. UTC | #17
On 19/06/2019 12:10, Dmitry Osipenko wrote:
> 19.06.2019 13:55, Jon Hunter пишет:
>>
>> On 19/06/2019 11:27, Dmitry Osipenko wrote:
>>> 19.06.2019 13:04, Jon Hunter пишет:
>>>>
>>>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>>
>>>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>>
>>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>>> ---
>>>>>>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>>       return 0;
>>>>>>>   }
>>>>>>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>>>> +                          unsigned int residual)
>>>>>>> +{
>>>>>>> +    unsigned long status, wcount = 0;
>>>>>>> +
>>>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>>> +        return residual;
>>>>>>> +
>>>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>>> +
>>>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>>> +
>>>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +        wcount = status;
>>>>>>> +
>>>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>>> +        return residual - sg_req->req_len;
>>>>>>> +
>>>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>>> +}
>>>>>>
>>>>>> I am unfortunately nowhere near my notes, so can't completely
>>>>>> review this. I think the complexity of my patch series is due
>>>>>> to an issue with the count being updated before the EOC IRQ
>>>>>> is actually flagged (and most definetly before it gets to the
>>>>>> CPU IRQ handler).
>>>>>>
>>>>>> The test system I was using, which i've not really got any
>>>>>> access to at the moment would show these internal inconsistent
>>>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>>>> TDM data.
>>>>>>
>>>>>> Thanks for looking into this, I am not sure if I am going to
>>>>>> get any time to look into this within the next couple of
>>>>>> months.
>>>>>
>>>>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>>>>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>>>>> anything going wrong.
>>>>>
>>>>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>>>>> vs EOC? Assuming the cyclic transfer mode.
>>>>
>>>> I can't say that I am. However, for the case of cyclic transfer, given
>>>> that the next transfer is always programmed into the registers before
>>>> the last one completes, I could see that by the time the interrupt is
>>>> serviced that the DMA has moved on to the next transfer (which I assume
>>>> would reset the count).
>>>>
>>>> Interestingly, our downstream kernel implemented a change to avoid the
>>>> count appearing to move backwards. I am curious if this also works,
>>>> which would be a lot simpler that what Ben has implemented and may
>>>> mitigate that race condition that Ben is describing.
>>>>
>>>> Cheers
>>>> Jon
>>>>
>>>> [0]
>>>> https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>
>>>
>>> The downstream patch doesn't check for EOC and has no comments about it, so it's hard to
>>> tell if it's intentional. Secondly, looks like the downstream patch is mucked up because it
>>> doesn't check whether the dma_desc is *the active* transfer and not a pending!
>>
>> I agree that it should check to see if it is active. I assume that what
>> this patch is doing is not updating the dma position if it appears to
>> have gone backwards, implying we have moved on to the next buffer. Yes
>> this is still probably not as accurate as Ben's implementation because
>> most likely we have finished that transfer and this patch would report
>> that it is not quite finished.
>>
>> If Ben's patch works for you then why not go with this?
> 
> Because I'm doubtful that it is really the case and not something else. It will be very odd
> if hardware updates words count and sets EOC asynchronously, I'd call it as a faulty design
> and thus a bug that need to worked around in software if that's really happening.

I don't see it that way. Probably as soon as the EOC happens, if there
is another transfer queued up, the next transfer will start and count
gets reset. So if you happen to asynchronously read the count at the
very end of the transfer, then it is possible you are doing so at the
same time that the EOC occurs but before the ISR has been triggered.

Jon
Ben Dooks June 19, 2019, 12:55 p.m. UTC | #18
On 19/06/2019 12:10, Dmitry Osipenko wrote:
> 19.06.2019 13:55, Jon Hunter пишет:
>>
>> On 19/06/2019 11:27, Dmitry Osipenko wrote:
>>> 19.06.2019 13:04, Jon Hunter пишет:
>>>>
>>>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>>
>>>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>>
>>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>>> ---
>>>>>>>    drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>>>    1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>>        return 0;
>>>>>>>    }
>>>>>>>    +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>>>> +                          unsigned int residual)
>>>>>>> +{
>>>>>>> +    unsigned long status, wcount = 0;
>>>>>>> +
>>>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>>> +        return residual;
>>>>>>> +
>>>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>>> +
>>>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>>> +
>>>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +        wcount = status;
>>>>>>> +
>>>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>>> +        return residual - sg_req->req_len;
>>>>>>> +
>>>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>>> +}
>>>>>>
>>>>>> I am unfortunately nowhere near my notes, so can't completely
>>>>>> review this. I think the complexity of my patch series is due
>>>>>> to an issue with the count being updated before the EOC IRQ
>>>>>> is actually flagged (and most definetly before it gets to the
>>>>>> CPU IRQ handler).
>>>>>>
>>>>>> The test system I was using, which i've not really got any
>>>>>> access to at the moment would show these internal inconsistent
>>>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>>>> TDM data.
>>>>>>
>>>>>> Thanks for looking into this, I am not sure if I am going to
>>>>>> get any time to look into this within the next couple of
>>>>>> months.
>>>>>
>>>>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>>>>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>>>>> anything going wrong.
>>>>>
>>>>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>>>>> vs EOC? Assuming the cyclic transfer mode.
>>>>
>>>> I can't say that I am. However, for the case of cyclic transfer, given
>>>> that the next transfer is always programmed into the registers before
>>>> the last one completes, I could see that by the time the interrupt is
>>>> serviced that the DMA has moved on to the next transfer (which I assume
>>>> would reset the count).
>>>>
>>>> Interestingly, our downstream kernel implemented a change to avoid the
>>>> count appearing to move backwards. I am curious if this also works,
>>>> which would be a lot simpler that what Ben has implemented and may
>>>> mitigate that race condition that Ben is describing.
>>>>
>>>> Cheers
>>>> Jon
>>>>
>>>> [0]
>>>> https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>
>>>
>>> The downstream patch doesn't check for EOC and has no comments about it, so it's hard to
>>> tell if it's intentional. Secondly, looks like the downstream patch is mucked up because it
>>> doesn't check whether the dma_desc is *the active* transfer and not a pending!
>>
>> I agree that it should check to see if it is active. I assume that what
>> this patch is doing is not updating the dma position if it appears to
>> have gone backwards, implying we have moved on to the next buffer. Yes
>> this is still probably not as accurate as Ben's implementation because
>> most likely we have finished that transfer and this patch would report
>> that it is not quite finished.
>>
>> If Ben's patch works for you then why not go with this?
> 
> Because I'm doubtful that it is really the case and not something else. It will be very odd
> if hardware updates words count and sets EOC asynchronously, I'd call it as a faulty design
> and thus a bug that need to worked around in software if that's really happening.

Unfortunately someone designed hardware which does not update all the
state in one go. Find the designer and make them explain why they did
this.
Dmitry Osipenko June 19, 2019, 1:52 p.m. UTC | #19
19.06.2019 15:22, Jon Hunter пишет:
> 
> On 19/06/2019 12:10, Dmitry Osipenko wrote:
>> 19.06.2019 13:55, Jon Hunter пишет:
>>>
>>> On 19/06/2019 11:27, Dmitry Osipenko wrote:
>>>> 19.06.2019 13:04, Jon Hunter пишет:
>>>>>
>>>>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>>>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>>>
>>>>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>>>
>>>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>>>> ---
>>>>>>>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>>>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>>>       return 0;
>>>>>>>>   }
>>>>>>>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>>>>> +                          unsigned int residual)
>>>>>>>> +{
>>>>>>>> +    unsigned long status, wcount = 0;
>>>>>>>> +
>>>>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>>>> +        return residual;
>>>>>>>> +
>>>>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>>>> +
>>>>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>>>> +
>>>>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>>> +        wcount = status;
>>>>>>>> +
>>>>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>>>> +        return residual - sg_req->req_len;
>>>>>>>> +
>>>>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>>>> +}
>>>>>>>
>>>>>>> I am unfortunately nowhere near my notes, so can't completely
>>>>>>> review this. I think the complexity of my patch series is due
>>>>>>> to an issue with the count being updated before the EOC IRQ
>>>>>>> is actually flagged (and most definetly before it gets to the
>>>>>>> CPU IRQ handler).
>>>>>>>
>>>>>>> The test system I was using, which i've not really got any
>>>>>>> access to at the moment would show these internal inconsistent
>>>>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>>>>> TDM data.
>>>>>>>
>>>>>>> Thanks for looking into this, I am not sure if I am going to
>>>>>>> get any time to look into this within the next couple of
>>>>>>> months.
>>>>>>
>>>>>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>>>>>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>>>>>> anything going wrong.
>>>>>>
>>>>>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>>>>>> vs EOC? Assuming the cyclic transfer mode.
>>>>>
>>>>> I can't say that I am. However, for the case of cyclic transfer, given
>>>>> that the next transfer is always programmed into the registers before
>>>>> the last one completes, I could see that by the time the interrupt is
>>>>> serviced that the DMA has moved on to the next transfer (which I assume
>>>>> would reset the count).
>>>>>
>>>>> Interestingly, our downstream kernel implemented a change to avoid the
>>>>> count appearing to move backwards. I am curious if this also works,
>>>>> which would be a lot simpler that what Ben has implemented and may
>>>>> mitigate that race condition that Ben is describing.
>>>>>
>>>>> Cheers
>>>>> Jon
>>>>>
>>>>> [0]
>>>>> https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>>
>>>>
>>>> The downstream patch doesn't check for EOC and has no comments about it, so it's hard to
>>>> tell if it's intentional. Secondly, looks like the downstream patch is mucked up because it
>>>> doesn't check whether the dma_desc is *the active* transfer and not a pending!
>>>
>>> I agree that it should check to see if it is active. I assume that what
>>> this patch is doing is not updating the dma position if it appears to
>>> have gone backwards, implying we have moved on to the next buffer. Yes
>>> this is still probably not as accurate as Ben's implementation because
>>> most likely we have finished that transfer and this patch would report
>>> that it is not quite finished.
>>>
>>> If Ben's patch works for you then why not go with this?
>>
>> Because I'm doubtful that it is really the case and not something else. It will be very odd
>> if hardware updates words count and sets EOC asynchronously, I'd call it as a faulty design
>> and thus a bug that need to worked around in software if that's really happening.
> 
> I don't see it that way. Probably as soon as the EOC happens, if there
> is another transfer queued up, the next transfer will start and count
> gets reset. So if you happen to asynchronously read the count at the
> very end of the transfer, then it is possible you are doing so at the
> same time that the EOC occurs but before the ISR has been triggered.

In our case we can't read out EOC status and words count asynchronously because the
interrupt status and words count are within the same hardware register on pre-T114 and on
T114+ we're reading words count register and only then the status register. ISR has nothing
to do with it, if you're taking about tegra_dma_isr.

Ben claims that the words count may get updated in hardware before EOC bit is set in the
status register.
Dmitry Osipenko June 19, 2019, 9:56 p.m. UTC | #20
19.06.2019 16:52, Dmitry Osipenko пишет:
> 19.06.2019 15:22, Jon Hunter пишет:
>>
>> On 19/06/2019 12:10, Dmitry Osipenko wrote:
>>> 19.06.2019 13:55, Jon Hunter пишет:
>>>>
>>>> On 19/06/2019 11:27, Dmitry Osipenko wrote:
>>>>> 19.06.2019 13:04, Jon Hunter пишет:
>>>>>>
>>>>>> On 19/06/2019 00:27, Dmitry Osipenko wrote:
>>>>>>> 19.06.2019 1:22, Ben Dooks пишет:
>>>>>>>> On 13/06/2019 22:08, Dmitry Osipenko wrote:
>>>>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>>>>> audio stuttering during playback in a chromiuim web browser. The patch is
>>>>>>>>> based on the original work that was made by Ben Dooks [1]. It was tested
>>>>>>>>> on Tegra20 and Tegra30 devices.
>>>>>>>>>
>>>>>>>>> [1] https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>>>>
>>>>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>>>>> ---
>>>>>>>>>   drivers/dma/tegra20-apb-dma.c | 35 ++++++++++++++++++++++++++++-------
>>>>>>>>>   1 file changed, 28 insertions(+), 7 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>>>>> index 79e9593815f1..c5af8f703548 100644
>>>>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>>>>> @@ -797,12 +797,36 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>>>>       return 0;
>>>>>>>>>   }
>>>>>>>>>   +static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
>>>>>>>>> +                          struct tegra_dma_sg_req *sg_req,
>>>>>>>>> +                          struct tegra_dma_desc *dma_desc,
>>>>>>>>> +                          unsigned int residual)
>>>>>>>>> +{
>>>>>>>>> +    unsigned long status, wcount = 0;
>>>>>>>>> +
>>>>>>>>> +    if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>>>>> +        return residual;
>>>>>>>>> +
>>>>>>>>> +    if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>>>> +        wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>>>>> +
>>>>>>>>> +    status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>>>>> +
>>>>>>>>> +    if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>>>> +        wcount = status;
>>>>>>>>> +
>>>>>>>>> +    if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>>>>> +        return residual - sg_req->req_len;
>>>>>>>>> +
>>>>>>>>> +    return residual - get_current_xferred_count(tdc, sg_req, wcount);
>>>>>>>>> +}
>>>>>>>>
>>>>>>>> I am unfortunately nowhere near my notes, so can't completely
>>>>>>>> review this. I think the complexity of my patch series is due
>>>>>>>> to an issue with the count being updated before the EOC IRQ
>>>>>>>> is actually flagged (and most definetly before it gets to the
>>>>>>>> CPU IRQ handler).
>>>>>>>>
>>>>>>>> The test system I was using, which i've not really got any
>>>>>>>> access to at the moment would show these internal inconsistent
>>>>>>>> states every few hours, however it was moving 48kHz 8ch 16bit
>>>>>>>> TDM data.
>>>>>>>>
>>>>>>>> Thanks for looking into this, I am not sure if I am going to
>>>>>>>> get any time to look into this within the next couple of
>>>>>>>> months.
>>>>>>>
>>>>>>> I'll try to add some debug checks to try to catch the case where count is updated before EOC
>>>>>>> is set. Thank you very much for the clarification of the problem. So far I haven't spotted
>>>>>>> anything going wrong.
>>>>>>>
>>>>>>> Jon / Laxman, are you aware about the possibility to get such inconsistency of words count
>>>>>>> vs EOC? Assuming the cyclic transfer mode.
>>>>>>
>>>>>> I can't say that I am. However, for the case of cyclic transfer, given
>>>>>> that the next transfer is always programmed into the registers before
>>>>>> the last one completes, I could see that by the time the interrupt is
>>>>>> serviced that the DMA has moved on to the next transfer (which I assume
>>>>>> would reset the count).
>>>>>>
>>>>>> Interestingly, our downstream kernel implemented a change to avoid the
>>>>>> count appearing to move backwards. I am curious if this also works,
>>>>>> which would be a lot simpler that what Ben has implemented and may
>>>>>> mitigate that race condition that Ben is describing.
>>>>>>
>>>>>> Cheers
>>>>>> Jon
>>>>>>
>>>>>> [0]
>>>>>> https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>>>
>>>>>
>>>>> The downstream patch doesn't check for EOC and has no comments about it, so it's hard to
>>>>> tell if it's intentional. Secondly, looks like the downstream patch is mucked up because it
>>>>> doesn't check whether the dma_desc is *the active* transfer and not a pending!
>>>>
>>>> I agree that it should check to see if it is active. I assume that what
>>>> this patch is doing is not updating the dma position if it appears to
>>>> have gone backwards, implying we have moved on to the next buffer. Yes
>>>> this is still probably not as accurate as Ben's implementation because
>>>> most likely we have finished that transfer and this patch would report
>>>> that it is not quite finished.
>>>>
>>>> If Ben's patch works for you then why not go with this?
>>>
>>> Because I'm doubtful that it is really the case and not something else. It will be very odd
>>> if hardware updates words count and sets EOC asynchronously, I'd call it as a faulty design
>>> and thus a bug that need to worked around in software if that's really happening.
>>
>> I don't see it that way. Probably as soon as the EOC happens, if there
>> is another transfer queued up, the next transfer will start and count
>> gets reset. So if you happen to asynchronously read the count at the
>> very end of the transfer, then it is possible you are doing so at the
>> same time that the EOC occurs but before the ISR has been triggered.
> 
> In our case we can't read out EOC status and words count asynchronously because the
> interrupt status and words count are within the same hardware register on pre-T114 and on
> T114+ we're reading words count register and only then the status register. ISR has nothing
> to do with it, if you're taking about tegra_dma_isr.
> 
> Ben claims that the words count may get updated in hardware before EOC bit is set in the
> status register.
> 

I made a test setup where several millions test-runs were made, checking
the EOC vs words count wraparound in a cyclic transfer mode (polling
words count until EOC is set) with different buffer sizes (buffer size =
period, single SG nent) and different burst configurations. Result is
good, the case Ben describes never happens on T30 nor on T20. So I'm
confident now that hardware is fine and likely that Ben was experiencing
some other problem.

Jon, please take a look at the v2. It should be good to go!
diff mbox series

Patch

diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
index 79e9593815f1..c5af8f703548 100644
--- a/drivers/dma/tegra20-apb-dma.c
+++ b/drivers/dma/tegra20-apb-dma.c
@@ -797,12 +797,36 @@  static int tegra_dma_terminate_all(struct dma_chan *dc)
 	return 0;
 }
 
+static unsigned int tegra_dma_update_residual(struct tegra_dma_channel *tdc,
+					      struct tegra_dma_sg_req *sg_req,
+					      struct tegra_dma_desc *dma_desc,
+					      unsigned int residual)
+{
+	unsigned long status, wcount = 0;
+
+	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
+		return residual;
+
+	if (tdc->tdma->chip_data->support_separate_wcount_reg)
+		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
+
+	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
+
+	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
+		wcount = status;
+
+	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
+		return residual - sg_req->req_len;
+
+	return residual - get_current_xferred_count(tdc, sg_req, wcount);
+}
+
 static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 	dma_cookie_t cookie, struct dma_tx_state *txstate)
 {
 	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
+	struct tegra_dma_sg_req *sg_req = NULL;
 	struct tegra_dma_desc *dma_desc;
-	struct tegra_dma_sg_req *sg_req;
 	enum dma_status ret;
 	unsigned long flags;
 	unsigned int residual;
@@ -838,6 +862,8 @@  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 		residual = dma_desc->bytes_requested -
 			   (dma_desc->bytes_transferred %
 			    dma_desc->bytes_requested);
+		residual = tegra_dma_update_residual(tdc, sg_req, dma_desc,
+						     residual);
 		dma_set_residue(txstate, residual);
 	}
 
@@ -1441,12 +1467,7 @@  static int tegra_dma_probe(struct platform_device *pdev)
 		BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) |
 		BIT(DMA_SLAVE_BUSWIDTH_8_BYTES);
 	tdma->dma_dev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
-	/*
-	 * XXX The hardware appears to support
-	 * DMA_RESIDUE_GRANULARITY_BURST-level reporting, but it's
-	 * only used by this driver during tegra_dma_terminate_all()
-	 */
-	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT;
+	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
 	tdma->dma_dev.device_config = tegra_dma_slave_config;
 	tdma->dma_dev.device_terminate_all = tegra_dma_terminate_all;
 	tdma->dma_dev.device_tx_status = tegra_dma_tx_status;