diff mbox series

Revert "iommu/io-pgtable-arm: Optimise non-coherent unmap"

Message ID 20240905124956.84932-1-robdclark@gmail.com (mailing list archive)
State New, archived
Headers show
Series Revert "iommu/io-pgtable-arm: Optimise non-coherent unmap" | expand

Commit Message

Rob Clark Sept. 5, 2024, 12:49 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.

It was causing gpu smmu faults on x1e80100.

I _think_ what is causing this is the change in ordering of
__arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
this patch is supposed to work correctly in the face of other
concurrent translations (to buffers unrelated to the one being
unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
stale data read back into the tlb.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

Comments

Robin Murphy Sept. 5, 2024, 1:24 p.m. UTC | #1
On 05/09/2024 1:49 pm, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> 
> It was causing gpu smmu faults on x1e80100.
> 
> I _think_ what is causing this is the change in ordering of
> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> memory) and io_pgtable_tlb_flush_walk().

As I just commented, how do you believe the order of operations between:

	__arm_lpae_clear_pte();
  	if (!iopte_leaf()) {
  		io_pgtable_tlb_flush_walk();

and:
  	
  	if (!iopte_leaf()) {
		__arm_lpae_clear_pte();
  		io_pgtable_tlb_flush_walk();

fundamentally differs?

I'm not saying there couldn't be some subtle bug in the implementation 
which we've all missed, but I still can't see an issue with the intended 
logic.

>  I'm not entirely sure how
> this patch is supposed to work correctly in the face of other
> concurrent translations (to buffers unrelated to the one being
> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> stale data read back into the tlb.

Read back from where? The ex-table PTE which was already set to zero 
before tlb_flush_walk was called?

And isn't the hilariously overcomplicated TBU driver supposed to be 
telling you exactly what happened here? Otherwise I'm going to continue 
to seriously question the purpose of shoehorning that upstream at all...

Thanks,
Robin.

> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
>   drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
>   1 file changed, 14 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index 16e51528772d..85261baa3a04 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
>   				   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
>   }
>   
> -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
> +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
>   {
> -	for (int i = 0; i < num_entries; i++)
> -		ptep[i] = 0;
>   
> -	if (!cfg->coherent_walk && num_entries)
> -		__arm_lpae_sync_pte(ptep, num_entries, cfg);
> +	*ptep = 0;
> +
> +	if (!cfg->coherent_walk)
> +		__arm_lpae_sync_pte(ptep, 1, cfg);
>   }
>   
>   static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> @@ -653,28 +653,25 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>   		max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
>   		num_entries = min_t(int, pgcount, max_entries);
>   
> -		/* Find and handle non-leaf entries */
> -		for (i = 0; i < num_entries; i++) {
> -			pte = READ_ONCE(ptep[i]);
> +		while (i < num_entries) {
> +			pte = READ_ONCE(*ptep);
>   			if (WARN_ON(!pte))
>   				break;
>   
> -			if (!iopte_leaf(pte, lvl, iop->fmt)) {
> -				__arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
> +			__arm_lpae_clear_pte(ptep, &iop->cfg);
>   
> +			if (!iopte_leaf(pte, lvl, iop->fmt)) {
>   				/* Also flush any partial walks */
>   				io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
>   							  ARM_LPAE_GRANULE(data));
>   				__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
> +			} else if (!iommu_iotlb_gather_queued(gather)) {
> +				io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
>   			}
> -		}
>   
> -		/* Clear the remaining entries */
> -		__arm_lpae_clear_pte(ptep, &iop->cfg, i);
> -
> -		if (gather && !iommu_iotlb_gather_queued(gather))
> -			for (int j = 0; j < i; j++)
> -				io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
> +			ptep++;
> +			i++;
> +		}
>   
>   		return i * size;
>   	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
Robin Murphy Sept. 5, 2024, 1:32 p.m. UTC | #2
On 05/09/2024 2:24 pm, Robin Murphy wrote:
> On 05/09/2024 1:49 pm, Rob Clark wrote:
>> From: Rob Clark <robdclark@chromium.org>
>>
>> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
>>
>> It was causing gpu smmu faults on x1e80100.
>>
>> I _think_ what is causing this is the change in ordering of
>> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
>> memory) and io_pgtable_tlb_flush_walk().
> 
> As I just commented, how do you believe the order of operations between:

...or at least I *thought* I'd just commented, but Thunderbird is now 
denying any knowledge of the reply to your other mail I swore I sent a 
couple of hours ago :/

Apologies for any confusion,
Robin.

> 
>      __arm_lpae_clear_pte();
>       if (!iopte_leaf()) {
>           io_pgtable_tlb_flush_walk();
> 
> and:
> 
>       if (!iopte_leaf()) {
>          __arm_lpae_clear_pte();
>           io_pgtable_tlb_flush_walk();
> 
> fundamentally differs?
> 
> I'm not saying there couldn't be some subtle bug in the implementation 
> which we've all missed, but I still can't see an issue with the intended 
> logic.
> 
>>  I'm not entirely sure how
>> this patch is supposed to work correctly in the face of other
>> concurrent translations (to buffers unrelated to the one being
>> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
>> stale data read back into the tlb.
> 
> Read back from where? The ex-table PTE which was already set to zero 
> before tlb_flush_walk was called?
> 
> And isn't the hilariously overcomplicated TBU driver supposed to be 
> telling you exactly what happened here? Otherwise I'm going to continue 
> to seriously question the purpose of shoehorning that upstream at all...
> 
> Thanks,
> Robin.
> 
>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>> ---
>>   drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
>>   1 file changed, 14 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/iommu/io-pgtable-arm.c 
>> b/drivers/iommu/io-pgtable-arm.c
>> index 16e51528772d..85261baa3a04 100644
>> --- a/drivers/iommu/io-pgtable-arm.c
>> +++ b/drivers/iommu/io-pgtable-arm.c
>> @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte 
>> *ptep, int num_entries,
>>                      sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
>>   }
>> -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct 
>> io_pgtable_cfg *cfg, int num_entries)
>> +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct 
>> io_pgtable_cfg *cfg)
>>   {
>> -    for (int i = 0; i < num_entries; i++)
>> -        ptep[i] = 0;
>> -    if (!cfg->coherent_walk && num_entries)
>> -        __arm_lpae_sync_pte(ptep, num_entries, cfg);
>> +    *ptep = 0;
>> +
>> +    if (!cfg->coherent_walk)
>> +        __arm_lpae_sync_pte(ptep, 1, cfg);
>>   }
>>   static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>> @@ -653,28 +653,25 @@ static size_t __arm_lpae_unmap(struct 
>> arm_lpae_io_pgtable *data,
>>           max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
>>           num_entries = min_t(int, pgcount, max_entries);
>> -        /* Find and handle non-leaf entries */
>> -        for (i = 0; i < num_entries; i++) {
>> -            pte = READ_ONCE(ptep[i]);
>> +        while (i < num_entries) {
>> +            pte = READ_ONCE(*ptep);
>>               if (WARN_ON(!pte))
>>                   break;
>> -            if (!iopte_leaf(pte, lvl, iop->fmt)) {
>> -                __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
>> +            __arm_lpae_clear_pte(ptep, &iop->cfg);
>> +            if (!iopte_leaf(pte, lvl, iop->fmt)) {
>>                   /* Also flush any partial walks */
>>                   io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
>>                                 ARM_LPAE_GRANULE(data));
>>                   __arm_lpae_free_pgtable(data, lvl + 1, 
>> iopte_deref(pte, data));
>> +            } else if (!iommu_iotlb_gather_queued(gather)) {
>> +                io_pgtable_tlb_add_page(iop, gather, iova + i * size, 
>> size);
>>               }
>> -        }
>> -        /* Clear the remaining entries */
>> -        __arm_lpae_clear_pte(ptep, &iop->cfg, i);
>> -
>> -        if (gather && !iommu_iotlb_gather_queued(gather))
>> -            for (int j = 0; j < i; j++)
>> -                io_pgtable_tlb_add_page(iop, gather, iova + j * size, 
>> size);
>> +            ptep++;
>> +            i++;
>> +        }
>>           return i * size;
>>       } else if (iopte_leaf(pte, lvl, iop->fmt)) {
>
Rob Clark Sept. 5, 2024, 1:57 p.m. UTC | #3
On Thu, Sep 5, 2024 at 6:24 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 05/09/2024 1:49 pm, Rob Clark wrote:
> > From: Rob Clark <robdclark@chromium.org>
> >
> > This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> >
> > It was causing gpu smmu faults on x1e80100.
> >
> > I _think_ what is causing this is the change in ordering of
> > __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> > memory) and io_pgtable_tlb_flush_walk().
>
> As I just commented, how do you believe the order of operations between:
>
>         __arm_lpae_clear_pte();
>         if (!iopte_leaf()) {
>                 io_pgtable_tlb_flush_walk();
>
> and:
>
>         if (!iopte_leaf()) {
>                 __arm_lpae_clear_pte();
>                 io_pgtable_tlb_flush_walk();
>
> fundamentally differs?

from my reading of the original patch, the ordering is the same for
non-leaf nodes, but not for leaf nodes

> I'm not saying there couldn't be some subtle bug in the implementation
> which we've all missed, but I still can't see an issue with the intended
> logic.
>
> >  I'm not entirely sure how
> > this patch is supposed to work correctly in the face of other
> > concurrent translations (to buffers unrelated to the one being
> > unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> > stale data read back into the tlb.
>
> Read back from where? The ex-table PTE which was already set to zero
> before tlb_flush_walk was called?
>
> And isn't the hilariously overcomplicated TBU driver supposed to be
> telling you exactly what happened here? Otherwise I'm going to continue
> to seriously question the purpose of shoehorning that upstream at all...

I guess I could try the TBU driver.  But I already had my patchset to
extract the pgtable walk for gpu devcore dump, and that is telling me
that the CPU view of the pgtable is fine.  Which I think just leaves a
tlbinv problem.  If that is the case, swapping the order of leaf node
cpu cache ops and tlbinv ops seems like the cause.  But maybe I'm
missing something.

BR,
-R

> Thanks,
> Robin.
>
> > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > ---
> >   drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
> >   1 file changed, 14 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > index 16e51528772d..85261baa3a04 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
> >                                  sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
> >   }
> >
> > -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
> > +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
> >   {
> > -     for (int i = 0; i < num_entries; i++)
> > -             ptep[i] = 0;
> >
> > -     if (!cfg->coherent_walk && num_entries)
> > -             __arm_lpae_sync_pte(ptep, num_entries, cfg);
> > +     *ptep = 0;
> > +
> > +     if (!cfg->coherent_walk)
> > +             __arm_lpae_sync_pte(ptep, 1, cfg);
> >   }
> >
> >   static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> > @@ -653,28 +653,25 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> >               max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
> >               num_entries = min_t(int, pgcount, max_entries);
> >
> > -             /* Find and handle non-leaf entries */
> > -             for (i = 0; i < num_entries; i++) {
> > -                     pte = READ_ONCE(ptep[i]);
> > +             while (i < num_entries) {
> > +                     pte = READ_ONCE(*ptep);
> >                       if (WARN_ON(!pte))
> >                               break;
> >
> > -                     if (!iopte_leaf(pte, lvl, iop->fmt)) {
> > -                             __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
> > +                     __arm_lpae_clear_pte(ptep, &iop->cfg);
> >
> > +                     if (!iopte_leaf(pte, lvl, iop->fmt)) {
> >                               /* Also flush any partial walks */
> >                               io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
> >                                                         ARM_LPAE_GRANULE(data));
> >                               __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
> > +                     } else if (!iommu_iotlb_gather_queued(gather)) {
> > +                             io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
> >                       }
> > -             }
> >
> > -             /* Clear the remaining entries */
> > -             __arm_lpae_clear_pte(ptep, &iop->cfg, i);
> > -
> > -             if (gather && !iommu_iotlb_gather_queued(gather))
> > -                     for (int j = 0; j < i; j++)
> > -                             io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
> > +                     ptep++;
> > +                     i++;
> > +             }
> >
> >               return i * size;
> >       } else if (iopte_leaf(pte, lvl, iop->fmt)) {
Will Deacon Sept. 5, 2024, 3:53 p.m. UTC | #4
Hi Rob,

On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> 
> It was causing gpu smmu faults on x1e80100.
> 
> I _think_ what is causing this is the change in ordering of
> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
> this patch is supposed to work correctly in the face of other
> concurrent translations (to buffers unrelated to the one being
> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> stale data read back into the tlb.
> 
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
>  drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
>  1 file changed, 14 insertions(+), 17 deletions(-)

Please can you try the diff below, instead?

Will

--->8

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0e67f1721a3d..0a32e9499e2c 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -672,7 +672,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
                /* Clear the remaining entries */
                __arm_lpae_clear_pte(ptep, &iop->cfg, i);
 
-               if (gather && !iommu_iotlb_gather_queued(gather))
+               if (!iommu_iotlb_gather_queued(gather))
                        for (int j = 0; j < i; j++)
                                io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
Robin Murphy Sept. 5, 2024, 4:22 p.m. UTC | #5
On 05/09/2024 2:57 pm, Rob Clark wrote:
> On Thu, Sep 5, 2024 at 6:24 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> On 05/09/2024 1:49 pm, Rob Clark wrote:
>>> From: Rob Clark <robdclark@chromium.org>
>>>
>>> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
>>>
>>> It was causing gpu smmu faults on x1e80100.
>>>
>>> I _think_ what is causing this is the change in ordering of
>>> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
>>> memory) and io_pgtable_tlb_flush_walk().
>>
>> As I just commented, how do you believe the order of operations between:
>>
>>          __arm_lpae_clear_pte();
>>          if (!iopte_leaf()) {
>>                  io_pgtable_tlb_flush_walk();
>>
>> and:
>>
>>          if (!iopte_leaf()) {
>>                  __arm_lpae_clear_pte();
>>                  io_pgtable_tlb_flush_walk();
>>
>> fundamentally differs?
> 
> from my reading of the original patch, the ordering is the same for
> non-leaf nodes, but not for leaf nodes

But tlb_flush_walk is never called for leaf entries either way, so no 
such ordering exists... And the non-leaf path still calls 
__arm_lpae_clear_pte() and io_pgtable_tlb_add_page() in the same order 
relative to each other too, it's just now done for the whole range in 
one go, after any non-leaf entries have already been dealt with.

Thanks,
Robin.

> 
>> I'm not saying there couldn't be some subtle bug in the implementation
>> which we've all missed, but I still can't see an issue with the intended
>> logic.
>>
>>>   I'm not entirely sure how
>>> this patch is supposed to work correctly in the face of other
>>> concurrent translations (to buffers unrelated to the one being
>>> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
>>> stale data read back into the tlb.
>>
>> Read back from where? The ex-table PTE which was already set to zero
>> before tlb_flush_walk was called?
>>
>> And isn't the hilariously overcomplicated TBU driver supposed to be
>> telling you exactly what happened here? Otherwise I'm going to continue
>> to seriously question the purpose of shoehorning that upstream at all...
> 
> I guess I could try the TBU driver.  But I already had my patchset to
> extract the pgtable walk for gpu devcore dump, and that is telling me
> that the CPU view of the pgtable is fine.  Which I think just leaves a
> tlbinv problem.  If that is the case, swapping the order of leaf node
> cpu cache ops and tlbinv ops seems like the cause.  But maybe I'm
> missing something.
> 
> BR,
> -R
> 
>> Thanks,
>> Robin.
>>
>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>> ---
>>>    drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
>>>    1 file changed, 14 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
>>> index 16e51528772d..85261baa3a04 100644
>>> --- a/drivers/iommu/io-pgtable-arm.c
>>> +++ b/drivers/iommu/io-pgtable-arm.c
>>> @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
>>>                                   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
>>>    }
>>>
>>> -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
>>> +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
>>>    {
>>> -     for (int i = 0; i < num_entries; i++)
>>> -             ptep[i] = 0;
>>>
>>> -     if (!cfg->coherent_walk && num_entries)
>>> -             __arm_lpae_sync_pte(ptep, num_entries, cfg);
>>> +     *ptep = 0;
>>> +
>>> +     if (!cfg->coherent_walk)
>>> +             __arm_lpae_sync_pte(ptep, 1, cfg);
>>>    }
>>>
>>>    static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>> @@ -653,28 +653,25 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>>                max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
>>>                num_entries = min_t(int, pgcount, max_entries);
>>>
>>> -             /* Find and handle non-leaf entries */
>>> -             for (i = 0; i < num_entries; i++) {
>>> -                     pte = READ_ONCE(ptep[i]);
>>> +             while (i < num_entries) {
>>> +                     pte = READ_ONCE(*ptep);
>>>                        if (WARN_ON(!pte))
>>>                                break;
>>>
>>> -                     if (!iopte_leaf(pte, lvl, iop->fmt)) {
>>> -                             __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
>>> +                     __arm_lpae_clear_pte(ptep, &iop->cfg);
>>>
>>> +                     if (!iopte_leaf(pte, lvl, iop->fmt)) {
>>>                                /* Also flush any partial walks */
>>>                                io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
>>>                                                          ARM_LPAE_GRANULE(data));
>>>                                __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
>>> +                     } else if (!iommu_iotlb_gather_queued(gather)) {
>>> +                             io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
>>>                        }
>>> -             }
>>>
>>> -             /* Clear the remaining entries */
>>> -             __arm_lpae_clear_pte(ptep, &iop->cfg, i);
>>> -
>>> -             if (gather && !iommu_iotlb_gather_queued(gather))
>>> -                     for (int j = 0; j < i; j++)
>>> -                             io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
>>> +                     ptep++;
>>> +                     i++;
>>> +             }
>>>
>>>                return i * size;
>>>        } else if (iopte_leaf(pte, lvl, iop->fmt)) {
Robin Murphy Sept. 5, 2024, 4:27 p.m. UTC | #6
On 05/09/2024 4:53 pm, Will Deacon wrote:
> Hi Rob,
> 
> On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
>> From: Rob Clark <robdclark@chromium.org>
>>
>> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
>>
>> It was causing gpu smmu faults on x1e80100.
>>
>> I _think_ what is causing this is the change in ordering of
>> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
>> memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
>> this patch is supposed to work correctly in the face of other
>> concurrent translations (to buffers unrelated to the one being
>> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
>> stale data read back into the tlb.
>>
>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>> ---
>>   drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
>>   1 file changed, 14 insertions(+), 17 deletions(-)
> 
> Please can you try the diff below, instead?

Given that the GPU driver's .tlb_add_page is a no-op, I can't see this 
making a difference. In fact, given that msm_iommu_pagetable_unmap() 
still does a brute-force iommu_flush_iotlb_all() after io-pgtable 
returns, and in fact only recently made .tlb_flush_walk start doing 
anything either for the sake of the map path, I'm now really wondering 
how this patch has had any effect at all... :/

> 
> Will
> 
> --->8
> 
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index 0e67f1721a3d..0a32e9499e2c 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -672,7 +672,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>                  /* Clear the remaining entries */
>                  __arm_lpae_clear_pte(ptep, &iop->cfg, i);
>   
> -               if (gather && !iommu_iotlb_gather_queued(gather))
> +               if (!iommu_iotlb_gather_queued(gather))

Note that this would reintroduce the latent issue which was present 
originally, wherein iommu_iotlb_gather_queued(NULL) is false, but if we 
actually allow a NULL gather to be passed to io_pgtable_tlb_add_page() 
it may end up being dereferenced (e.g. in arm-smmu-v3).

Thanks,
Robin.

>                          for (int j = 0; j < i; j++)
>                                  io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
>
Rob Clark Sept. 5, 2024, 5 p.m. UTC | #7
On Thu, Sep 5, 2024 at 9:27 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 05/09/2024 4:53 pm, Will Deacon wrote:
> > Hi Rob,
> >
> > On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
> >> From: Rob Clark <robdclark@chromium.org>
> >>
> >> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> >>
> >> It was causing gpu smmu faults on x1e80100.
> >>
> >> I _think_ what is causing this is the change in ordering of
> >> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> >> memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
> >> this patch is supposed to work correctly in the face of other
> >> concurrent translations (to buffers unrelated to the one being
> >> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> >> stale data read back into the tlb.
> >>
> >> Signed-off-by: Rob Clark <robdclark@chromium.org>
> >> ---
> >>   drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
> >>   1 file changed, 14 insertions(+), 17 deletions(-)
> >
> > Please can you try the diff below, instead?
>
> Given that the GPU driver's .tlb_add_page is a no-op, I can't see this
> making a difference. In fact, given that msm_iommu_pagetable_unmap()
> still does a brute-force iommu_flush_iotlb_all() after io-pgtable
> returns, and in fact only recently made .tlb_flush_walk start doing
> anything either for the sake of the map path, I'm now really wondering
> how this patch has had any effect at all... :/

Yeah..  and unfortunately the TBU code only supports two devices so
far, so I can't easily repro with TBU enabled atm.  Hmm..
__arm_lpae_unmap() is also called in the ->map() path, although not
sure how that changes things.

BR,
-R

> >
> > Will
> >
> > --->8
> >
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > index 0e67f1721a3d..0a32e9499e2c 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -672,7 +672,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> >                  /* Clear the remaining entries */
> >                  __arm_lpae_clear_pte(ptep, &iop->cfg, i);
> >
> > -               if (gather && !iommu_iotlb_gather_queued(gather))
> > +               if (!iommu_iotlb_gather_queued(gather))
>
> Note that this would reintroduce the latent issue which was present
> originally, wherein iommu_iotlb_gather_queued(NULL) is false, but if we
> actually allow a NULL gather to be passed to io_pgtable_tlb_add_page()
> it may end up being dereferenced (e.g. in arm-smmu-v3).
>
> Thanks,
> Robin.
>
> >                          for (int j = 0; j < i; j++)
> >                                  io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
> >
Rob Clark Sept. 5, 2024, 5:10 p.m. UTC | #8
On Thu, Sep 5, 2024 at 10:00 AM Rob Clark <robdclark@gmail.com> wrote:
>
> On Thu, Sep 5, 2024 at 9:27 AM Robin Murphy <robin.murphy@arm.com> wrote:
> >
> > On 05/09/2024 4:53 pm, Will Deacon wrote:
> > > Hi Rob,
> > >
> > > On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
> > >> From: Rob Clark <robdclark@chromium.org>
> > >>
> > >> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> > >>
> > >> It was causing gpu smmu faults on x1e80100.
> > >>
> > >> I _think_ what is causing this is the change in ordering of
> > >> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> > >> memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
> > >> this patch is supposed to work correctly in the face of other
> > >> concurrent translations (to buffers unrelated to the one being
> > >> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> > >> stale data read back into the tlb.
> > >>
> > >> Signed-off-by: Rob Clark <robdclark@chromium.org>
> > >> ---
> > >>   drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
> > >>   1 file changed, 14 insertions(+), 17 deletions(-)
> > >
> > > Please can you try the diff below, instead?
> >
> > Given that the GPU driver's .tlb_add_page is a no-op, I can't see this
> > making a difference. In fact, given that msm_iommu_pagetable_unmap()
> > still does a brute-force iommu_flush_iotlb_all() after io-pgtable
> > returns, and in fact only recently made .tlb_flush_walk start doing
> > anything either for the sake of the map path, I'm now really wondering
> > how this patch has had any effect at all... :/
>
> Yeah..  and unfortunately the TBU code only supports two devices so
> far, so I can't easily repro with TBU enabled atm.  Hmm..
> __arm_lpae_unmap() is also called in the ->map() path, although not
> sure how that changes things.

Ok, an update.. after a reboot, still with this patch reverted, I once
again see faults.  So I guess that vindicates the original patch, and
leaves me still searching..

fwiw, fault info from the gpu devcore:

-------------
fault-info:
  - ttbr0=0000000919306000
  - iova=0000000100c17000
  - dir=WRITE
  - type=UNKNOWN
  - source=CP
pgtable-fault-info:
  - ttbr0: 000000090ca40000
  - asid: 0
  - ptes: 000000095db47003 000000095db48003 0000000914c8f003 00000008fd7f0f47
-------------

the 'ptes' part shows the table walk, which looks ok to me..

BR,
-R

> BR,
> -R
>
> > >
> > > Will
> > >
> > > --->8
> > >
> > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > > index 0e67f1721a3d..0a32e9499e2c 100644
> > > --- a/drivers/iommu/io-pgtable-arm.c
> > > +++ b/drivers/iommu/io-pgtable-arm.c
> > > @@ -672,7 +672,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> > >                  /* Clear the remaining entries */
> > >                  __arm_lpae_clear_pte(ptep, &iop->cfg, i);
> > >
> > > -               if (gather && !iommu_iotlb_gather_queued(gather))
> > > +               if (!iommu_iotlb_gather_queued(gather))
> >
> > Note that this would reintroduce the latent issue which was present
> > originally, wherein iommu_iotlb_gather_queued(NULL) is false, but if we
> > actually allow a NULL gather to be passed to io_pgtable_tlb_add_page()
> > it may end up being dereferenced (e.g. in arm-smmu-v3).
> >
> > Thanks,
> > Robin.
> >
> > >                          for (int j = 0; j < i; j++)
> > >                                  io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
> > >
Will Deacon Sept. 6, 2024, 10:56 a.m. UTC | #9
On Thu, Sep 05, 2024 at 05:27:28PM +0100, Robin Murphy wrote:
> On 05/09/2024 4:53 pm, Will Deacon wrote:
> > On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
> > > From: Rob Clark <robdclark@chromium.org>
> > > 
> > > This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> > > 
> > > It was causing gpu smmu faults on x1e80100.
> > > 
> > > I _think_ what is causing this is the change in ordering of
> > > __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> > > memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
> > > this patch is supposed to work correctly in the face of other
> > > concurrent translations (to buffers unrelated to the one being
> > > unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> > > stale data read back into the tlb.
> > > 
> > > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > > ---
> > >   drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
> > >   1 file changed, 14 insertions(+), 17 deletions(-)
> > 
> > Please can you try the diff below, instead?
> 
> Given that the GPU driver's .tlb_add_page is a no-op, I can't see this
> making a difference. In fact, given that msm_iommu_pagetable_unmap() still
> does a brute-force iommu_flush_iotlb_all() after io-pgtable returns, and in
> fact only recently made .tlb_flush_walk start doing anything either for the
> sake of the map path, I'm now really wondering how this patch has had any
> effect at all... :/

Hmm, yup. Looks like Rob has come back to say the problem lies elsewhere
anyway.

One thing below though...

> > 
> > Will
> > 
> > --->8
> > 
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > index 0e67f1721a3d..0a32e9499e2c 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -672,7 +672,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> >                  /* Clear the remaining entries */
> >                  __arm_lpae_clear_pte(ptep, &iop->cfg, i);
> > -               if (gather && !iommu_iotlb_gather_queued(gather))
> > +               if (!iommu_iotlb_gather_queued(gather))
> 
> Note that this would reintroduce the latent issue which was present
> originally, wherein iommu_iotlb_gather_queued(NULL) is false, but if we
> actually allow a NULL gather to be passed to io_pgtable_tlb_add_page() it
> may end up being dereferenced (e.g. in arm-smmu-v3).

I think there is still something to fix here. arm_lpae_init_pte() can
pass a NULL gather to __arm_lpae_unmap() and I don't think skipping the
invalidation is correct in that case. Either the drivers need to handle
that or we shouldn't be passing NULL.

What do you think?

Will
Robin Murphy Sept. 6, 2024, 12:24 p.m. UTC | #10
On 2024-09-05 6:10 pm, Rob Clark wrote:
> On Thu, Sep 5, 2024 at 10:00 AM Rob Clark <robdclark@gmail.com> wrote:
>>
>> On Thu, Sep 5, 2024 at 9:27 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>>
>>> On 05/09/2024 4:53 pm, Will Deacon wrote:
>>>> Hi Rob,
>>>>
>>>> On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
>>>>> From: Rob Clark <robdclark@chromium.org>
>>>>>
>>>>> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
>>>>>
>>>>> It was causing gpu smmu faults on x1e80100.
>>>>>
>>>>> I _think_ what is causing this is the change in ordering of
>>>>> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
>>>>> memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
>>>>> this patch is supposed to work correctly in the face of other
>>>>> concurrent translations (to buffers unrelated to the one being
>>>>> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
>>>>> stale data read back into the tlb.
>>>>>
>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>>> ---
>>>>>    drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
>>>>>    1 file changed, 14 insertions(+), 17 deletions(-)
>>>>
>>>> Please can you try the diff below, instead?
>>>
>>> Given that the GPU driver's .tlb_add_page is a no-op, I can't see this
>>> making a difference. In fact, given that msm_iommu_pagetable_unmap()
>>> still does a brute-force iommu_flush_iotlb_all() after io-pgtable
>>> returns, and in fact only recently made .tlb_flush_walk start doing
>>> anything either for the sake of the map path, I'm now really wondering
>>> how this patch has had any effect at all... :/
>>
>> Yeah..  and unfortunately the TBU code only supports two devices so
>> far, so I can't easily repro with TBU enabled atm.  Hmm..
>> __arm_lpae_unmap() is also called in the ->map() path, although not
>> sure how that changes things.
> 
> Ok, an update.. after a reboot, still with this patch reverted, I once
> again see faults.  So I guess that vindicates the original patch, and
> leaves me still searching..
> 
> fwiw, fault info from the gpu devcore:
> 
> -------------
> fault-info:
>    - ttbr0=0000000919306000
>    - iova=0000000100c17000
>    - dir=WRITE
>    - type=UNKNOWN
>    - source=CP
> pgtable-fault-info:
>    - ttbr0: 000000090ca40000
>    - asid: 0
>    - ptes: 000000095db47003 000000095db48003 0000000914c8f003 00000008fd7f0f47
> -------------
> 
> the 'ptes' part shows the table walk, which looks ok to me..

But is it the right pagetable at all, given that the "ttbr0" values 
appear to be indicating different places?

Robin.
Rob Clark Sept. 6, 2024, 2:49 p.m. UTC | #11
On Fri, Sep 6, 2024 at 5:24 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 2024-09-05 6:10 pm, Rob Clark wrote:
> > On Thu, Sep 5, 2024 at 10:00 AM Rob Clark <robdclark@gmail.com> wrote:
> >>
> >> On Thu, Sep 5, 2024 at 9:27 AM Robin Murphy <robin.murphy@arm.com> wrote:
> >>>
> >>> On 05/09/2024 4:53 pm, Will Deacon wrote:
> >>>> Hi Rob,
> >>>>
> >>>> On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
> >>>>> From: Rob Clark <robdclark@chromium.org>
> >>>>>
> >>>>> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> >>>>>
> >>>>> It was causing gpu smmu faults on x1e80100.
> >>>>>
> >>>>> I _think_ what is causing this is the change in ordering of
> >>>>> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> >>>>> memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
> >>>>> this patch is supposed to work correctly in the face of other
> >>>>> concurrent translations (to buffers unrelated to the one being
> >>>>> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> >>>>> stale data read back into the tlb.
> >>>>>
> >>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> >>>>> ---
> >>>>>    drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
> >>>>>    1 file changed, 14 insertions(+), 17 deletions(-)
> >>>>
> >>>> Please can you try the diff below, instead?
> >>>
> >>> Given that the GPU driver's .tlb_add_page is a no-op, I can't see this
> >>> making a difference. In fact, given that msm_iommu_pagetable_unmap()
> >>> still does a brute-force iommu_flush_iotlb_all() after io-pgtable
> >>> returns, and in fact only recently made .tlb_flush_walk start doing
> >>> anything either for the sake of the map path, I'm now really wondering
> >>> how this patch has had any effect at all... :/
> >>
> >> Yeah..  and unfortunately the TBU code only supports two devices so
> >> far, so I can't easily repro with TBU enabled atm.  Hmm..
> >> __arm_lpae_unmap() is also called in the ->map() path, although not
> >> sure how that changes things.
> >
> > Ok, an update.. after a reboot, still with this patch reverted, I once
> > again see faults.  So I guess that vindicates the original patch, and
> > leaves me still searching..
> >
> > fwiw, fault info from the gpu devcore:
> >
> > -------------
> > fault-info:
> >    - ttbr0=0000000919306000
> >    - iova=0000000100c17000
> >    - dir=WRITE
> >    - type=UNKNOWN
> >    - source=CP
> > pgtable-fault-info:
> >    - ttbr0: 000000090ca40000
> >    - asid: 0
> >    - ptes: 000000095db47003 000000095db48003 0000000914c8f003 00000008fd7f0f47
> > -------------
> >
> > the 'ptes' part shows the table walk, which looks ok to me..
>
> But is it the right pagetable at all, given that the "ttbr0" values
> appear to be indicating different places?

hmm, the gpu does seem to be switching to the new table before it is
done with the old one..

BR,
-R
Robin Murphy Sept. 6, 2024, 3:25 p.m. UTC | #12
On 06/09/2024 11:56 am, Will Deacon wrote:
> On Thu, Sep 05, 2024 at 05:27:28PM +0100, Robin Murphy wrote:
>> On 05/09/2024 4:53 pm, Will Deacon wrote:
>>> On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
>>>> From: Rob Clark <robdclark@chromium.org>
>>>>
>>>> This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
>>>>
>>>> It was causing gpu smmu faults on x1e80100.
>>>>
>>>> I _think_ what is causing this is the change in ordering of
>>>> __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
>>>> memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
>>>> this patch is supposed to work correctly in the face of other
>>>> concurrent translations (to buffers unrelated to the one being
>>>> unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
>>>> stale data read back into the tlb.
>>>>
>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>> ---
>>>>    drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
>>>>    1 file changed, 14 insertions(+), 17 deletions(-)
>>>
>>> Please can you try the diff below, instead?
>>
>> Given that the GPU driver's .tlb_add_page is a no-op, I can't see this
>> making a difference. In fact, given that msm_iommu_pagetable_unmap() still
>> does a brute-force iommu_flush_iotlb_all() after io-pgtable returns, and in
>> fact only recently made .tlb_flush_walk start doing anything either for the
>> sake of the map path, I'm now really wondering how this patch has had any
>> effect at all... :/
> 
> Hmm, yup. Looks like Rob has come back to say the problem lies elsewhere
> anyway.
> 
> One thing below though...
> 
>>>
>>> Will
>>>
>>> --->8
>>>
>>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
>>> index 0e67f1721a3d..0a32e9499e2c 100644
>>> --- a/drivers/iommu/io-pgtable-arm.c
>>> +++ b/drivers/iommu/io-pgtable-arm.c
>>> @@ -672,7 +672,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>>                   /* Clear the remaining entries */
>>>                   __arm_lpae_clear_pte(ptep, &iop->cfg, i);
>>> -               if (gather && !iommu_iotlb_gather_queued(gather))
>>> +               if (!iommu_iotlb_gather_queued(gather))
>>
>> Note that this would reintroduce the latent issue which was present
>> originally, wherein iommu_iotlb_gather_queued(NULL) is false, but if we
>> actually allow a NULL gather to be passed to io_pgtable_tlb_add_page() it
>> may end up being dereferenced (e.g. in arm-smmu-v3).
> 
> I think there is still something to fix here. arm_lpae_init_pte() can
> pass a NULL gather to __arm_lpae_unmap() and I don't think skipping the
> invalidation is correct in that case. Either the drivers need to handle
> that or we shouldn't be passing NULL.
> 
> What do you think?

The subtlety there is that in that case it's always a non-leaf PTE, so 
all that goes back to the driver is io_pgtable_tlb_flush_walk() and the 
gather is never used.

Thanks,
Robin.
Will Deacon Sept. 9, 2024, 2:50 p.m. UTC | #13
On Fri, Sep 06, 2024 at 04:25:19PM +0100, Robin Murphy wrote:
> On 06/09/2024 11:56 am, Will Deacon wrote:
> > On Thu, Sep 05, 2024 at 05:27:28PM +0100, Robin Murphy wrote:
> > > On 05/09/2024 4:53 pm, Will Deacon wrote:
> > > > On Thu, Sep 05, 2024 at 05:49:56AM -0700, Rob Clark wrote:
> > > > > From: Rob Clark <robdclark@chromium.org>
> > > > > 
> > > > > This reverts commit 85b715a334583488ad7fbd3001fe6fd617b7d4c0.
> > > > > 
> > > > > It was causing gpu smmu faults on x1e80100.
> > > > > 
> > > > > I _think_ what is causing this is the change in ordering of
> > > > > __arm_lpae_clear_pte() (dma_sync_single_for_device() on the pgtable
> > > > > memory) and io_pgtable_tlb_flush_walk().  I'm not entirely sure how
> > > > > this patch is supposed to work correctly in the face of other
> > > > > concurrent translations (to buffers unrelated to the one being
> > > > > unmapped(), because after the io_pgtable_tlb_flush_walk() we can have
> > > > > stale data read back into the tlb.
> > > > > 
> > > > > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > > > > ---
> > > > >    drivers/iommu/io-pgtable-arm.c | 31 ++++++++++++++-----------------
> > > > >    1 file changed, 14 insertions(+), 17 deletions(-)
> > > > 
> > > > Please can you try the diff below, instead?
> > > 
> > > Given that the GPU driver's .tlb_add_page is a no-op, I can't see this
> > > making a difference. In fact, given that msm_iommu_pagetable_unmap() still
> > > does a brute-force iommu_flush_iotlb_all() after io-pgtable returns, and in
> > > fact only recently made .tlb_flush_walk start doing anything either for the
> > > sake of the map path, I'm now really wondering how this patch has had any
> > > effect at all... :/
> > 
> > Hmm, yup. Looks like Rob has come back to say the problem lies elsewhere
> > anyway.
> > 
> > One thing below though...
> > 
> > > > 
> > > > Will
> > > > 
> > > > --->8
> > > > 
> > > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > > > index 0e67f1721a3d..0a32e9499e2c 100644
> > > > --- a/drivers/iommu/io-pgtable-arm.c
> > > > +++ b/drivers/iommu/io-pgtable-arm.c
> > > > @@ -672,7 +672,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> > > >                   /* Clear the remaining entries */
> > > >                   __arm_lpae_clear_pte(ptep, &iop->cfg, i);
> > > > -               if (gather && !iommu_iotlb_gather_queued(gather))
> > > > +               if (!iommu_iotlb_gather_queued(gather))
> > > 
> > > Note that this would reintroduce the latent issue which was present
> > > originally, wherein iommu_iotlb_gather_queued(NULL) is false, but if we
> > > actually allow a NULL gather to be passed to io_pgtable_tlb_add_page() it
> > > may end up being dereferenced (e.g. in arm-smmu-v3).
> > 
> > I think there is still something to fix here. arm_lpae_init_pte() can
> > pass a NULL gather to __arm_lpae_unmap() and I don't think skipping the
> > invalidation is correct in that case. Either the drivers need to handle
> > that or we shouldn't be passing NULL.
> > 
> > What do you think?
> 
> The subtlety there is that in that case it's always a non-leaf PTE, so all
> that goes back to the driver is io_pgtable_tlb_flush_walk() and the gather
> is never used.

Beautiful...

Will
diff mbox series

Patch

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 16e51528772d..85261baa3a04 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -274,13 +274,13 @@  static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
 				   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
 }
 
-static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
 {
-	for (int i = 0; i < num_entries; i++)
-		ptep[i] = 0;
 
-	if (!cfg->coherent_walk && num_entries)
-		__arm_lpae_sync_pte(ptep, num_entries, cfg);
+	*ptep = 0;
+
+	if (!cfg->coherent_walk)
+		__arm_lpae_sync_pte(ptep, 1, cfg);
 }
 
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -653,28 +653,25 @@  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 		max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
 		num_entries = min_t(int, pgcount, max_entries);
 
-		/* Find and handle non-leaf entries */
-		for (i = 0; i < num_entries; i++) {
-			pte = READ_ONCE(ptep[i]);
+		while (i < num_entries) {
+			pte = READ_ONCE(*ptep);
 			if (WARN_ON(!pte))
 				break;
 
-			if (!iopte_leaf(pte, lvl, iop->fmt)) {
-				__arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
+			__arm_lpae_clear_pte(ptep, &iop->cfg);
 
+			if (!iopte_leaf(pte, lvl, iop->fmt)) {
 				/* Also flush any partial walks */
 				io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
 							  ARM_LPAE_GRANULE(data));
 				__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
+			} else if (!iommu_iotlb_gather_queued(gather)) {
+				io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
 			}
-		}
 
-		/* Clear the remaining entries */
-		__arm_lpae_clear_pte(ptep, &iop->cfg, i);
-
-		if (gather && !iommu_iotlb_gather_queued(gather))
-			for (int j = 0; j < i; j++)
-				io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
+			ptep++;
+			i++;
+		}
 
 		return i * size;
 	} else if (iopte_leaf(pte, lvl, iop->fmt)) {