
[3/4] drm: add ARM flush implementations

Message ID 20180117003559.67837-3-gurchetansingh@chromium.org (mailing list archive)
State New, archived

Commit Message

Gurchetan Singh Jan. 17, 2018, 12:35 a.m. UTC
The DMA API can be used to flush scatter gather tables and physical
pages on ARM devices.

Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
---
 drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
 drivers/gpu/drm/tegra/gem.c                 |  6 +-----
 3 files changed, 20 insertions(+), 10 deletions(-)

Comments

Daniel Vetter Jan. 17, 2018, 8:31 a.m. UTC | #1
On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> The DMA API can be used to flush scatter gather tables and physical
> pages on ARM devices.
> 
> Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> ---
>  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
>  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
>  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
>  3 files changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 3d2bb9d71a60..98d6ebb40e96 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page *pages[],
>  				   (unsigned long)page_virtual + PAGE_SIZE);
>  		kunmap_atomic(page_virtual);
>  	}
> +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> +	unsigned long i;
> +	dma_addr_t dma_handle;
> +
> +	if (!dev)
> +		return;
> +
> +	for (i = 0; i < num_pages; i++) {
> +		dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
> +		dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> +					   DMA_TO_DEVICE);

Erm no. These functions here are super-low-level functions used by drivers
which know exactly what they're doing. Which is reimplementing half of the
dma api behind the dma api's back because the dma api just isn't quite
sufficient for implementing fast gpu drivers.

If all you end up doing is calling the dma api again, then pls just call
it directly.

And just to make it clear: I'd be perfectly fine with adding arm support
here, but then it'd need to directly flush cpu caches and bypass the dma
api. Otherwise this is pointless.
-Daniel

Sean Paul Jan. 17, 2018, 6:53 p.m. UTC | #2
On Wed, Jan 17, 2018 at 09:31:05AM +0100, Daniel Vetter wrote:
> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> > The DMA API can be used to flush scatter gather tables and physical
> > pages on ARM devices.
> > 
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > ---
> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
> >  3 files changed, 20 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> > index 3d2bb9d71a60..98d6ebb40e96 100644
> > --- a/drivers/gpu/drm/drm_cache.c
> > +++ b/drivers/gpu/drm/drm_cache.c
> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page *pages[],
> >  				   (unsigned long)page_virtual + PAGE_SIZE);
> >  		kunmap_atomic(page_virtual);
> >  	}
> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> > +	unsigned long i;
> > +	dma_addr_t dma_handle;
> > +
> > +	if (!dev)
> > +		return;
> > +
> > +	for (i = 0; i < num_pages; i++) {
> > +		dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
> > +		dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> > +					   DMA_TO_DEVICE);
> 
> Erm no. These functions here are super-low-level functions used by drivers
> which know exactly what they're doing. Which is reimplementing half of the
> dma api behind the dma api's back because the dma api just isn't quite
> sufficient for implementing fast gpu drivers.

This isn't obvious from reading the docs, and doubly confusing when reading the
TODOs in the individual drivers. I don't think it's reasonable to assume
everyone (especially first time contributors!) will know this.

> 
> If all you end up doing is calling the dma api again, then pls just call
> it directly.

Given that some drivers already midlayer the flush, I don't think it's wrong to
provide "dumb" default behavior so drivers don't need to worry about knowing
exactly what they're doing.

> 
> And just to make it clear: I'd be perfectly fine with adding arm support
> here, but then it'd need to directly flush cpu caches and bypass the dma
> api. Otherwise this is pointless.

I don't really have skin in this game, but I think providing one call that works
across all platforms is pretty nice.

Sean


Daniel Vetter Jan. 17, 2018, 9:22 p.m. UTC | #3
On Wed, Jan 17, 2018 at 7:53 PM, Sean Paul <seanpaul@chromium.org> wrote:
> On Wed, Jan 17, 2018 at 09:31:05AM +0100, Daniel Vetter wrote:
>> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
>> > The DMA API can be used to flush scatter gather tables and physical
>> > pages on ARM devices.
>> >
>> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
>> > ---
>> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
>> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
>> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
>> >  3 files changed, 20 insertions(+), 10 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
>> > index 3d2bb9d71a60..98d6ebb40e96 100644
>> > --- a/drivers/gpu/drm/drm_cache.c
>> > +++ b/drivers/gpu/drm/drm_cache.c
>> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page *pages[],
>> >                                (unsigned long)page_virtual + PAGE_SIZE);
>> >             kunmap_atomic(page_virtual);
>> >     }
>> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
>> > +   unsigned long i;
>> > +   dma_addr_t dma_handle;
>> > +
>> > +   if (!dev)
>> > +           return;
>> > +
>> > +   for (i = 0; i < num_pages; i++) {
>> > +           dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
>> > +           dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
>> > +                                      DMA_TO_DEVICE);
>>
>> Erm no. These functions here are super-low-level functions used by drivers
>> which know exactly what they're doing. Which is reimplementing half of the
>> dma api behind the dma api's back because the dma api just isn't quite
>> sufficient for implementing fast gpu drivers.
>
> This isn't obvious from reading the docs, and doubly confusing when reading the
> TODOs in the individual drivers. I don't think it's reasonable to assume
> everyone (especially first time contributors!) will know this.

There aren't any docs, because all this stuff is really badly littered
with ill-defined behaviour (managed dma-buf and dma api coherency for
gpu drivers). I've discussed it with lots of people, but thus far no
insight into how we should do this.

>> If all you end up doing is calling the dma api again, then pls just call
>> it directly.
>
> Given that some drivers already midlayer the flush, I don't think it's wrong to
> provide "dumb" default behavior so drivers don't need to worry about knowing
> exactly what they're doing.
>
>>
>> And just to make it clear: I'd be perfectly fine with adding arm support
>> here, but then it'd need to directly flush cpu caches and bypass the dma
>> api. Otherwise this is pointless.
>
> I don't really have skin in this game, but I think providing one call that works
> across all platforms is pretty nice.

The problem I have is not with calling back down into the dma api.
It's with adding a device parameter that isn't needed, and is only there
because we call down into the wrong layer (i.e. the previous patch).
If you can make this happen without the dummy device then I'd be happy
to ack the patches. But I thought ARM has a cacheline flush
instruction too?
-Daniel

Gurchetan Singh Jan. 17, 2018, 10:46 p.m. UTC | #4
>
> dma api just isn't quite sufficient for implementing fast gpu drivers.


Can you elaborate?  IIRC the DMA API has strong synchronization guarantees
and that can be problematic for GPU drivers.  However, using the DMA API
for flushing doesn't necessarily mean the driver has to use the rest of the
DMA API.

> but then it'd need to directly flush cpu caches and bypass the dma api.


On ARM, cache flushing seems to vary from chipset to chipset.  For example,
on ARM32 a typical call-stack for dma_sync_single_for_device looks like:

arm_dma_sync_single_for_device
__dma_page_cpu_to_dev
outer_clean_range
outer_cache.clean_range
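
That last step goes through the outer_cache function-pointer table; in
simplified form the dispatch looks like this (paraphrased from
arch/arm/include/asm/outercache.h, trimmed to the relevant members):

	struct outer_cache_fns {
		void (*inv_range)(unsigned long start, unsigned long end);
		void (*clean_range)(unsigned long start, unsigned long end);
		void (*flush_range)(unsigned long start, unsigned long end);
		/* ... flush_all, disable, etc. ... */
	};
	extern struct outer_cache_fns outer_cache;

	static inline void outer_clean_range(phys_addr_t start, phys_addr_t end)
	{
		/* Each L2 cache controller plugs its implementation in here. */
		if (outer_cache.clean_range)
			outer_cache.clean_range(start, end);
	}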

There are multiple clean_range implementations out there
(e.g., aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so
that's why the DMA API was used in this case.  On ARM64, things are a
little simpler, but the DMA API seems to go directly to assembly
(__dma_map_area) after a little indirection.  Why do you think that's
inefficient?

Daniel Vetter Jan. 18, 2018, 7:42 a.m. UTC | #5
On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
<gurchetansingh@chromium.org> wrote:
>> dma api just isn't quite sufficient for implementing fast gpu drivers.
>
>
> Can you elaborate?  IIRC the DMA API has strong synchronization guarantees
> and that can be problematic for GPU drivers.  However, using the DMA API for
> flushing doesn't necessarily mean the driver has to use the rest of the DMA
> API.
>
>> but then it'd need to directly flush cpu caches and bypass the dma api.
>
>
> On ARM, cache flushing seems to vary from chipset to chipset.  For example,
> on ARM32 a typical call-stack for dma_sync_single_for_device looks like:
>
> arm_dma_sync_single_for_device
> __dma_page_cpu_to_dev
> outer_clean_range
> outer_cache.clean_range
>
> There are multiple clean_range implementations out there (e.g.,
> aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so that's
> why the DMA API was used in this case.  On ARM64, things are a little
> simpler, but the DMA API seems to go directly to assembly (__dma_map_area)
> after a little indirection.  Why do you think that's inefficient?

I never said it's inefficient. My only gripe is with adding the
pointless struct device * argument to flushing functions which really
don't care (nor should care) about the device. Because what we really
want to call is outer_clean_range here, and that doesn't need a struct
device *. Imo that function (or something similar) needs to be
exported and then used by drm_flush_* functions.
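
Purely as a sketch (none of these helpers are exported today, and this
ignores highmem), the ARM32 branch I have in mind would look roughly
like:

	/* Hypothetical ARM32 branch for drm_flush_pages(); assumes
	 * dmac_map_area() and outer_clean_range() were exported and
	 * that the pages are lowmem, so page_address() is valid. */
	unsigned long i;

	for (i = 0; i < num_pages; i++) {
		void *virt = page_address(pages[i]);
		phys_addr_t phys = page_to_phys(pages[i]);

		/* Write back the inner (L1) cache by virtual address... */
		dmac_map_area(virt, PAGE_SIZE, DMA_TO_DEVICE);
		/* ...then clean the outer (L2) cache by physical range. */
		outer_clean_range(phys, phys + PAGE_SIZE);
	}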

Also note that arm has both flush and invalidate functions, but on x86
those are the same implementation (and we don't have a separate
drm_invalidate_* set of functions). That doesn't look like too good an
idea.

Of course that doesn't solve the problem of who's supposed to call it
and when in the dma-buf sharing situation.
-Daniel

Gurchetan Singh Jan. 18, 2018, 5:20 p.m. UTC | #6
On Wed, Jan 17, 2018 at 11:42 PM, Daniel Vetter <daniel@ffwll.ch> wrote:

> On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
> <gurchetansingh@chromium.org> wrote:
> >> dma api just isn't quite sufficient for implementing fast gpu drivers.
> >
> >
> > Can you elaborate?  IIRC the DMA API has strong synchronization
> > guarantees and that can be problematic for GPU drivers.  However,
> > using the DMA API for flushing doesn't necessarily mean the driver
> > has to use the rest of the DMA API.
> >
> >> but then it'd need to directly flush cpu caches and bypass the dma api.
> >
> >
> > On ARM, cache flushing seems to vary from chipset to chipset.  For
> > example, on ARM32 a typical call-stack for dma_sync_single_for_device
> > looks like:
> >
> > arm_dma_sync_single_for_device
> > __dma_page_cpu_to_dev
> > outer_clean_range
> > outer_cache.clean_range
> >
> > There are multiple clean_range implementations out there (e.g.,
> > aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so
> > that's why the DMA API was used in this case.  On ARM64, things are
> > a little simpler, but the DMA API seems to go directly to assembly
> > (__dma_map_area) after a little indirection.  Why do you think
> > that's inefficient?
>
> I never said it's inefficient. My only gripe is with adding the
> pointless struct device * argument to flushing functions which really
> don't care (nor should care) about the device. Because what we really
> want to call is outer_clean_range here, and that doesn't need a struct
> device *. Imo that function (or something similar) needs to be
> exported and then used by drm_flush_* functions.
>

Okay, we can go with outer_clean_range on ARM and
__dma_map_area/__dma_flush_area
on ARM64.  One concern: on ARM64 at least, the header
(arm64/include/asm/cacheflush.h) indicates we shouldn't call these
functions directly for whatever reason:

"Cache maintenance functions used by the DMA API. No to be used directly."

However, if no one objects, that's the route we'll go.
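
Concretely, the ARM64 branch would be something along these lines
(sketch only: __dma_map_area() is not exported, and this assumes
lowmem pages):

	unsigned long i;

	for (i = 0; i < num_pages; i++)
		/* Clean to the point of coherency (DC CVAC) for a
		 * cpu-to-device transfer; page_address() assumes lowmem. */
		__dma_map_area(page_address(pages[i]), PAGE_SIZE,
			       DMA_TO_DEVICE);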

Daniel Vetter Jan. 30, 2018, 9:09 a.m. UTC | #7
On Thu, Jan 18, 2018 at 09:20:11AM -0800, Gurchetan Singh wrote:
> On Wed, Jan 17, 2018 at 11:42 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
> > <gurchetansingh@chromium.org> wrote:
> > >> dma api just isn't quite sufficient for implementing fast gpu drivers.
> > >
> > >
> > > Can you elaborate?  IIRC the DMA API has strong synchronization
> > > guarantees and that can be problematic for GPU drivers.  However,
> > > using the DMA API for flushing doesn't necessarily mean the driver
> > > has to use the rest of the DMA API.
> > >
> > >> but then it'd need to directly flush cpu caches and bypass the dma api.
> > >
> > >
> > > On ARM, cache flushing seems to vary from chipset to chipset.  For
> > > example, on ARM32 a typical call-stack for dma_sync_single_for_device
> > > looks like:
> > >
> > > arm_dma_sync_single_for_device
> > > __dma_page_cpu_to_dev
> > > outer_clean_range
> > > outer_cache.clean_range
> > >
> > > There are multiple clean_range implementations out there (e.g.,
> > > aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so
> > > that's why the DMA API was used in this case.  On ARM64, things are
> > > a little simpler, but the DMA API seems to go directly to assembly
> > > (__dma_map_area) after a little indirection.  Why do you think
> > > that's inefficient?
> >
> > I never said it's inefficient. My only gripe is with adding the
> > pointless struct device * argument to flushing functions which really
> > don't care (nor should care) about the device. Because what we really
> > want to call is outer_clean_range here, and that doesn't need a struct
> > device *. Imo that function (or something similar) needs to be
> > exported and then used by drm_flush_* functions.
> >
> 
> Okay, we can go with outer_clean_range on ARM and
> __dma_map_area/__dma_flush_area
> on ARM64.  One concern: on ARM64 at least, the header
> (arm64/include/asm/cacheflush.h) indicates we shouldn't call these
> functions directly for whatever reason:
> 
> "Cache maintenance functions used by the DMA API. No to be used directly."
> 
> However, if no one objects, that's the route we'll go.

Yeah, drm_cache.c is all about not following the dma api and digging a
direct path to the underlying concepts. So totally fine.

Maybe we should improve the kerneldoc a bit as part of your patch series,
to explain this better?
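Something like this, say (rough wording, just a sketch):

	/**
	 * DOC: drm cache handling
	 *
	 * The drm_flush_* helpers deliberately bypass the DMA API and
	 * issue CPU cache maintenance directly. They exist for drivers
	 * that manage coherency behind the DMA API's back and know
	 * exactly what they are doing; if dma_sync_* is sufficient for
	 * your use case, call the DMA API directly instead.
	 */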
-Daniel

Ville Syrjälä Jan. 30, 2018, 1:34 p.m. UTC | #8
On Thu, Jan 18, 2018 at 08:42:55AM +0100, Daniel Vetter wrote:
> On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
> <gurchetansingh@chromium.org> wrote:
> >> dma api just isn't quite sufficient for implementing fast gpu drivers.
> >
> >
> > Can you elaborate?  IIRC the DMA API has strong synchronization guarantees
> > and that can be problematic for GPU drivers.  However, using the DMA API for
> > flushing doesn't necessarily mean the driver has to use the rest of the DMA
> > API.
> >
> >> but then it'd need to directly flush cpu caches and bypass the dma api.
> >
> >
> > On ARM, cache flushing seems to vary from chipset to chipset.  For example,
> > on ARM32 a typical call-stack for dma_sync_single_for_device looks like:
> >
> > arm_dma_sync_single_for_device
> > __dma_page_cpu_to_dev
> > outer_clean_range
> > outer_cache.clean_range
> >
> > There are multiple clean_range implementations out there (e.g.,
> > aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so that's
> > why the DMA API was used in this case.  On ARM64, things are a little
> > simpler, but the DMA API seems to go directly to assembly (__dma_map_area)
> > after a little indirection.  Why do you think that's inefficient?
> 
> I never said it's inefficient. My only gripe is with adding the
> pointless struct device * argument to flushing functions which really
> don't care (nor should care) about the device. Because what we really
> want to call is outer_clean_range here, and that doesn't need a struct
> device *. Imo that function (or something similar) needs to be
> exported and then used by drm_flush_* functions.
> 
> Also note that arm has both flush and invalidate functions, but on x86
> those are the same implementation (and we don't have a separate
> drm_invalidate_* set of functions). That doesn't look like too good an
> idea.

IMO if someone adds some new functions they should talk about
"writeback" and "invalidate". I think "flush" is a very vague
term that could mean different things to different people.

x86 has clflush which is writeback+invalidate, and more recently
x86 gained the clwb instruction for doing just writeback without
the invalidate. On ARM IIRC you can choose whether to do
writeback or invalidate or both.
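
To illustrate the naming, hypothetical signatures (not an existing
API) might look like:

	/* writeback only: clwb on x86, DC CVAC on arm64 */
	void drm_writeback_sg(struct sg_table *st);
	/* invalidate only: DC IVAC on arm64; x86 has no safe
	 * invalidate-only primitive, so it'd fall back to clflush */
	void drm_invalidate_sg(struct sg_table *st);
	/* writeback + invalidate: clflush/clflushopt on x86,
	 * DC CIVAC on arm64 */
	void drm_writeback_invalidate_sg(struct sg_table *st);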


Patch

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 3d2bb9d71a60..98d6ebb40e96 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -105,6 +105,18 @@  drm_flush_pages(struct device *dev, struct page *pages[],
 				   (unsigned long)page_virtual + PAGE_SIZE);
 		kunmap_atomic(page_virtual);
 	}
+#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	unsigned long i;
+	dma_addr_t dma_handle;
+
+	if (!dev)
+		return;
+
+	for (i = 0; i < num_pages; i++) {
+		dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
+		dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
+					   DMA_TO_DEVICE);
+	}
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
@@ -136,6 +148,11 @@  drm_flush_sg(struct device *dev, struct sg_table *st)
 
 	if (wbinvd_on_all_cpus())
 		pr_err("Timed out waiting for cache flush\n");
+#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	if (!dev)
+		return;
+
+	dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
index 8ac7eb25e46d..0157f90b5d10 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
@@ -14,6 +14,7 @@ 
 
 #include <drm/drm.h>
 #include <drm/drmP.h>
+#include <drm/drm_cache.h>
 #include <drm/drm_gem.h>
 #include <drm/drm_vma_manager.h>
 #include <linux/iommu.h>
@@ -99,15 +100,11 @@  static int rockchip_gem_get_pages(struct rockchip_gem_object *rk_obj)
 	/*
 	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
 	 * to flush the pages associated with it.
-	 *
-	 * TODO: Replace this by drm_flush_sg() once it can be implemented
-	 * without relying on symbols that are not exported.
 	 */
 	for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
 		sg_dma_address(s) = sg_phys(s);
 
-	dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl, rk_obj->sgt->nents,
-			       DMA_TO_DEVICE);
+	drm_flush_sg(drm->dev, rk_obj->sgt);
 
 	return 0;
 
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index ab1e53d434e8..9945fd2f6bd6 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -230,15 +230,11 @@  static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
 	/*
 	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
 	 * to flush the pages associated with it.
-	 *
-	 * TODO: Replace this by drm_clflash_sg() once it can be implemented
-	 * without relying on symbols that are not exported.
 	 */
 	for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
 		sg_dma_address(s) = sg_phys(s);
 
-	dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
-			       DMA_TO_DEVICE);
+	drm_flush_sg(drm->dev, bo->sgt);
 
 	return 0;