diff mbox series

[PATCHv2,8/8] videobuf2: handle non-contiguous DMA allocations

Message ID 20210427131344.139443-9-senozhatsky@chromium.org (mailing list archive)
State New, archived
Headers show
Series videobuf2: support new noncontiguous DMA API | expand

Commit Message

Sergey Senozhatsky April 27, 2021, 1:13 p.m. UTC
This adds support for new noncontiguous DMA API, which
requires allocators to have two execution branches: one
for the current API, and one for the new one.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
[hch: untested conversion to the ne API]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 .../common/videobuf2/videobuf2-dma-contig.c   | 140 +++++++++++++++---
 1 file changed, 116 insertions(+), 24 deletions(-)

Comments

Christoph Hellwig April 28, 2021, 6:14 a.m. UTC | #1
On Tue, Apr 27, 2021 at 10:13:43PM +0900, Sergey Senozhatsky wrote:
> This adds support for new noncontiguous DMA API, which
> requires allocators to have two execution branches: one
> for the current API, and one for the new one.
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> [hch: untested conversion to the ne API]
> Signed-off-by: Christoph Hellwig <hch@lst.de>

I think you can skip my signoff and the note here for the trivial
conversion.

The code looks fine to me from the DMA API perspective:

Acked-by: Christoph Hellwig <hch@lst.de>
Hans Verkuil June 3, 2021, 11:59 a.m. UTC | #2
On 27/04/2021 15:13, Sergey Senozhatsky wrote:
> This adds support for new noncontiguous DMA API, which
> requires allocators to have two execution branches: one
> for the current API, and one for the new one.
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> [hch: untested conversion to the ne API]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  .../common/videobuf2/videobuf2-dma-contig.c   | 140 +++++++++++++++---
>  1 file changed, 116 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
> index 1e218bc440c6..40eaaef1565b 100644
> --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c
> +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
> @@ -17,6 +17,7 @@
>  #include <linux/sched.h>
>  #include <linux/slab.h>
>  #include <linux/dma-mapping.h>
> +#include <linux/highmem.h>
>  
>  #include <media/videobuf2-v4l2.h>
>  #include <media/videobuf2-dma-contig.h>
> @@ -42,6 +43,7 @@ struct vb2_dc_buf {
>  	struct dma_buf_attachment	*db_attach;
>  
>  	struct vb2_buffer		*vb;
> +	bool				coherent_mem;
>  };
>  
>  /*********************************************/
> @@ -78,14 +80,22 @@ static void *vb2_dc_cookie(struct vb2_buffer *vb, void *buf_priv)
>  static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
>  {
>  	struct vb2_dc_buf *buf = buf_priv;
> -	struct dma_buf_map map;
> -	int ret;
>  
> -	if (!buf->vaddr && buf->db_attach) {
> -		ret = dma_buf_vmap(buf->db_attach->dmabuf, &map);
> -		buf->vaddr = ret ? NULL : map.vaddr;
> +	if (buf->vaddr)
> +		return buf->vaddr;
> +
> +	if (buf->db_attach) {
> +		struct dma_buf_map map;
> +
> +		if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
> +			buf->vaddr = map.vaddr;
> +
> +		return buf->vaddr;
>  	}
>  
> +	/* Non-coherent memory */
> +	buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
> +

This function can use some comments. What is happening AFAICS is that
buf->vaddr is either set in vb2_dc_alloc_coherent (unless
DMA_ATTR_NO_KERNEL_MAPPING was set), it is obtained through dma_buf_vmap()
if the buffer was attached to a dma_buf, or it is allocated via
dma_vmap_noncontiguous() for non-coherent memory.

But this leaves coherent memory with DMA_ATTR_NO_KERNEL_MAPPING set, what
is vaddr in that case? I think it will call dma_vmap_noncontiguous()
incorrectly in that case, shouldn't there be a check for !buf->coherent_mem
before the call to dma_vmap_noncontiguous()?

It's quite confusing since vaddr can be set in different functions.

>  	return buf->vaddr;
>  }
>  
> @@ -101,13 +111,26 @@ static void vb2_dc_prepare(void *buf_priv)
>  	struct vb2_dc_buf *buf = buf_priv;
>  	struct sg_table *sgt = buf->dma_sgt;
>  
> +	/* This takes care of DMABUF and user-enforced cache sync hint */
>  	if (buf->vb->skip_cache_sync_on_prepare)
>  		return;
>  
> +	/*
> +	 * Coherent MMAP buffers do not need to be synced, unlike USERPTR
> +	 * and non-coherent MMAP buffers.
> +	 */
> +	if (buf->vb->memory == V4L2_MEMORY_MMAP && buf->coherent_mem)
> +		return;
> +
>  	if (!sgt)
>  		return;
>  
> +	/* For both USERPTR and non-coherent MMAP */
>  	dma_sync_sgtable_for_device(buf->dev, sgt, buf->dma_dir);
> +
> +	/* Non-coherent MMAP only */
> +	if (!buf->coherent_mem && buf->vaddr)
> +		flush_kernel_vmap_range(buf->vaddr, buf->size);
>  }
>  
>  static void vb2_dc_finish(void *buf_priv)
> @@ -115,19 +138,46 @@ static void vb2_dc_finish(void *buf_priv)
>  	struct vb2_dc_buf *buf = buf_priv;
>  	struct sg_table *sgt = buf->dma_sgt;
>  
> +	/* This takes care of DMABUF and user-enforced cache sync hint */
>  	if (buf->vb->skip_cache_sync_on_finish)
>  		return;
>  
> +	/*
> +	 * Coherent MMAP buffers do not need to be synced, unlike USERPTR
> +	 * and non-coherent MMAP buffers.
> +	 */
> +	if (buf->vb->memory == V4L2_MEMORY_MMAP && buf->coherent_mem)
> +		return;
> +
>  	if (!sgt)
>  		return;
>  
> +	/* For both USERPTR and non-coherent MMAP */
>  	dma_sync_sgtable_for_cpu(buf->dev, sgt, buf->dma_dir);
> +
> +	/* Non-coherent MMAP only */
> +	if (!buf->coherent_mem && buf->vaddr)
> +		invalidate_kernel_vmap_range(buf->vaddr, buf->size);
>  }
>  
>  /*********************************************/
>  /*        callbacks for MMAP buffers         */
>  /*********************************************/
>  
> +static void __vb2_dc_put(struct vb2_dc_buf *buf)
> +{
> +	if (buf->coherent_mem) {
> +		dma_free_attrs(buf->dev, buf->size, buf->cookie,
> +			       buf->dma_addr, buf->attrs);
> +		return;
> +	}
> +
> +	if (buf->vaddr)
> +		dma_vunmap_noncontiguous(buf->dev, buf->vaddr);
> +	dma_free_noncontiguous(buf->dev, buf->size,
> +			       buf->dma_sgt, buf->dma_addr);
> +}

Is there a reason for creating __vb2_dc_put()? I found it more
a hindrance when reviewing than an advantage. I prefer to have
it moved to vb2_dc_put, that way all the clean up happens in that
single function.

> +
>  static void vb2_dc_put(void *buf_priv)
>  {
>  	struct vb2_dc_buf *buf = buf_priv;
> @@ -139,17 +189,52 @@ static void vb2_dc_put(void *buf_priv)
>  		sg_free_table(buf->sgt_base);
>  		kfree(buf->sgt_base);
>  	}
> -	dma_free_attrs(buf->dev, buf->size, buf->cookie, buf->dma_addr,
> -		       buf->attrs);
> +	__vb2_dc_put(buf);
>  	put_device(buf->dev);
>  	kfree(buf);
>  }
>  
> +static int vb2_dc_alloc_coherent(struct vb2_dc_buf *buf)
> +{
> +	struct vb2_queue *q = buf->vb->vb2_queue;
> +
> +	buf->cookie = dma_alloc_attrs(buf->dev,
> +				      buf->size,
> +				      &buf->dma_addr,
> +				      GFP_KERNEL | q->gfp_flags,
> +				      buf->attrs);
> +	if (!buf->cookie)
> +		return -ENOMEM;
> +	if ((q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING) == 0)
> +		buf->vaddr = buf->cookie;
> +	return 0;
> +}
> +
> +static int vb2_dc_alloc_non_coherent(struct vb2_dc_buf *buf)
> +{
> +	struct vb2_queue *q = buf->vb->vb2_queue;
> +
> +	buf->dma_sgt = dma_alloc_noncontiguous(buf->dev,
> +					       buf->size,
> +					       buf->dma_dir,
> +					       GFP_KERNEL | q->gfp_flags,
> +					       buf->attrs);
> +	if (!buf->dma_sgt)
> +		return -ENOMEM;
> +	/*
> +	 * For requests that need kernel mapping (DMA_ATTR_NO_KERNEL_MAPPING

I'm confused about this comment: DMA_ATTR_NO_KERNEL_MAPPING is only checked
in vb2_dc_alloc_coherent(), so is this comment valid?

> +	 * bit is cleared) we perform dma_vmap_noncontiguous() later, in
> +	 * vb2_dc_vadd().

Typo: vb2_dc_vadd -> vb2_dc_vaddr

> +	 */

So this leaves buf->vaddr to NULL.

Is it possible that after allocating the buffer, the buffer is exported as a
dma_buf and another device calls dma_buf_ops vb2_dc_dmabuf_ops_vmap, which
in turn calls dma_buf_map_set_vaddr(map, buf->vaddr); with a NULL buf->vaddr?

I may be barking up the wrong tree here, to really understand it I would have
to spend more time.

> +	return 0;
> +}
> +
>  static void *vb2_dc_alloc(struct vb2_buffer *vb,
>  			  struct device *dev,
>  			  unsigned long size)
>  {
>  	struct vb2_dc_buf *buf;
> +	int ret;
>  
>  	if (WARN_ON(!dev))
>  		return ERR_PTR(-EINVAL);
> @@ -159,27 +244,28 @@ static void *vb2_dc_alloc(struct vb2_buffer *vb,
>  		return ERR_PTR(-ENOMEM);
>  
>  	buf->attrs = vb->vb2_queue->dma_attrs;
> -	buf->cookie = dma_alloc_attrs(dev, size, &buf->dma_addr,
> -				      GFP_KERNEL | vb->vb2_queue->gfp_flags,
> -				      buf->attrs);
> -	if (!buf->cookie) {
> -		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
> -		kfree(buf);
> -		return ERR_PTR(-ENOMEM);
> -	}
> -
> -	if ((buf->attrs & DMA_ATTR_NO_KERNEL_MAPPING) == 0)
> -		buf->vaddr = buf->cookie;
> +	buf->dma_dir = vb->vb2_queue->dma_dir;
> +	buf->vb = vb;
> +	buf->coherent_mem = vb->vb2_queue->coherent_mem;
>  
> +	buf->size = size;
>  	/* Prevent the device from being released while the buffer is used */
>  	buf->dev = get_device(dev);
> -	buf->size = size;
> -	buf->dma_dir = vb->vb2_queue->dma_dir;
> +
> +	if (buf->coherent_mem)
> +		ret = vb2_dc_alloc_coherent(buf);
> +	else
> +		ret = vb2_dc_alloc_non_coherent(buf);
> +
> +	if (ret) {
> +		dev_err(dev, "dma alloc of size %ld failed\n", size);
> +		kfree(buf);
> +		return ERR_PTR(-ENOMEM);
> +	}
>  
>  	buf->handler.refcount = &buf->refcount;
>  	buf->handler.put = vb2_dc_put;
>  	buf->handler.arg = buf;
> -	buf->vb = vb;
>  
>  	refcount_set(&buf->refcount, 1);
>  
> @@ -196,9 +282,12 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
>  		return -EINVAL;
>  	}
>  
> -	ret = dma_mmap_attrs(buf->dev, vma, buf->cookie,
> -		buf->dma_addr, buf->size, buf->attrs);
> -
> +	if (buf->coherent_mem)
> +		ret = dma_mmap_attrs(buf->dev, vma, buf->cookie, buf->dma_addr,
> +				     buf->size, buf->attrs);
> +	else
> +		ret = dma_mmap_noncontiguous(buf->dev, vma, buf->size,
> +					     buf->dma_sgt);
>  	if (ret) {
>  		pr_err("Remapping memory failed, error: %d\n", ret);
>  		return ret;
> @@ -390,6 +479,9 @@ static struct sg_table *vb2_dc_get_base_sgt(struct vb2_dc_buf *buf)
>  	int ret;
>  	struct sg_table *sgt;
>  
> +	if (!buf->coherent_mem)
> +		return buf->dma_sgt;
> +
>  	sgt = kmalloc(sizeof(*sgt), GFP_KERNEL);
>  	if (!sgt) {
>  		dev_err(buf->dev, "failed to alloc sg table\n");
> 

Regards,

	Hans
Sergey Senozhatsky June 17, 2021, 7:56 a.m. UTC | #3
On (21/06/03 13:59), Hans Verkuil wrote:
[[.]
> >  static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
> >  {
> >  	struct vb2_dc_buf *buf = buf_priv;
> > -	struct dma_buf_map map;
> > -	int ret;
> >  
> > -	if (!buf->vaddr && buf->db_attach) {
> > -		ret = dma_buf_vmap(buf->db_attach->dmabuf, &map);
> > -		buf->vaddr = ret ? NULL : map.vaddr;
> > +	if (buf->vaddr)
> > +		return buf->vaddr;
> > +
> > +	if (buf->db_attach) {
> > +		struct dma_buf_map map;
> > +
> > +		if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
> > +			buf->vaddr = map.vaddr;
> > +
> > +		return buf->vaddr;
> >  	}
> >  
> > +	/* Non-coherent memory */
> > +	buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
> > +
> 
> This function can use some comments. What is happening AFAICS is that
> buf->vaddr is either set in vb2_dc_alloc_coherent (unless
> DMA_ATTR_NO_KERNEL_MAPPING was set), it is obtained through dma_buf_vmap()
> if the buffer was attached to a dma_buf, or it is allocated via
> dma_vmap_noncontiguous() for non-coherent memory.

Yeah, it's complicated. Maybe we can make things more symmetrical.

> But this leaves coherent memory with DMA_ATTR_NO_KERNEL_MAPPING set, what
> is vaddr in that case? I think it will call dma_vmap_noncontiguous()
> incorrectly in that case, shouldn't there be a check for !buf->coherent_mem
> before the call to dma_vmap_noncontiguous()?

Thanks a lot for looking into it.

So vb2_dc_vaddr() can look like this:

static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
{
        struct vb2_dc_buf *buf = buf_priv;

        if (buf->vaddr)
                return buf->vaddr;

        if (buf->db_attach) {
                struct dma_buf_map map;

                if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
                        buf->vaddr = map.vaddr;

                return buf->vaddr;
        }

        if (!buf->coherent_mem)
                buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size,
                                                    buf->dma_sgt);
        return buf->vaddr;
}

And in vb2_dc_alloc functions set vaddr for !DMA_ATTR_NO_KERNEL_MAPPING
in both coherent and non-coherent. So that we probably can have less
branches when ->vaddr is NULL for one type of allocations, and is not
NULL for another.

static int vb2_dc_alloc_coherent(struct vb2_dc_buf *buf)
{
        struct vb2_queue *q = buf->vb->vb2_queue;

        buf->cookie = dma_alloc_attrs(buf->dev,
                                      buf->size,
                                      &buf->dma_addr,
                                      GFP_KERNEL | q->gfp_flags,
                                      buf->attrs);
        if (!buf->cookie)
                return -ENOMEM;

        if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
                return 0;

        buf->vaddr = buf->cookie;
        return 0;
}

static int vb2_dc_alloc_non_coherent(struct vb2_dc_buf *buf)
{
        struct vb2_queue *q = buf->vb->vb2_queue;

        buf->dma_sgt = dma_alloc_noncontiguous(buf->dev,
                                               buf->size,
                                               buf->dma_dir,
                                               GFP_KERNEL | q->gfp_flags,
                                               buf->attrs);
        if (!buf->dma_sgt)
                return -ENOMEM;

        if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
                return 0;

        buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
        if (!buf->vaddr) {
                dma_free_noncontiguous(buf->dev, buf->size,
                                       buf->dma_sgt, buf->dma_addr);
                return -ENOMEM;
        }
        return 0;
}
Sergey Senozhatsky June 17, 2021, 7:56 a.m. UTC | #4
On (21/06/03 13:59), Hans Verkuil wrote:
> >  
> > +static void __vb2_dc_put(struct vb2_dc_buf *buf)
> > +{
> > +	if (buf->coherent_mem) {
> > +		dma_free_attrs(buf->dev, buf->size, buf->cookie,
> > +			       buf->dma_addr, buf->attrs);
> > +		return;
> > +	}
> > +
> > +	if (buf->vaddr)
> > +		dma_vunmap_noncontiguous(buf->dev, buf->vaddr);
> > +	dma_free_noncontiguous(buf->dev, buf->size,
> > +			       buf->dma_sgt, buf->dma_addr);
> > +}
> 
> Is there a reason for creating __vb2_dc_put()? I found it more
> a hindrance when reviewing than an advantage. I prefer to have
> it moved to vb2_dc_put, that way all the clean up happens in that
> single function.

Done.
Christoph Hellwig June 17, 2021, 8:01 a.m. UTC | #5
On Thu, Jun 17, 2021 at 04:56:17PM +0900, Sergey Senozhatsky wrote:
> > This function can use some comments. What is happening AFAICS is that
> > buf->vaddr is either set in vb2_dc_alloc_coherent (unless
> > DMA_ATTR_NO_KERNEL_MAPPING was set), it is obtained through dma_buf_vmap()
> > if the buffer was attached to a dma_buf, or it is allocated via
> > dma_vmap_noncontiguous() for non-coherent memory.
> 
> Yeah, it's complicated. Maybe we can make things more symmetrical.
> 
> > But this leaves coherent memory with DMA_ATTR_NO_KERNEL_MAPPING set, what
> > is vaddr in that case? I think it will call dma_vmap_noncontiguous()
> > incorrectly in that case, shouldn't there be a check for !buf->coherent_mem
> > before the call to dma_vmap_noncontiguous()?
> 
> Thanks a lot for looking into it.

Can we just kill off DMA_ATTR_NO_KERNEL_MAPPING in v4l now?  There really is
no good reason to use that with dma_alloc_noncoherent/noncontiguous
available, and I plan to eventually remove that interface entirely.
Tomasz Figa June 17, 2021, 8:30 a.m. UTC | #6
On Thu, Jun 17, 2021 at 5:01 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Thu, Jun 17, 2021 at 04:56:17PM +0900, Sergey Senozhatsky wrote:
> > > This function can use some comments. What is happening AFAICS is that
> > > buf->vaddr is either set in vb2_dc_alloc_coherent (unless
> > > DMA_ATTR_NO_KERNEL_MAPPING was set), it is obtained through dma_buf_vmap()
> > > if the buffer was attached to a dma_buf, or it is allocated via
> > > dma_vmap_noncontiguous() for non-coherent memory.
> >
> > Yeah, it's complicated. Maybe we can make things more symmetrical.
> >
> > > But this leaves coherent memory with DMA_ATTR_NO_KERNEL_MAPPING set, what
> > > is vaddr in that case? I think it will call dma_vmap_noncontiguous()
> > > incorrectly in that case, shouldn't there be a check for !buf->coherent_mem
> > > before the call to dma_vmap_noncontiguous()?
> >
> > Thanks a lot for looking into it.
>
> Can we just kill off DMA_ATTR_NO_KERNEL_MAPPING in v4l now?  There really is
> no good reason to use that with dma_alloc_noncoherent/noncontiguous
> available, and I plan to eventually remove that interface entirely.

We still have use cases for dma_alloc_coherent() and DMA_ATTR_NO_KERNEL_MAPPING.

Perhaps the way to handle this would be to make the
dma_alloc_coherent() behave the same way as dma_alloc_noncontiguous(),
where it just allocates the memory without handling the kernel
mapping?

Best regards,
Tomasz
Christoph Hellwig June 17, 2021, 8:52 a.m. UTC | #7
On Thu, Jun 17, 2021 at 05:30:11PM +0900, Tomasz Figa wrote:
> We still have use cases for dma_alloc_coherent() and DMA_ATTR_NO_KERNEL_MAPPING.

dma_alloc_coherent does not take a flags argument, so you can't use
DMA_ATTR_NO_KERNEL_MAPPING with it.  What would your use case be here
anyway?  In general DMA_ATTR_NO_KERNEL_MAPPING is rather misnamed, as
usually there is a kernel mapping, just not one that is coherent and
should be used.
Tomasz Figa June 17, 2021, 9:40 a.m. UTC | #8
On Thu, Jun 17, 2021 at 5:52 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Thu, Jun 17, 2021 at 05:30:11PM +0900, Tomasz Figa wrote:
> > We still have use cases for dma_alloc_coherent() and DMA_ATTR_NO_KERNEL_MAPPING.
>
> dma_alloc_coherent does not take a flags argument, so you can't use
> DMA_ATTR_NO_KERNEL_MAPPING with it.  What would your use case be here
> anyway?  In general DMA_ATTR_NO_KERNEL_MAPPING is rather misnamed, as
> usually there is a kernel mapping, just not one that is coherent and
> should be used.

Sorry, I meant dma_alloc_attrs() and yes, it's indeed a misnomer. Our
use case basically has no need for the additional coherent mapping, so
creation of it can be skipped to save some vmalloc space. (Yes, it
probably only matters for 32-bit architectures.)
Christoph Hellwig June 17, 2021, 10:06 a.m. UTC | #9
On Thu, Jun 17, 2021 at 06:40:58PM +0900, Tomasz Figa wrote:
> Sorry, I meant dma_alloc_attrs() and yes, it's indeed a misnomer. Our
> use case basically has no need for the additional coherent mapping, so
> creation of it can be skipped to save some vmalloc space. (Yes, it
> probably only matters for 32-bit architectures.)

Yes, that is the normal use case, and it is solved by using
dma_alloc_noncoherent or dma_alloc_noncontigous without the vmap
step.
Tomasz Figa June 18, 2021, 3:21 a.m. UTC | #10
On Thu, Jun 17, 2021 at 7:07 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Thu, Jun 17, 2021 at 06:40:58PM +0900, Tomasz Figa wrote:
> > Sorry, I meant dma_alloc_attrs() and yes, it's indeed a misnomer. Our
> > use case basically has no need for the additional coherent mapping, so
> > creation of it can be skipped to save some vmalloc space. (Yes, it
> > probably only matters for 32-bit architectures.)
>
> Yes, that is the normal use case, and it is solved by using
> dma_alloc_noncoherent or dma_alloc_noncontigous without the vmap
> step.

True, silly me. Probably not enough coffee at the time I was looking at it.

With that, wouldn't it be possible to completely get rid of
dma_alloc_{coherent,attrs}() and use dma_alloc_noncontiguous() +
optional kernel and/or userspace mapping helper everywhere? (Possibly
renaming it to something as simple as dma_alloc().
Christoph Hellwig June 18, 2021, 4:25 a.m. UTC | #11
On Fri, Jun 18, 2021 at 12:21:33PM +0900, Tomasz Figa wrote:
> > On Thu, Jun 17, 2021 at 06:40:58PM +0900, Tomasz Figa wrote:
> > > Sorry, I meant dma_alloc_attrs() and yes, it's indeed a misnomer. Our
> > > use case basically has no need for the additional coherent mapping, so
> > > creation of it can be skipped to save some vmalloc space. (Yes, it
> > > probably only matters for 32-bit architectures.)
> >
> > Yes, that is the normal use case, and it is solved by using
> > dma_alloc_noncoherent or dma_alloc_noncontigous without the vmap
> > step.
> 
> True, silly me. Probably not enough coffee at the time I was looking at it.
> 
> With that, wouldn't it be possible to completely get rid of
> dma_alloc_{coherent,attrs}() and use dma_alloc_noncontiguous() +
> optional kernel and/or userspace mapping helper everywhere? (Possibly
> renaming it to something as simple as dma_alloc().

Well, dma_alloc_coherent users want a non-cached mapping.  And while
some architectures provide that using a vmap with "uncached" bits in the
PTE to provide that, this:

 a) is not possibly everywhere
 b) even where possible is not always the best idea as it creates mappings
    with differnet cachability bets

And even without that dma_alloc_noncoherent causes less overhead than
dma_alloc_noncontigious if you only need a single contiguous range.

So while I'm happy we have something useful for more complex drivers like
v4l I think the simple dma_alloc_coherent API, including some of the less
crazy flags for dma_alloc_attrs is the right thing to use for more than
90% of the use cases.
Tomasz Figa June 18, 2021, 4:44 a.m. UTC | #12
On Fri, Jun 18, 2021 at 1:25 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Jun 18, 2021 at 12:21:33PM +0900, Tomasz Figa wrote:
> > > On Thu, Jun 17, 2021 at 06:40:58PM +0900, Tomasz Figa wrote:
> > > > Sorry, I meant dma_alloc_attrs() and yes, it's indeed a misnomer. Our
> > > > use case basically has no need for the additional coherent mapping, so
> > > > creation of it can be skipped to save some vmalloc space. (Yes, it
> > > > probably only matters for 32-bit architectures.)
> > >
> > > Yes, that is the normal use case, and it is solved by using
> > > dma_alloc_noncoherent or dma_alloc_noncontigous without the vmap
> > > step.
> >
> > True, silly me. Probably not enough coffee at the time I was looking at it.
> >
> > With that, wouldn't it be possible to completely get rid of
> > dma_alloc_{coherent,attrs}() and use dma_alloc_noncontiguous() +
> > optional kernel and/or userspace mapping helper everywhere? (Possibly
> > renaming it to something as simple as dma_alloc().
>
> Well, dma_alloc_coherent users want a non-cached mapping.  And while
> some architectures provide that using a vmap with "uncached" bits in the
> PTE to provide that, this:
>
>  a) is not possibly everywhere
>  b) even where possible is not always the best idea as it creates mappings
>     with differnet cachability bets

I think this could be addressed by having a dma_vmap() helper that
does the right thing, whether it's vmap() or dma_common_pages_remap()
as appropriate. Or would be this still insufficient for some
architectures?

>
> And even without that dma_alloc_noncoherent causes less overhead than
> dma_alloc_noncontigious if you only need a single contiguous range.
>

Given that behind the scenes dma_alloc_noncontiguous() would also just
call __dma_alloc_pages() for devices that need contiguous pages, would
the overhead be basically the creation of a single-entry sgtable?

> So while I'm happy we have something useful for more complex drivers like
> v4l I think the simple dma_alloc_coherent API, including some of the less
> crazy flags for dma_alloc_attrs is the right thing to use for more than
> 90% of the use cases.

One thing to take into account here is that many drivers use the
existing "simple" way, just because there wasn't a viable alternative
to do something better. Agreed, though, that we shouldn't optimize for
the rare cases.

Best regards,
Tomasz
Christoph Hellwig June 22, 2021, 7:33 a.m. UTC | #13
On Fri, Jun 18, 2021 at 01:44:08PM +0900, Tomasz Figa wrote:
> > Well, dma_alloc_coherent users want a non-cached mapping.  And while
> > some architectures provide that using a vmap with "uncached" bits in the
> > PTE to provide that, this:
> >
> >  a) is not possibly everywhere
> >  b) even where possible is not always the best idea as it creates mappings
> >     with differnet cachability bets
> 
> I think this could be addressed by having a dma_vmap() helper that
> does the right thing, whether it's vmap() or dma_common_pages_remap()
> as appropriate. Or would be this still insufficient for some
> architectures?

It can't always do the right thing.  E.g. for the case where uncached
memory needs to be allocated from a special boot time fixed pool.

> > And even without that dma_alloc_noncoherent causes less overhead than
> > dma_alloc_noncontigious if you only need a single contiguous range.
> >
> 
> Given that behind the scenes dma_alloc_noncontiguous() would also just
> call __dma_alloc_pages() for devices that need contiguous pages, would
> the overhead be basically the creation of a single-entry sgtable?

In the best case: yes.

> > So while I'm happy we have something useful for more complex drivers like
> > v4l I think the simple dma_alloc_coherent API, including some of the less
> > crazy flags for dma_alloc_attrs is the right thing to use for more than
> > 90% of the use cases.
> 
> One thing to take into account here is that many drivers use the
> existing "simple" way, just because there wasn't a viable alternative
> to do something better. Agreed, though, that we shouldn't optimize for
> the rare cases.

While that might be true for a few drivers, it is absolutely not true
for the wide majority.  I think you media people are a little special,
with only the GPU folks contending for "specialness" :)  (although
media handles it way better, gpu folks just create local hacks that
can't work portably).
Sergey Senozhatsky June 25, 2021, 3:10 a.m. UTC | #14
Hi Hans,

On (21/06/17 16:56), Sergey Senozhatsky wrote:
[..]
> static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
> {
>         struct vb2_dc_buf *buf = buf_priv;
> 
>         if (buf->vaddr)
>                 return buf->vaddr;
> 
>         if (buf->db_attach) {
>                 struct dma_buf_map map;
> 
>                 if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
>                         buf->vaddr = map.vaddr;
> 
>                 return buf->vaddr;
>         }
> 
>         if (!buf->coherent_mem)
>                 buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size,
>                                                     buf->dma_sgt);
>         return buf->vaddr;
> }
> 
> And in vb2_dc_alloc functions set vaddr for !DMA_ATTR_NO_KERNEL_MAPPING
> in both coherent and non-coherent. So that we probably can have less
> branches when ->vaddr is NULL for one type of allocations, and is not
> NULL for another.
> 
> static int vb2_dc_alloc_coherent(struct vb2_dc_buf *buf)
> {
>         struct vb2_queue *q = buf->vb->vb2_queue;
> 
>         buf->cookie = dma_alloc_attrs(buf->dev,
>                                       buf->size,
>                                       &buf->dma_addr,
>                                       GFP_KERNEL | q->gfp_flags,
>                                       buf->attrs);
>         if (!buf->cookie)
>                 return -ENOMEM;
> 
>         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
>                 return 0;
> 
>         buf->vaddr = buf->cookie;
>         return 0;
> }
> 
> static int vb2_dc_alloc_non_coherent(struct vb2_dc_buf *buf)
> {
>         struct vb2_queue *q = buf->vb->vb2_queue;
> 
>         buf->dma_sgt = dma_alloc_noncontiguous(buf->dev,
>                                                buf->size,
>                                                buf->dma_dir,
>                                                GFP_KERNEL | q->gfp_flags,
>                                                buf->attrs);
>         if (!buf->dma_sgt)
>                 return -ENOMEM;
> 
>         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
>                 return 0;
> 
>         buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
>         if (!buf->vaddr) {
>                 dma_free_noncontiguous(buf->dev, buf->size,
>                                        buf->dma_sgt, buf->dma_addr);
>                 return -ENOMEM;
>         }
>         return 0;
> }

I guess this should address the case when

"after allocating the buffer, the buffer is exported as a dma_buf and
another device calls dma_buf_ops vb2_dc_dmabuf_ops_vmap, which in turn
calls dma_buf_map_set_vaddr(map, buf->vaddr); with a NULL buf->vaddr"

Because ->vaddr will not be NULL now after allocation for both coherent
and non-coherent buffers (modulo DMA_ATTR_NO_KERNEL_MAPPING requests).

What do you think?
Tomasz Figa July 7, 2021, 12:42 p.m. UTC | #15
On Tue, Jun 22, 2021 at 4:33 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Jun 18, 2021 at 01:44:08PM +0900, Tomasz Figa wrote:
> > > Well, dma_alloc_coherent users want a non-cached mapping.  And while
> > > some architectures provide that using a vmap with "uncached" bits in the
> > > PTE to provide that, this:
> > >
> > >  a) is not possibly everywhere
> > >  b) even where possible is not always the best idea as it creates mappings
> > >     with differnet cachability bets
> >
> > I think this could be addressed by having a dma_vmap() helper that
> > does the right thing, whether it's vmap() or dma_common_pages_remap()
> > as appropriate. Or would be this still insufficient for some
> > architectures?
>
> It can't always do the right thing.  E.g. for the case where uncached
> memory needs to be allocated from a special boot time fixed pool.
>

Fair enough. Thanks for elaborating.

> > > And even without that dma_alloc_noncoherent causes less overhead than
> > > dma_alloc_noncontigious if you only need a single contiguous range.
> > >
> >
> > Given that behind the scenes dma_alloc_noncontiguous() would also just
> > call __dma_alloc_pages() for devices that need contiguous pages, would
> > the overhead be basically the creation of a single-entry sgtable?
>
> In the best case: yes.
>
> > > So while I'm happy we have something useful for more complex drivers like
> > > v4l I think the simple dma_alloc_coherent API, including some of the less
> > > crazy flags for dma_alloc_attrs is the right thing to use for more than
> > > 90% of the use cases.
> >
> > One thing to take into account here is that many drivers use the
> > existing "simple" way, just because there wasn't a viable alternative
> > to do something better. Agreed, though, that we shouldn't optimize for
> > the rare cases.
>
> While that might be true for a few drivers, it is absolutely not true
> for the wide majority.  I think you media people are a little special,
> with only the GPU folks contending for "specialness" :)  (although
> media handles it way better, gpu folks just create local hacks that
> can't work portably).

I don't have the evidence to argue, so let's just leave it at "time
will tell". I think it's great that we have the possibility to do the
more special things and we can see where it goes from now on. :)

Best regards,
Tomasz
Tomasz Figa July 7, 2021, 12:48 p.m. UTC | #16
On Fri, Jun 25, 2021 at 12:10 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Hi Hans,
>
> On (21/06/17 16:56), Sergey Senozhatsky wrote:
> [..]
> > static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
> > {
> >         struct vb2_dc_buf *buf = buf_priv;
> >
> >         if (buf->vaddr)
> >                 return buf->vaddr;
> >
> >         if (buf->db_attach) {
> >                 struct dma_buf_map map;
> >
> >                 if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
> >                         buf->vaddr = map.vaddr;
> >
> >                 return buf->vaddr;
> >         }
> >
> >         if (!buf->coherent_mem)
> >                 buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size,
> >                                                     buf->dma_sgt);
> >         return buf->vaddr;
> > }
> >
> > And in vb2_dc_alloc functions set vaddr for !DMA_ATTR_NO_KERNEL_MAPPING
> > in both coherent and non-coherent. So that we probably can have less
> > branches when ->vaddr is NULL for one type of allocations, and is not
> > NULL for another.

I'd prefer if it stayed as is. This opportunistic mapping as in the
current revision is quite nice, because most of the drivers don't
bother to set DMA_ATTR_NO_KERNEL_MAPPING even if they don't need the
kernel mapping. Also, even if the driver itself doesn't need the
kernel mapping, we can still create one on demand if the DMA-buf
importer demands it from us.

> >
> > static int vb2_dc_alloc_coherent(struct vb2_dc_buf *buf)
> > {
> >         struct vb2_queue *q = buf->vb->vb2_queue;
> >
> >         buf->cookie = dma_alloc_attrs(buf->dev,
> >                                       buf->size,
> >                                       &buf->dma_addr,
> >                                       GFP_KERNEL | q->gfp_flags,
> >                                       buf->attrs);
> >         if (!buf->cookie)
> >                 return -ENOMEM;
> >
> >         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
> >                 return 0;
> >
> >         buf->vaddr = buf->cookie;
> >         return 0;
> > }
> >
> > static int vb2_dc_alloc_non_coherent(struct vb2_dc_buf *buf)
> > {
> >         struct vb2_queue *q = buf->vb->vb2_queue;
> >
> >         buf->dma_sgt = dma_alloc_noncontiguous(buf->dev,
> >                                                buf->size,
> >                                                buf->dma_dir,
> >                                                GFP_KERNEL | q->gfp_flags,
> >                                                buf->attrs);
> >         if (!buf->dma_sgt)
> >                 return -ENOMEM;
> >
> >         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
> >                 return 0;
> >
> >         buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
> >         if (!buf->vaddr) {
> >                 dma_free_noncontiguous(buf->dev, buf->size,
> >                                        buf->dma_sgt, buf->dma_addr);
> >                 return -ENOMEM;
> >         }
> >         return 0;
> > }
>
> I guess this should address the case when
>
> "after allocating the buffer, the buffer is exported as a dma_buf and
> another device calls dma_buf_ops vb2_dc_dmabuf_ops_vmap, which in turn
> calls dma_buf_map_set_vaddr(map, buf->vaddr); with a NULL buf->vaddr"

Sorry, I fail to get what this is about. Where does this quote come from?

>
> Because ->vaddr will not be NULL now after allocation for both coherent
> and non-coherent buffers (modulo DMA_ATTR_NO_KERNEL_MAPPING requests).
>
> What do you think?

Hans, any feedback on this? Thanks.

Best regards,
Tomasz
Sergey Senozhatsky July 7, 2021, 1:29 p.m. UTC | #17
On (21/07/07 21:48), Tomasz Figa wrote:
> > [..]
> > > static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
> > > {
> > >         struct vb2_dc_buf *buf = buf_priv;
> > >
> > >         if (buf->vaddr)
> > >                 return buf->vaddr;
> > >
> > >         if (buf->db_attach) {
> > >                 struct dma_buf_map map;
> > >
> > >                 if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
> > >                         buf->vaddr = map.vaddr;
> > >
> > >                 return buf->vaddr;
> > >         }
> > >
> > >         if (!buf->coherent_mem)
> > >                 buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size,
> > >                                                     buf->dma_sgt);
> > >         return buf->vaddr;
> > > }
> > >
> > > And in vb2_dc_alloc functions set vaddr for !DMA_ATTR_NO_KERNEL_MAPPING
> > > in both coherent and non-coherent. So that we probably can have less
> > > branches when ->vaddr is NULL for one type of allocations, and is not
> > > NULL for another.
> 
> I'd prefer if it stayed as is. This opportunistic mapping as in the
> current revision is quite nice, because most of the drivers don't
> bother to set DMA_ATTR_NO_KERNEL_MAPPING even if they don't need the
> kernel mapping. Also, even if the driver itself doesn't need the
> kernel mapping, we can still create one on demand if the DMA-buf
> importer demands it from us.

[..]

> > > static int vb2_dc_alloc_coherent(struct vb2_dc_buf *buf)
> > > {
> > >         struct vb2_queue *q = buf->vb->vb2_queue;
> > >
> > >         buf->cookie = dma_alloc_attrs(buf->dev,
> > >                                       buf->size,
> > >                                       &buf->dma_addr,
> > >                                       GFP_KERNEL | q->gfp_flags,
> > >                                       buf->attrs);
> > >         if (!buf->cookie)
> > >                 return -ENOMEM;
> > >
> > >         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
> > >                 return 0;
> > >
> > >         buf->vaddr = buf->cookie;
> > >         return 0;
> > > }
> > >
> > > static int vb2_dc_alloc_non_coherent(struct vb2_dc_buf *buf)
> > > {
> > >         struct vb2_queue *q = buf->vb->vb2_queue;
> > >
> > >         buf->dma_sgt = dma_alloc_noncontiguous(buf->dev,
> > >                                                buf->size,
> > >                                                buf->dma_dir,
> > >                                                GFP_KERNEL | q->gfp_flags,
> > >                                                buf->attrs);
> > >         if (!buf->dma_sgt)
> > >                 return -ENOMEM;
> > >
> > >         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
> > >                 return 0;
> > >
> > >         buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
> > >         if (!buf->vaddr) {
> > >                 dma_free_noncontiguous(buf->dev, buf->size,
> > >                                        buf->dma_sgt, buf->dma_addr);
> > >                 return -ENOMEM;
> > >         }
> > >         return 0;
> > > }
> >
> > I guess this should address the case when
> >
> > "after allocating the buffer, the buffer is exported as a dma_buf and
> > another device calls dma_buf_ops vb2_dc_dmabuf_ops_vmap, which in turn
> > calls dma_buf_map_set_vaddr(map, buf->vaddr); with a NULL buf->vaddr"
> 
> Sorry, I fail to get what this is about. Where does this quote come from?

Bottom half of https://lore.kernel.org/lkml/10a0903a-e295-5cba-683a-1eb89a0804ed@xs4all.nl/
Tomasz Figa July 7, 2021, 2:10 p.m. UTC | #18
On Wed, Jul 7, 2021 at 10:29 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (21/07/07 21:48), Tomasz Figa wrote:
> > > [..]
> > > > static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
> > > > {
> > > >         struct vb2_dc_buf *buf = buf_priv;
> > > >
> > > >         if (buf->vaddr)
> > > >                 return buf->vaddr;
> > > >
> > > >         if (buf->db_attach) {
> > > >                 struct dma_buf_map map;
> > > >
> > > >                 if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
> > > >                         buf->vaddr = map.vaddr;
> > > >
> > > >                 return buf->vaddr;
> > > >         }
> > > >
> > > >         if (!buf->coherent_mem)
> > > >                 buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size,
> > > >                                                     buf->dma_sgt);
> > > >         return buf->vaddr;
> > > > }
> > > >
> > > > And in vb2_dc_alloc functions set vaddr for !DMA_ATTR_NO_KERNEL_MAPPING
> > > > in both coherent and non-coherent. So that we probably can have less
> > > > branches when ->vaddr is NULL for one type of allocations, and is not
> > > > NULL for another.
> >
> > I'd prefer if it stayed as is. This opportunistic mapping as in the
> > current revision is quite nice, because most of the drivers don't
> > bother to set DMA_ATTR_NO_KERNEL_MAPPING even if they don't need the
> > kernel mapping. Also, even if the driver itself doesn't need the
> > kernel mapping, we can still create one on demand if the DMA-buf
> > importer demands it from us.
>
> [..]
>
> > > > static int vb2_dc_alloc_coherent(struct vb2_dc_buf *buf)
> > > > {
> > > >         struct vb2_queue *q = buf->vb->vb2_queue;
> > > >
> > > >         buf->cookie = dma_alloc_attrs(buf->dev,
> > > >                                       buf->size,
> > > >                                       &buf->dma_addr,
> > > >                                       GFP_KERNEL | q->gfp_flags,
> > > >                                       buf->attrs);
> > > >         if (!buf->cookie)
> > > >                 return -ENOMEM;
> > > >
> > > >         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
> > > >                 return 0;
> > > >
> > > >         buf->vaddr = buf->cookie;
> > > >         return 0;
> > > > }
> > > >
> > > > static int vb2_dc_alloc_non_coherent(struct vb2_dc_buf *buf)
> > > > {
> > > >         struct vb2_queue *q = buf->vb->vb2_queue;
> > > >
> > > >         buf->dma_sgt = dma_alloc_noncontiguous(buf->dev,
> > > >                                                buf->size,
> > > >                                                buf->dma_dir,
> > > >                                                GFP_KERNEL | q->gfp_flags,
> > > >                                                buf->attrs);
> > > >         if (!buf->dma_sgt)
> > > >                 return -ENOMEM;
> > > >
> > > >         if (q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
> > > >                 return 0;
> > > >
> > > >         buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
> > > >         if (!buf->vaddr) {
> > > >                 dma_free_noncontiguous(buf->dev, buf->size,
> > > >                                        buf->dma_sgt, buf->dma_addr);
> > > >                 return -ENOMEM;
> > > >         }
> > > >         return 0;
> > > > }
> > >
> > > I guess this should address the case when
> > >
> > > "after allocating the buffer, the buffer is exported as a dma_buf and
> > > another device calls dma_buf_ops vb2_dc_dmabuf_ops_vmap, which in turn
> > > calls dma_buf_map_set_vaddr(map, buf->vaddr); with a NULL buf->vaddr"
> >
> > Sorry, I fail to get what this is about. Where does this quote come from?
>
> Bottom half of https://lore.kernel.org/lkml/10a0903a-e295-5cba-683a-1eb89a0804ed@xs4all.nl/

I see, thanks for the pointer. Yes, vb2_dc_dmabuf_ops_vmap() needs to
be changed so that it calls vb2_dc_vaddr() internally instead of
relying on buf->vaddr directly.
Sergey Senozhatsky July 9, 2021, 7:20 a.m. UTC | #19
On (21/07/07 23:10), Tomasz Figa wrote:
> > > >
> > > > I guess this should address the case when
> > > >
> > > > "after allocating the buffer, the buffer is exported as a dma_buf and
> > > > another device calls dma_buf_ops vb2_dc_dmabuf_ops_vmap, which in turn
> > > > calls dma_buf_map_set_vaddr(map, buf->vaddr); with a NULL buf->vaddr"
> > >
> > > Sorry, I fail to get what this is about. Where does this quote come from?
> >
> > Bottom half of https://lore.kernel.org/lkml/10a0903a-e295-5cba-683a-1eb89a0804ed@xs4all.nl/
> 
> I see, thanks for the pointer. Yes, vb2_dc_dmabuf_ops_vmap() needs to
> be changed so that it calls vb2_dc_vaddr() internally instead of
> relying on buf->vaddr directly.

Done.
diff mbox series

Patch

diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
index 1e218bc440c6..40eaaef1565b 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
@@ -17,6 +17,7 @@ 
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/dma-mapping.h>
+#include <linux/highmem.h>
 
 #include <media/videobuf2-v4l2.h>
 #include <media/videobuf2-dma-contig.h>
@@ -42,6 +43,7 @@  struct vb2_dc_buf {
 	struct dma_buf_attachment	*db_attach;
 
 	struct vb2_buffer		*vb;
+	bool				coherent_mem;
 };
 
 /*********************************************/
@@ -78,14 +80,22 @@  static void *vb2_dc_cookie(struct vb2_buffer *vb, void *buf_priv)
 static void *vb2_dc_vaddr(struct vb2_buffer *vb, void *buf_priv)
 {
 	struct vb2_dc_buf *buf = buf_priv;
-	struct dma_buf_map map;
-	int ret;
 
-	if (!buf->vaddr && buf->db_attach) {
-		ret = dma_buf_vmap(buf->db_attach->dmabuf, &map);
-		buf->vaddr = ret ? NULL : map.vaddr;
+	if (buf->vaddr)
+		return buf->vaddr;
+
+	if (buf->db_attach) {
+		struct dma_buf_map map;
+
+		if (!dma_buf_vmap(buf->db_attach->dmabuf, &map))
+			buf->vaddr = map.vaddr;
+
+		return buf->vaddr;
 	}
 
+	/* Non-coherent memory */
+	buf->vaddr = dma_vmap_noncontiguous(buf->dev, buf->size, buf->dma_sgt);
+
 	return buf->vaddr;
 }
 
@@ -101,13 +111,26 @@  static void vb2_dc_prepare(void *buf_priv)
 	struct vb2_dc_buf *buf = buf_priv;
 	struct sg_table *sgt = buf->dma_sgt;
 
+	/* This takes care of DMABUF and user-enforced cache sync hint */
 	if (buf->vb->skip_cache_sync_on_prepare)
 		return;
 
+	/*
+	 * Coherent MMAP buffers do not need to be synced, unlike USERPTR
+	 * and non-coherent MMAP buffers.
+	 */
+	if (buf->vb->memory == V4L2_MEMORY_MMAP && buf->coherent_mem)
+		return;
+
 	if (!sgt)
 		return;
 
+	/* For both USERPTR and non-coherent MMAP */
 	dma_sync_sgtable_for_device(buf->dev, sgt, buf->dma_dir);
+
+	/* Non-coherent MMAP only */
+	if (!buf->coherent_mem && buf->vaddr)
+		flush_kernel_vmap_range(buf->vaddr, buf->size);
 }
 
 static void vb2_dc_finish(void *buf_priv)
@@ -115,19 +138,46 @@  static void vb2_dc_finish(void *buf_priv)
 	struct vb2_dc_buf *buf = buf_priv;
 	struct sg_table *sgt = buf->dma_sgt;
 
+	/* This takes care of DMABUF and user-enforced cache sync hint */
 	if (buf->vb->skip_cache_sync_on_finish)
 		return;
 
+	/*
+	 * Coherent MMAP buffers do not need to be synced, unlike USERPTR
+	 * and non-coherent MMAP buffers.
+	 */
+	if (buf->vb->memory == V4L2_MEMORY_MMAP && buf->coherent_mem)
+		return;
+
 	if (!sgt)
 		return;
 
+	/* For both USERPTR and non-coherent MMAP */
 	dma_sync_sgtable_for_cpu(buf->dev, sgt, buf->dma_dir);
+
+	/* Non-coherent MMAP only */
+	if (!buf->coherent_mem && buf->vaddr)
+		invalidate_kernel_vmap_range(buf->vaddr, buf->size);
 }
 
 /*********************************************/
 /*        callbacks for MMAP buffers         */
 /*********************************************/
 
+static void __vb2_dc_put(struct vb2_dc_buf *buf)
+{
+	if (buf->coherent_mem) {
+		dma_free_attrs(buf->dev, buf->size, buf->cookie,
+			       buf->dma_addr, buf->attrs);
+		return;
+	}
+
+	if (buf->vaddr)
+		dma_vunmap_noncontiguous(buf->dev, buf->vaddr);
+	dma_free_noncontiguous(buf->dev, buf->size,
+			       buf->dma_sgt, buf->dma_addr);
+}
+
 static void vb2_dc_put(void *buf_priv)
 {
 	struct vb2_dc_buf *buf = buf_priv;
@@ -139,17 +189,52 @@  static void vb2_dc_put(void *buf_priv)
 		sg_free_table(buf->sgt_base);
 		kfree(buf->sgt_base);
 	}
-	dma_free_attrs(buf->dev, buf->size, buf->cookie, buf->dma_addr,
-		       buf->attrs);
+	__vb2_dc_put(buf);
 	put_device(buf->dev);
 	kfree(buf);
 }
 
+static int vb2_dc_alloc_coherent(struct vb2_dc_buf *buf)
+{
+	struct vb2_queue *q = buf->vb->vb2_queue;
+
+	buf->cookie = dma_alloc_attrs(buf->dev,
+				      buf->size,
+				      &buf->dma_addr,
+				      GFP_KERNEL | q->gfp_flags,
+				      buf->attrs);
+	if (!buf->cookie)
+		return -ENOMEM;
+	if ((q->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING) == 0)
+		buf->vaddr = buf->cookie;
+	return 0;
+}
+
+static int vb2_dc_alloc_non_coherent(struct vb2_dc_buf *buf)
+{
+	struct vb2_queue *q = buf->vb->vb2_queue;
+
+	buf->dma_sgt = dma_alloc_noncontiguous(buf->dev,
+					       buf->size,
+					       buf->dma_dir,
+					       GFP_KERNEL | q->gfp_flags,
+					       buf->attrs);
+	if (!buf->dma_sgt)
+		return -ENOMEM;
+	/*
+	 * For requests that need kernel mapping (DMA_ATTR_NO_KERNEL_MAPPING
+	 * bit is cleared) we perform dma_vmap_noncontiguous() later, in
+	 * vb2_dc_vadd().
+	 */
+	return 0;
+}
+
 static void *vb2_dc_alloc(struct vb2_buffer *vb,
 			  struct device *dev,
 			  unsigned long size)
 {
 	struct vb2_dc_buf *buf;
+	int ret;
 
 	if (WARN_ON(!dev))
 		return ERR_PTR(-EINVAL);
@@ -159,27 +244,28 @@  static void *vb2_dc_alloc(struct vb2_buffer *vb,
 		return ERR_PTR(-ENOMEM);
 
 	buf->attrs = vb->vb2_queue->dma_attrs;
-	buf->cookie = dma_alloc_attrs(dev, size, &buf->dma_addr,
-				      GFP_KERNEL | vb->vb2_queue->gfp_flags,
-				      buf->attrs);
-	if (!buf->cookie) {
-		dev_err(dev, "dma_alloc_coherent of size %ld failed\n", size);
-		kfree(buf);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	if ((buf->attrs & DMA_ATTR_NO_KERNEL_MAPPING) == 0)
-		buf->vaddr = buf->cookie;
+	buf->dma_dir = vb->vb2_queue->dma_dir;
+	buf->vb = vb;
+	buf->coherent_mem = vb->vb2_queue->coherent_mem;
 
+	buf->size = size;
 	/* Prevent the device from being released while the buffer is used */
 	buf->dev = get_device(dev);
-	buf->size = size;
-	buf->dma_dir = vb->vb2_queue->dma_dir;
+
+	if (buf->coherent_mem)
+		ret = vb2_dc_alloc_coherent(buf);
+	else
+		ret = vb2_dc_alloc_non_coherent(buf);
+
+	if (ret) {
+		dev_err(dev, "dma alloc of size %ld failed\n", size);
+		kfree(buf);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	buf->handler.refcount = &buf->refcount;
 	buf->handler.put = vb2_dc_put;
 	buf->handler.arg = buf;
-	buf->vb = vb;
 
 	refcount_set(&buf->refcount, 1);
 
@@ -196,9 +282,12 @@  static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
 		return -EINVAL;
 	}
 
-	ret = dma_mmap_attrs(buf->dev, vma, buf->cookie,
-		buf->dma_addr, buf->size, buf->attrs);
-
+	if (buf->coherent_mem)
+		ret = dma_mmap_attrs(buf->dev, vma, buf->cookie, buf->dma_addr,
+				     buf->size, buf->attrs);
+	else
+		ret = dma_mmap_noncontiguous(buf->dev, vma, buf->size,
+					     buf->dma_sgt);
 	if (ret) {
 		pr_err("Remapping memory failed, error: %d\n", ret);
 		return ret;
@@ -390,6 +479,9 @@  static struct sg_table *vb2_dc_get_base_sgt(struct vb2_dc_buf *buf)
 	int ret;
 	struct sg_table *sgt;
 
+	if (!buf->coherent_mem)
+		return buf->dma_sgt;
+
 	sgt = kmalloc(sizeof(*sgt), GFP_KERNEL);
 	if (!sgt) {
 		dev_err(buf->dev, "failed to alloc sg table\n");