diff mbox series

[v6,1/4] RDMA/umem: Support importing dma-buf as user memory region

Message ID 1603471201-32588-2-git-send-email-jianxin.xiong@intel.com (mailing list archive)
State Superseded
Headers show
Series RDMA: Add dma-buf support | expand

Commit Message

Xiong, Jianxin Oct. 23, 2020, 4:39 p.m. UTC
Dma-buf is a standard cross-driver buffer sharing mechanism that can be
used to support peer-to-peer access from RDMA devices.

Device memory exported via dma-buf is associated with a file descriptor.
This is passed to the user space as a property associated with the
buffer allocation. When the buffer is registered as a memory region,
the file descriptor is passed to the RDMA driver along with other
parameters.

Implement the common code for importing dma-buf object and mapping
dma-buf pages.

Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/infiniband/core/Makefile      |   2 +-
 drivers/infiniband/core/umem.c        |   4 +
 drivers/infiniband/core/umem_dmabuf.c | 197 ++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/umem_dmabuf.h |  11 ++
 include/rdma/ib_umem.h                |  35 +++++-
 5 files changed, 247 insertions(+), 2 deletions(-)
 create mode 100644 drivers/infiniband/core/umem_dmabuf.c
 create mode 100644 drivers/infiniband/core/umem_dmabuf.h

Comments

Daniel Vetter Oct. 23, 2020, 4:49 p.m. UTC | #1
On Fri, Oct 23, 2020 at 09:39:58AM -0700, Jianxin Xiong wrote:
> Dma-buf is a standard cross-driver buffer sharing mechanism that can be
> used to support peer-to-peer access from RDMA devices.
> 
> Device memory exported via dma-buf is associated with a file descriptor.
> This is passed to the user space as a property associated with the
> buffer allocation. When the buffer is registered as a memory region,
> the file descriptor is passed to the RDMA driver along with other
> parameters.
> 
> Implement the common code for importing dma-buf object and mapping
> dma-buf pages.
> 
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Acked-by: Christian Koenig <christian.koenig@amd.com>
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/infiniband/core/Makefile      |   2 +-
>  drivers/infiniband/core/umem.c        |   4 +
>  drivers/infiniband/core/umem_dmabuf.c | 197 ++++++++++++++++++++++++++++++++++
>  drivers/infiniband/core/umem_dmabuf.h |  11 ++
>  include/rdma/ib_umem.h                |  35 +++++-
>  5 files changed, 247 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/infiniband/core/umem_dmabuf.c
>  create mode 100644 drivers/infiniband/core/umem_dmabuf.h
> 
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index ccf2670..8ab4eea 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -40,5 +40,5 @@ ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
>  				uverbs_std_types_srq.o \
>  				uverbs_std_types_wq.o \
>  				uverbs_std_types_qp.o
> -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
>  ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index e9fecbd..2c45525 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -2,6 +2,7 @@
>   * Copyright (c) 2005 Topspin Communications.  All rights reserved.
>   * Copyright (c) 2005 Cisco Systems.  All rights reserved.
>   * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
>   *
>   * This software is available to you under a choice of one of two
>   * licenses.  You may choose to be licensed under the terms of the GNU
> @@ -43,6 +44,7 @@
>  #include <rdma/ib_umem_odp.h>
>  
>  #include "uverbs.h"
> +#include "umem_dmabuf.h"
>  
>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>  {
> @@ -269,6 +271,8 @@ void ib_umem_release(struct ib_umem *umem)
>  {
>  	if (!umem)
>  		return;
> +	if (umem->is_dmabuf)
> +		return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
>  	if (umem->is_odp)
>  		return ib_umem_odp_release(to_ib_umem_odp(umem));
>  
> diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
> new file mode 100644
> index 0000000..66b234d
> --- /dev/null
> +++ b/drivers/infiniband/core/umem_dmabuf.c
> @@ -0,0 +1,197 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/dma-resv.h>
> +#include <linux/dma-mapping.h>
> +
> +#include "uverbs.h"
> +#include "umem_dmabuf.h"
> +
> +/*
> + * Generate a new dma sg list from a sub range of an existing dma sg list.
> + * Both the input and output have their entries page aligned.
> + */
> +static int ib_umem_dmabuf_sgt_slice(struct sg_table *sgt, u64 offset,
> +				    u64 length, struct sg_table *new_sgt)
> +{
> +	struct scatterlist *sg, *new_sg;
> +	u64 start, end, off, addr, len;
> +	unsigned int new_nents;
> +	int err;
> +	int i;
> +
> +	start = ALIGN_DOWN(offset, PAGE_SIZE);
> +	end = ALIGN(offset + length, PAGE_SIZE);
> +
> +	offset = start;
> +	length = end - start;
> +	new_nents = 0;
> +	for_each_sgtable_dma_sg(sgt, sg, i) {
> +		len = sg_dma_len(sg);
> +		off = min(len, offset);
> +		len -= off;
> +		len = min(len, length);
> +		if (len)
> +			new_nents++;
> +		length -= len;
> +		offset -= off;
> +	}
> +
> +	err = sg_alloc_table(new_sgt, new_nents, GFP_KERNEL);
> +	if (err)
> +		return err;
> +
> +	offset = start;
> +	length = end - start;
> +	new_sg = new_sgt->sgl;
> +	for_each_sgtable_dma_sg(sgt, sg, i) {
> +		addr = sg_dma_address(sg);
> +		len = sg_dma_len(sg);
> +		off = min(len, offset);
> +		addr += off;
> +		len -= off;
> +		len = min(len, length);
> +		if (len) {
> +			sg_dma_address(new_sg) = addr;
> +			sg_dma_len(new_sg) = len;
> +			new_sg = sg_next(new_sg);
> +		}
> +		length -= len;
> +		offset -= off;
> +	}
> +
> +	return 0;
> +}
> +
> +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +	struct sg_table *sgt;
> +	struct dma_fence *fence;
> +	int err;
> +
> +	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> +
> +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> +				     DMA_BIDIRECTIONAL);
> +
> +	if (IS_ERR(sgt))
> +		return PTR_ERR(sgt);
> +
> +	err = ib_umem_dmabuf_sgt_slice(sgt, umem_dmabuf->umem.address,
> +				       umem_dmabuf->umem.length,
> +				       &umem_dmabuf->umem.sg_head);
> +	if (err) {
> +		dma_buf_unmap_attachment(umem_dmabuf->attach, sgt,
> +					 DMA_BIDIRECTIONAL);
> +		return err;
> +	}
> +
> +	umem_dmabuf->umem.nmap = umem_dmabuf->umem.sg_head.nents;
> +	umem_dmabuf->sgt = sgt;
> +
> +	/*
> +	 * Although the sg list is valid now, the content of the pages
> +	 * may be not up-to-date. Wait for the exporter to finish
> +	 * the migration.
> +	 */
> +	fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
> +	if (fence)
> +		dma_fence_wait(fence, false);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
> +
> +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> +
> +	if (!umem_dmabuf->sgt)
> +		return;
> +
> +	sg_free_table(&umem_dmabuf->umem.sg_head);
> +	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
> +				 DMA_BIDIRECTIONAL);
> +	umem_dmabuf->sgt = NULL;
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
> +
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long offset, size_t size,
> +				   int fd, int access,
> +				   const struct dma_buf_attach_ops *ops)
> +{
> +	struct dma_buf *dmabuf;
> +	struct ib_umem_dmabuf *umem_dmabuf;
> +	struct ib_umem *umem;
> +	unsigned long end;
> +	long ret;
> +
> +	if (check_add_overflow(offset, (unsigned long)size, &end))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (unlikely(!ops || !ops->move_notify))
> +		return ERR_PTR(-EINVAL);
> +
> +#ifdef CONFIG_DMA_VIRT_OPS
> +	if (device->dma_device->dma_ops == &dma_virt_ops)
> +		return ERR_PTR(-EINVAL);
> +#endif

Maybe I'm confused, but should we have this check in dma_buf_attach, or at
least in dma_buf_dynamic_attach? The p2pdma functions use that too, and I
can't imagine how zerocopy should work (which is like the entire point of
dma-buf) when we have dma_virt_ops.

A similar problem exists for swiotlb bounce buffers, not sure how that's
solved.
-Daniel

> +
> +	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> +	if (!umem_dmabuf)
> +		return ERR_PTR(-ENOMEM);
> +
> +	umem = &umem_dmabuf->umem;
> +	umem->ibdev = device;
> +	umem->length = size;
> +	umem->address = offset;
> +	umem->iova = offset;
> +	umem->writable = ib_access_writable(access);
> +	umem->is_dmabuf = 1;
> +
> +	dmabuf = dma_buf_get(fd);
> +	if (IS_ERR(dmabuf)) {
> +		ret = PTR_ERR(dmabuf);
> +		goto out_free_umem;
> +	}
> +
> +	umem_dmabuf->attach = dma_buf_dynamic_attach(
> +					dmabuf,
> +					device->dma_device,
> +					ops,
> +					umem_dmabuf);
> +	if (IS_ERR(umem_dmabuf->attach)) {
> +		ret = PTR_ERR(umem_dmabuf->attach);
> +		goto out_release_dmabuf;
> +	}
> +
> +	return umem;
> +
> +out_release_dmabuf:
> +	dma_buf_put(dmabuf);
> +
> +out_free_umem:
> +	kfree(umem_dmabuf);
> +	return ERR_PTR(ret);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> +
> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +	struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
> +
> +	dma_resv_lock(dmabuf->resv, NULL);
> +	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
> +	dma_resv_unlock(dmabuf->resv);
> +
> +	dma_buf_detach(dmabuf, umem_dmabuf->attach);
> +	dma_buf_put(dmabuf);
> +	kfree(umem_dmabuf);
> +}
> diff --git a/drivers/infiniband/core/umem_dmabuf.h b/drivers/infiniband/core/umem_dmabuf.h
> new file mode 100644
> index 0000000..13acf55
> --- /dev/null
> +++ b/drivers/infiniband/core/umem_dmabuf.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#ifndef UMEM_DMABUF_H
> +#define UMEM_DMABUF_H
> +
> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> +
> +#endif /* UMEM_DMABUF_H */
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index 7059750..73a7b19 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -1,6 +1,7 @@
>  /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>  /*
>   * Copyright (c) 2007 Cisco Systems.  All rights reserved.
> + * Copyright (c) 2020 Intel Corporation.  All rights reserved.
>   */
>  
>  #ifndef IB_UMEM_H
> @@ -13,6 +14,7 @@
>  
>  struct ib_ucontext;
>  struct ib_umem_odp;
> +struct dma_buf_attach_ops;
>  
>  struct ib_umem {
>  	struct ib_device       *ibdev;
> @@ -22,12 +24,25 @@ struct ib_umem {
>  	unsigned long		address;
>  	u32 writable : 1;
>  	u32 is_odp : 1;
> +	u32 is_dmabuf : 1;
>  	struct work_struct	work;
>  	struct sg_table sg_head;
>  	int             nmap;
>  	unsigned int    sg_nents;
>  };
>  
> +struct ib_umem_dmabuf {
> +	struct ib_umem umem;
> +	struct dma_buf_attachment *attach;
> +	struct sg_table *sgt;
> +	void *device_context;
> +};
> +
> +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
> +{
> +	return container_of(umem, struct ib_umem_dmabuf, umem);
> +}
> +
>  /* Returns the offset of the umem start relative to the first page. */
>  static inline int ib_umem_offset(struct ib_umem *umem)
>  {
> @@ -79,6 +94,12 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
>  unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
>  				     unsigned long pgsz_bitmap,
>  				     unsigned long virt);
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long offset, size_t size,
> +				   int fd, int access,
> +				   const struct dma_buf_attach_ops *ops);
> +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf);
> +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
>  
>  #else /* CONFIG_INFINIBAND_USER_MEM */
>  
> @@ -101,7 +122,19 @@ static inline unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
>  {
>  	return 0;
>  }
> +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +						 unsigned long offset,
> +						 size_t size, int fd,
> +						 int access,
> +						 struct dma_buf_attach_ops *ops)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
> +static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +	return -EINVAL;
> +}
> +static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { }
>  
>  #endif /* CONFIG_INFINIBAND_USER_MEM */
> -
>  #endif /* IB_UMEM_H */
> -- 
> 1.8.3.1
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Xiong, Jianxin Oct. 23, 2020, 6:09 p.m. UTC | #2
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Friday, October 23, 2020 9:49 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Leon Romanovsky <leon@kernel.org>; Jason Gunthorpe <jgg@ziepe.ca>;
> Doug Ledford <dledford@redhat.com>; Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig <christian.koenig@amd.com>
> Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> On Fri, Oct 23, 2020 at 09:39:58AM -0700, Jianxin Xiong wrote:
> > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > be used to support peer-to-peer access from RDMA devices.
> >
> > Device memory exported via dma-buf is associated with a file descriptor.
> > This is passed to the user space as a property associated with the
> > buffer allocation. When the buffer is registered as a memory region,
> > the file descriptor is passed to the RDMA driver along with other
> > parameters.
> >
> > Implement the common code for importing dma-buf object and mapping
> > dma-buf pages.
> >
> > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Acked-by: Christian Koenig <christian.koenig@amd.com>
> > Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> > ---
> >  drivers/infiniband/core/Makefile      |   2 +-
> >  drivers/infiniband/core/umem.c        |   4 +
> >  drivers/infiniband/core/umem_dmabuf.c | 197
> > ++++++++++++++++++++++++++++++++++
> >  drivers/infiniband/core/umem_dmabuf.h |  11 ++
> >  include/rdma/ib_umem.h                |  35 +++++-
> >  5 files changed, 247 insertions(+), 2 deletions(-)  create mode
> > 100644 drivers/infiniband/core/umem_dmabuf.c
> >  create mode 100644 drivers/infiniband/core/umem_dmabuf.h
> >
> > diff --git a/drivers/infiniband/core/Makefile
> > b/drivers/infiniband/core/Makefile
> > index ccf2670..8ab4eea 100644
> > --- a/drivers/infiniband/core/Makefile
> > +++ b/drivers/infiniband/core/Makefile
> > @@ -40,5 +40,5 @@ ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
> >  				uverbs_std_types_srq.o \
> >  				uverbs_std_types_wq.o \
> >  				uverbs_std_types_qp.o
> > -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> > +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
> >  ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff
> > --git a/drivers/infiniband/core/umem.c
> > b/drivers/infiniband/core/umem.c index e9fecbd..2c45525 100644
> > --- a/drivers/infiniband/core/umem.c
> > +++ b/drivers/infiniband/core/umem.c
> > @@ -2,6 +2,7 @@
> >   * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> >   * Copyright (c) 2005 Cisco Systems.  All rights reserved.
> >   * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> >   *
> >   * This software is available to you under a choice of one of two
> >   * licenses.  You may choose to be licensed under the terms of the
> > GNU @@ -43,6 +44,7 @@  #include <rdma/ib_umem_odp.h>
> >
> >  #include "uverbs.h"
> > +#include "umem_dmabuf.h"
> >
> >  static void __ib_umem_release(struct ib_device *dev, struct ib_umem
> > *umem, int dirty)  { @@ -269,6 +271,8 @@ void ib_umem_release(struct
> > ib_umem *umem)  {
> >  	if (!umem)
> >  		return;
> > +	if (umem->is_dmabuf)
> > +		return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
> >  	if (umem->is_odp)
> >  		return ib_umem_odp_release(to_ib_umem_odp(umem));
> >
> > diff --git a/drivers/infiniband/core/umem_dmabuf.c
> > b/drivers/infiniband/core/umem_dmabuf.c
> > new file mode 100644
> > index 0000000..66b234d
> > --- /dev/null
> > +++ b/drivers/infiniband/core/umem_dmabuf.c
> > @@ -0,0 +1,197 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > +/*
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > + */
> > +
> > +#include <linux/dma-buf.h>
> > +#include <linux/dma-resv.h>
> > +#include <linux/dma-mapping.h>
> > +
> > +#include "uverbs.h"
> > +#include "umem_dmabuf.h"
> > +
> > +/*
> > + * Generate a new dma sg list from a sub range of an existing dma sg list.
> > + * Both the input and output have their entries page aligned.
> > + */
> > +static int ib_umem_dmabuf_sgt_slice(struct sg_table *sgt, u64 offset,
> > +				    u64 length, struct sg_table *new_sgt) {
> > +	struct scatterlist *sg, *new_sg;
> > +	u64 start, end, off, addr, len;
> > +	unsigned int new_nents;
> > +	int err;
> > +	int i;
> > +
> > +	start = ALIGN_DOWN(offset, PAGE_SIZE);
> > +	end = ALIGN(offset + length, PAGE_SIZE);
> > +
> > +	offset = start;
> > +	length = end - start;
> > +	new_nents = 0;
> > +	for_each_sgtable_dma_sg(sgt, sg, i) {
> > +		len = sg_dma_len(sg);
> > +		off = min(len, offset);
> > +		len -= off;
> > +		len = min(len, length);
> > +		if (len)
> > +			new_nents++;
> > +		length -= len;
> > +		offset -= off;
> > +	}
> > +
> > +	err = sg_alloc_table(new_sgt, new_nents, GFP_KERNEL);
> > +	if (err)
> > +		return err;
> > +
> > +	offset = start;
> > +	length = end - start;
> > +	new_sg = new_sgt->sgl;
> > +	for_each_sgtable_dma_sg(sgt, sg, i) {
> > +		addr = sg_dma_address(sg);
> > +		len = sg_dma_len(sg);
> > +		off = min(len, offset);
> > +		addr += off;
> > +		len -= off;
> > +		len = min(len, length);
> > +		if (len) {
> > +			sg_dma_address(new_sg) = addr;
> > +			sg_dma_len(new_sg) = len;
> > +			new_sg = sg_next(new_sg);
> > +		}
> > +		length -= len;
> > +		offset -= off;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > +	struct sg_table *sgt;
> > +	struct dma_fence *fence;
> > +	int err;
> > +
> > +	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > +
> > +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > +				     DMA_BIDIRECTIONAL);
> > +
> > +	if (IS_ERR(sgt))
> > +		return PTR_ERR(sgt);
> > +
> > +	err = ib_umem_dmabuf_sgt_slice(sgt, umem_dmabuf->umem.address,
> > +				       umem_dmabuf->umem.length,
> > +				       &umem_dmabuf->umem.sg_head);
> > +	if (err) {
> > +		dma_buf_unmap_attachment(umem_dmabuf->attach, sgt,
> > +					 DMA_BIDIRECTIONAL);
> > +		return err;
> > +	}
> > +
> > +	umem_dmabuf->umem.nmap = umem_dmabuf->umem.sg_head.nents;
> > +	umem_dmabuf->sgt = sgt;
> > +
> > +	/*
> > +	 * Although the sg list is valid now, the content of the pages
> > +	 * may be not up-to-date. Wait for the exporter to finish
> > +	 * the migration.
> > +	 */
> > +	fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
> > +	if (fence)
> > +		dma_fence_wait(fence, false);
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
> > +
> > +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > +	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > +
> > +	if (!umem_dmabuf->sgt)
> > +		return;
> > +
> > +	sg_free_table(&umem_dmabuf->umem.sg_head);
> > +	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
> > +				 DMA_BIDIRECTIONAL);
> > +	umem_dmabuf->sgt = NULL;
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
> > +
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > +				   unsigned long offset, size_t size,
> > +				   int fd, int access,
> > +				   const struct dma_buf_attach_ops *ops) {
> > +	struct dma_buf *dmabuf;
> > +	struct ib_umem_dmabuf *umem_dmabuf;
> > +	struct ib_umem *umem;
> > +	unsigned long end;
> > +	long ret;
> > +
> > +	if (check_add_overflow(offset, (unsigned long)size, &end))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	if (unlikely(!ops || !ops->move_notify))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +#ifdef CONFIG_DMA_VIRT_OPS
> > +	if (device->dma_device->dma_ops == &dma_virt_ops)
> > +		return ERR_PTR(-EINVAL);
> > +#endif
> 
> Maybe I'm confused, but should we have this check in dma_buf_attach, or at least in dma_buf_dynamic_attach? The p2pdma functions use
> that too, and I can't imagine how zerocopy should work (which is like the entire point of
> dma-buf) when we have dma_virt_ops.
> 
> A similar problem exists for swiotlb bounce buffers, not sure how that's solved.
> -Daniel
> 

This is also checked by dma_buf_dynamic_attach(), not in the common code, but
in the exporter's attach() method. For example, the attach method of 'amdgpu' calls p2pdma_distance_many and would disable p2p if dma_virt_ops is detected.

Here we could instead check the peer2peer flag from the returned 'attach' structure.
 
> > +
> > +	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> > +	if (!umem_dmabuf)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	umem = &umem_dmabuf->umem;
> > +	umem->ibdev = device;
> > +	umem->length = size;
> > +	umem->address = offset;
> > +	umem->iova = offset;
> > +	umem->writable = ib_access_writable(access);
> > +	umem->is_dmabuf = 1;
> > +
> > +	dmabuf = dma_buf_get(fd);
> > +	if (IS_ERR(dmabuf)) {
> > +		ret = PTR_ERR(dmabuf);
> > +		goto out_free_umem;
> > +	}
> > +
> > +	umem_dmabuf->attach = dma_buf_dynamic_attach(
> > +					dmabuf,
> > +					device->dma_device,
> > +					ops,
> > +					umem_dmabuf);
> > +	if (IS_ERR(umem_dmabuf->attach)) {
> > +		ret = PTR_ERR(umem_dmabuf->attach);
> > +		goto out_release_dmabuf;
> > +	}
> > +
> > +	return umem;
> > +
> > +out_release_dmabuf:
> > +	dma_buf_put(dmabuf);
> > +
> > +out_free_umem:
> > +	kfree(umem_dmabuf);
> > +	return ERR_PTR(ret);
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> > +
> > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) {
> > +	struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
> > +
> > +	dma_resv_lock(dmabuf->resv, NULL);
> > +	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
> > +	dma_resv_unlock(dmabuf->resv);
> > +
> > +	dma_buf_detach(dmabuf, umem_dmabuf->attach);
> > +	dma_buf_put(dmabuf);
> > +	kfree(umem_dmabuf);
> > +}
> > diff --git a/drivers/infiniband/core/umem_dmabuf.h
> > b/drivers/infiniband/core/umem_dmabuf.h
> > new file mode 100644
> > index 0000000..13acf55
> > --- /dev/null
> > +++ b/drivers/infiniband/core/umem_dmabuf.h
> > @@ -0,0 +1,11 @@
> > +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> > +/*
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > + */
> > +
> > +#ifndef UMEM_DMABUF_H
> > +#define UMEM_DMABUF_H
> > +
> > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> > +
> > +#endif /* UMEM_DMABUF_H */
> > diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index
> > 7059750..73a7b19 100644
> > --- a/include/rdma/ib_umem.h
> > +++ b/include/rdma/ib_umem.h
> > @@ -1,6 +1,7 @@
> >  /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> >  /*
> >   * Copyright (c) 2007 Cisco Systems.  All rights reserved.
> > + * Copyright (c) 2020 Intel Corporation.  All rights reserved.
> >   */
> >
> >  #ifndef IB_UMEM_H
> > @@ -13,6 +14,7 @@
> >
> >  struct ib_ucontext;
> >  struct ib_umem_odp;
> > +struct dma_buf_attach_ops;
> >
> >  struct ib_umem {
> >  	struct ib_device       *ibdev;
> > @@ -22,12 +24,25 @@ struct ib_umem {
> >  	unsigned long		address;
> >  	u32 writable : 1;
> >  	u32 is_odp : 1;
> > +	u32 is_dmabuf : 1;
> >  	struct work_struct	work;
> >  	struct sg_table sg_head;
> >  	int             nmap;
> >  	unsigned int    sg_nents;
> >  };
> >
> > +struct ib_umem_dmabuf {
> > +	struct ib_umem umem;
> > +	struct dma_buf_attachment *attach;
> > +	struct sg_table *sgt;
> > +	void *device_context;
> > +};
> > +
> > +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem
> > +*umem) {
> > +	return container_of(umem, struct ib_umem_dmabuf, umem); }
> > +
> >  /* Returns the offset of the umem start relative to the first page.
> > */  static inline int ib_umem_offset(struct ib_umem *umem)  { @@ -79,6
> > +94,12 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem,
> > size_t offset,  unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
> >  				     unsigned long pgsz_bitmap,
> >  				     unsigned long virt);
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > +				   unsigned long offset, size_t size,
> > +				   int fd, int access,
> > +				   const struct dma_buf_attach_ops *ops); int
> > +ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void
> > +ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
> >
> >  #else /* CONFIG_INFINIBAND_USER_MEM */
> >
> > @@ -101,7 +122,19 @@ static inline unsigned long
> > ib_umem_find_best_pgsz(struct ib_umem *umem,  {
> >  	return 0;
> >  }
> > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > +						 unsigned long offset,
> > +						 size_t size, int fd,
> > +						 int access,
> > +						 struct dma_buf_attach_ops *ops) {
> > +	return ERR_PTR(-EINVAL);
> > +}
> > +static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf
> > +*umem_dmabuf) {
> > +	return -EINVAL;
> > +}
> > +static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf
> > +*umem_dmabuf) { }
> >
> >  #endif /* CONFIG_INFINIBAND_USER_MEM */
> > -
> >  #endif /* IB_UMEM_H */
> > --
> > 1.8.3.1
> >
> > _______________________________________________
> > dri-devel mailing list
> > dri-devel@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter Oct. 23, 2020, 6:13 p.m. UTC | #3
On Fri, Oct 23, 2020 at 8:09 PM Xiong, Jianxin <jianxin.xiong@intel.com> wrote:
>
>
> > -----Original Message-----
> > From: Daniel Vetter <daniel@ffwll.ch>
> > Sent: Friday, October 23, 2020 9:49 AM
> > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Leon Romanovsky <leon@kernel.org>; Jason Gunthorpe <jgg@ziepe.ca>;
> > Doug Ledford <dledford@redhat.com>; Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig <christian.koenig@amd.com>
> > Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as user memory region
> >
> > On Fri, Oct 23, 2020 at 09:39:58AM -0700, Jianxin Xiong wrote:
> > > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > > be used to support peer-to-peer access from RDMA devices.
> > >
> > > Device memory exported via dma-buf is associated with a file descriptor.
> > > This is passed to the user space as a property associated with the
> > > buffer allocation. When the buffer is registered as a memory region,
> > > the file descriptor is passed to the RDMA driver along with other
> > > parameters.
> > >
> > > Implement the common code for importing dma-buf object and mapping
> > > dma-buf pages.
> > >
> > > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > Acked-by: Christian Koenig <christian.koenig@amd.com>
> > > Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> > > ---
> > >  drivers/infiniband/core/Makefile      |   2 +-
> > >  drivers/infiniband/core/umem.c        |   4 +
> > >  drivers/infiniband/core/umem_dmabuf.c | 197
> > > ++++++++++++++++++++++++++++++++++
> > >  drivers/infiniband/core/umem_dmabuf.h |  11 ++
> > >  include/rdma/ib_umem.h                |  35 +++++-
> > >  5 files changed, 247 insertions(+), 2 deletions(-)  create mode
> > > 100644 drivers/infiniband/core/umem_dmabuf.c
> > >  create mode 100644 drivers/infiniband/core/umem_dmabuf.h
> > >
> > > diff --git a/drivers/infiniband/core/Makefile
> > > b/drivers/infiniband/core/Makefile
> > > index ccf2670..8ab4eea 100644
> > > --- a/drivers/infiniband/core/Makefile
> > > +++ b/drivers/infiniband/core/Makefile
> > > @@ -40,5 +40,5 @@ ib_uverbs-y :=                    uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
> > >                             uverbs_std_types_srq.o \
> > >                             uverbs_std_types_wq.o \
> > >                             uverbs_std_types_qp.o
> > > -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> > > +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
> > >  ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff
> > > --git a/drivers/infiniband/core/umem.c
> > > b/drivers/infiniband/core/umem.c index e9fecbd..2c45525 100644
> > > --- a/drivers/infiniband/core/umem.c
> > > +++ b/drivers/infiniband/core/umem.c
> > > @@ -2,6 +2,7 @@
> > >   * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> > >   * Copyright (c) 2005 Cisco Systems.  All rights reserved.
> > >   * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
> > > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > >   *
> > >   * This software is available to you under a choice of one of two
> > >   * licenses.  You may choose to be licensed under the terms of the
> > > GNU @@ -43,6 +44,7 @@  #include <rdma/ib_umem_odp.h>
> > >
> > >  #include "uverbs.h"
> > > +#include "umem_dmabuf.h"
> > >
> > >  static void __ib_umem_release(struct ib_device *dev, struct ib_umem
> > > *umem, int dirty)  { @@ -269,6 +271,8 @@ void ib_umem_release(struct
> > > ib_umem *umem)  {
> > >     if (!umem)
> > >             return;
> > > +   if (umem->is_dmabuf)
> > > +           return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
> > >     if (umem->is_odp)
> > >             return ib_umem_odp_release(to_ib_umem_odp(umem));
> > >
> > > diff --git a/drivers/infiniband/core/umem_dmabuf.c
> > > b/drivers/infiniband/core/umem_dmabuf.c
> > > new file mode 100644
> > > index 0000000..66b234d
> > > --- /dev/null
> > > +++ b/drivers/infiniband/core/umem_dmabuf.c
> > > @@ -0,0 +1,197 @@
> > > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > > +/*
> > > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > > + */
> > > +
> > > +#include <linux/dma-buf.h>
> > > +#include <linux/dma-resv.h>
> > > +#include <linux/dma-mapping.h>
> > > +
> > > +#include "uverbs.h"
> > > +#include "umem_dmabuf.h"
> > > +
> > > +/*
> > > + * Generate a new dma sg list from a sub range of an existing dma sg list.
> > > + * Both the input and output have their entries page aligned.
> > > + */
> > > +static int ib_umem_dmabuf_sgt_slice(struct sg_table *sgt, u64 offset,
> > > +                               u64 length, struct sg_table *new_sgt) {
> > > +   struct scatterlist *sg, *new_sg;
> > > +   u64 start, end, off, addr, len;
> > > +   unsigned int new_nents;
> > > +   int err;
> > > +   int i;
> > > +
> > > +   start = ALIGN_DOWN(offset, PAGE_SIZE);
> > > +   end = ALIGN(offset + length, PAGE_SIZE);
> > > +
> > > +   offset = start;
> > > +   length = end - start;
> > > +   new_nents = 0;
> > > +   for_each_sgtable_dma_sg(sgt, sg, i) {
> > > +           len = sg_dma_len(sg);
> > > +           off = min(len, offset);
> > > +           len -= off;
> > > +           len = min(len, length);
> > > +           if (len)
> > > +                   new_nents++;
> > > +           length -= len;
> > > +           offset -= off;
> > > +   }
> > > +
> > > +   err = sg_alloc_table(new_sgt, new_nents, GFP_KERNEL);
> > > +   if (err)
> > > +           return err;
> > > +
> > > +   offset = start;
> > > +   length = end - start;
> > > +   new_sg = new_sgt->sgl;
> > > +   for_each_sgtable_dma_sg(sgt, sg, i) {
> > > +           addr = sg_dma_address(sg);
> > > +           len = sg_dma_len(sg);
> > > +           off = min(len, offset);
> > > +           addr += off;
> > > +           len -= off;
> > > +           len = min(len, length);
> > > +           if (len) {
> > > +                   sg_dma_address(new_sg) = addr;
> > > +                   sg_dma_len(new_sg) = len;
> > > +                   new_sg = sg_next(new_sg);
> > > +           }
> > > +           length -= len;
> > > +           offset -= off;
> > > +   }
> > > +
> > > +   return 0;
> > > +}
> > > +
> > > +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > > +   struct sg_table *sgt;
> > > +   struct dma_fence *fence;
> > > +   int err;
> > > +
> > > +   dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > > +
> > > +   sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > > +                                DMA_BIDIRECTIONAL);
> > > +
> > > +   if (IS_ERR(sgt))
> > > +           return PTR_ERR(sgt);
> > > +
> > > +   err = ib_umem_dmabuf_sgt_slice(sgt, umem_dmabuf->umem.address,
> > > +                                  umem_dmabuf->umem.length,
> > > +                                  &umem_dmabuf->umem.sg_head);
> > > +   if (err) {
> > > +           dma_buf_unmap_attachment(umem_dmabuf->attach, sgt,
> > > +                                    DMA_BIDIRECTIONAL);
> > > +           return err;
> > > +   }
> > > +
> > > +   umem_dmabuf->umem.nmap = umem_dmabuf->umem.sg_head.nents;
> > > +   umem_dmabuf->sgt = sgt;
> > > +
> > > +   /*
> > > +    * Although the sg list is valid now, the content of the pages
> > > +    * may be not up-to-date. Wait for the exporter to finish
> > > +    * the migration.
> > > +    */
> > > +   fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
> > > +   if (fence)
> > > +           dma_fence_wait(fence, false);
> > > +
> > > +   return 0;
> > > +}
> > > +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
> > > +
> > > +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) {
> > > +   dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> > > +
> > > +   if (!umem_dmabuf->sgt)
> > > +           return;
> > > +
> > > +   sg_free_table(&umem_dmabuf->umem.sg_head);
> > > +   dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
> > > +                            DMA_BIDIRECTIONAL);
> > > +   umem_dmabuf->sgt = NULL;
> > > +}
> > > +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
> > > +
> > > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > +                              unsigned long offset, size_t size,
> > > +                              int fd, int access,
> > > +                              const struct dma_buf_attach_ops *ops) {
> > > +   struct dma_buf *dmabuf;
> > > +   struct ib_umem_dmabuf *umem_dmabuf;
> > > +   struct ib_umem *umem;
> > > +   unsigned long end;
> > > +   long ret;
> > > +
> > > +   if (check_add_overflow(offset, (unsigned long)size, &end))
> > > +           return ERR_PTR(-EINVAL);
> > > +
> > > +   if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
> > > +           return ERR_PTR(-EINVAL);
> > > +
> > > +   if (unlikely(!ops || !ops->move_notify))
> > > +           return ERR_PTR(-EINVAL);
> > > +
> > > +#ifdef CONFIG_DMA_VIRT_OPS
> > > +   if (device->dma_device->dma_ops == &dma_virt_ops)
> > > +           return ERR_PTR(-EINVAL);
> > > +#endif
> >
> > Maybe I'm confused, but should we have this check in dma_buf_attach, or at least in dma_buf_dynamic_attach? The p2pdma functions use
> > that too, and I can't imagine how zerocopy should work (which is like the entire point of
> > dma-buf) when we have dma_virt_ops.
> >
> > A similar problem exists for swiotlb bounce buffers, not sure how that's solved.
> > -Daniel
> >
>
> This is also checked by dma_buf_dynamic_attach(), not in the common code, but
> in the exporter's attach() method. For example, the attach method of 'amdgpu' calls p2pdma_distance_many and would disable p2p if dma_virt_ops is detected.
>
> Here we could instead check the peer2peer flag from the returned 'attach' structure.

The thing is, if you're a virtual device, there's cpu access functions
for your in dma_buf. So this should not happen, irrespective of p2p or
not. Or I'm totally missing the point here.

And in general I'd say if importers expect certain invariants for
stuff the exporter gives them, then the dma-buf layer should enforce
that. At least with debugging enabled, like we've done for the page
alignement.
-Daniel

>
> > > +
> > > +   umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> > > +   if (!umem_dmabuf)
> > > +           return ERR_PTR(-ENOMEM);
> > > +
> > > +   umem = &umem_dmabuf->umem;
> > > +   umem->ibdev = device;
> > > +   umem->length = size;
> > > +   umem->address = offset;
> > > +   umem->iova = offset;
> > > +   umem->writable = ib_access_writable(access);
> > > +   umem->is_dmabuf = 1;
> > > +
> > > +   dmabuf = dma_buf_get(fd);
> > > +   if (IS_ERR(dmabuf)) {
> > > +           ret = PTR_ERR(dmabuf);
> > > +           goto out_free_umem;
> > > +   }
> > > +
> > > +   umem_dmabuf->attach = dma_buf_dynamic_attach(
> > > +                                   dmabuf,
> > > +                                   device->dma_device,
> > > +                                   ops,
> > > +                                   umem_dmabuf);
> > > +   if (IS_ERR(umem_dmabuf->attach)) {
> > > +           ret = PTR_ERR(umem_dmabuf->attach);
> > > +           goto out_release_dmabuf;
> > > +   }
> > > +
> > > +   return umem;
> > > +
> > > +out_release_dmabuf:
> > > +   dma_buf_put(dmabuf);
> > > +
> > > +out_free_umem:
> > > +   kfree(umem_dmabuf);
> > > +   return ERR_PTR(ret);
> > > +}
> > > +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> > > +
> > > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) {
> > > +   struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
> > > +
> > > +   dma_resv_lock(dmabuf->resv, NULL);
> > > +   ib_umem_dmabuf_unmap_pages(umem_dmabuf);
> > > +   dma_resv_unlock(dmabuf->resv);
> > > +
> > > +   dma_buf_detach(dmabuf, umem_dmabuf->attach);
> > > +   dma_buf_put(dmabuf);
> > > +   kfree(umem_dmabuf);
> > > +}
> > > diff --git a/drivers/infiniband/core/umem_dmabuf.h
> > > b/drivers/infiniband/core/umem_dmabuf.h
> > > new file mode 100644
> > > index 0000000..13acf55
> > > --- /dev/null
> > > +++ b/drivers/infiniband/core/umem_dmabuf.h
> > > @@ -0,0 +1,11 @@
> > > +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> > > +/*
> > > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > > + */
> > > +
> > > +#ifndef UMEM_DMABUF_H
> > > +#define UMEM_DMABUF_H
> > > +
> > > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> > > +
> > > +#endif /* UMEM_DMABUF_H */
> > > diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index
> > > 7059750..73a7b19 100644
> > > --- a/include/rdma/ib_umem.h
> > > +++ b/include/rdma/ib_umem.h
> > > @@ -1,6 +1,7 @@
> > >  /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> > >  /*
> > >   * Copyright (c) 2007 Cisco Systems.  All rights reserved.
> > > + * Copyright (c) 2020 Intel Corporation.  All rights reserved.
> > >   */
> > >
> > >  #ifndef IB_UMEM_H
> > > @@ -13,6 +14,7 @@
> > >
> > >  struct ib_ucontext;
> > >  struct ib_umem_odp;
> > > +struct dma_buf_attach_ops;
> > >
> > >  struct ib_umem {
> > >     struct ib_device       *ibdev;
> > > @@ -22,12 +24,25 @@ struct ib_umem {
> > >     unsigned long           address;
> > >     u32 writable : 1;
> > >     u32 is_odp : 1;
> > > +   u32 is_dmabuf : 1;
> > >     struct work_struct      work;
> > >     struct sg_table sg_head;
> > >     int             nmap;
> > >     unsigned int    sg_nents;
> > >  };
> > >
> > > +struct ib_umem_dmabuf {
> > > +   struct ib_umem umem;
> > > +   struct dma_buf_attachment *attach;
> > > +   struct sg_table *sgt;
> > > +   void *device_context;
> > > +};
> > > +
> > > +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem
> > > +*umem) {
> > > +   return container_of(umem, struct ib_umem_dmabuf, umem); }
> > > +
> > >  /* Returns the offset of the umem start relative to the first page.
> > > */  static inline int ib_umem_offset(struct ib_umem *umem)  { @@ -79,6
> > > +94,12 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem,
> > > size_t offset,  unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
> > >                                  unsigned long pgsz_bitmap,
> > >                                  unsigned long virt);
> > > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > +                              unsigned long offset, size_t size,
> > > +                              int fd, int access,
> > > +                              const struct dma_buf_attach_ops *ops); int
> > > +ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void
> > > +ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
> > >
> > >  #else /* CONFIG_INFINIBAND_USER_MEM */
> > >
> > > @@ -101,7 +122,19 @@ static inline unsigned long
> > > ib_umem_find_best_pgsz(struct ib_umem *umem,  {
> > >     return 0;
> > >  }
> > > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > +                                            unsigned long offset,
> > > +                                            size_t size, int fd,
> > > +                                            int access,
> > > +                                            struct dma_buf_attach_ops *ops) {
> > > +   return ERR_PTR(-EINVAL);
> > > +}
> > > +static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf
> > > +*umem_dmabuf) {
> > > +   return -EINVAL;
> > > +}
> > > +static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf
> > > +*umem_dmabuf) { }
> > >
> > >  #endif /* CONFIG_INFINIBAND_USER_MEM */
> > > -
> > >  #endif /* IB_UMEM_H */
> > > --
> > > 1.8.3.1
> > >
> > > _______________________________________________
> > > dri-devel mailing list
> > > dri-devel@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Jason Gunthorpe Oct. 23, 2020, 6:20 p.m. UTC | #4
On Fri, Oct 23, 2020 at 06:49:11PM +0200, Daniel Vetter wrote:
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > +				   unsigned long offset, size_t size,
> > +				   int fd, int access,
> > +				   const struct dma_buf_attach_ops *ops)
> > +{
> > +	struct dma_buf *dmabuf;
> > +	struct ib_umem_dmabuf *umem_dmabuf;
> > +	struct ib_umem *umem;
> > +	unsigned long end;
> > +	long ret;
> > +
> > +	if (check_add_overflow(offset, (unsigned long)size, &end))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	if (unlikely(!ops || !ops->move_notify))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +#ifdef CONFIG_DMA_VIRT_OPS
> > +	if (device->dma_device->dma_ops == &dma_virt_ops)
> > +		return ERR_PTR(-EINVAL);
> > +#endif
> 
> Maybe I'm confused, but should we have this check in dma_buf_attach, or at
> least in dma_buf_dynamic_attach? The p2pdma functions use that too, and I
> can't imagine how zerocopy should work (which is like the entire point of
> dma-buf) when we have dma_virt_ops.

The problem is we have RDMA drivers that assume SGL's have a valid
struct page, and these hacky/wrong P2P sgls that DMABUF creates cannot
be passed into those drivers.

But maybe this is just a 'drivers are using it wrong' if they call
this function and expect struct pages..

The check in the p2p stuff was done to avoid this too, but it was on a
different flow.

Jason
Daniel Vetter Oct. 24, 2020, 1:45 a.m. UTC | #5
On Fri, Oct 23, 2020 at 8:20 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Oct 23, 2020 at 06:49:11PM +0200, Daniel Vetter wrote:
> > > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > > +                              unsigned long offset, size_t size,
> > > +                              int fd, int access,
> > > +                              const struct dma_buf_attach_ops *ops)
> > > +{
> > > +   struct dma_buf *dmabuf;
> > > +   struct ib_umem_dmabuf *umem_dmabuf;
> > > +   struct ib_umem *umem;
> > > +   unsigned long end;
> > > +   long ret;
> > > +
> > > +   if (check_add_overflow(offset, (unsigned long)size, &end))
> > > +           return ERR_PTR(-EINVAL);
> > > +
> > > +   if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
> > > +           return ERR_PTR(-EINVAL);
> > > +
> > > +   if (unlikely(!ops || !ops->move_notify))
> > > +           return ERR_PTR(-EINVAL);
> > > +
> > > +#ifdef CONFIG_DMA_VIRT_OPS
> > > +   if (device->dma_device->dma_ops == &dma_virt_ops)
> > > +           return ERR_PTR(-EINVAL);
> > > +#endif
> >
> > Maybe I'm confused, but should we have this check in dma_buf_attach, or at
> > least in dma_buf_dynamic_attach? The p2pdma functions use that too, and I
> > can't imagine how zerocopy should work (which is like the entire point of
> > dma-buf) when we have dma_virt_ops.
>
> The problem is we have RDMA drivers that assume SGL's have a valid
> struct page, and these hacky/wrong P2P sgls that DMABUF creates cannot
> be passed into those drivers.
>
> But maybe this is just a 'drivers are using it wrong' if they call
> this function and expect struct pages..
>
> The check in the p2p stuff was done to avoid this too, but it was on a
> different flow.

Yeah definitely don't call dma_buf_map_attachment and expect a page
back. In practice you'll get a page back fairly often, but I don't
think we want to bake that in, maybe we eventually get to non-hacky
dma_addr_t only sgl.

What I'm wondering is whether dma_buf_attach shouldn't reject such
devices directly, instead of each importer having to do that.
-Daniel
Christoph Hellwig Oct. 24, 2020, 7:48 a.m. UTC | #6
On Fri, Oct 23, 2020 at 03:20:05PM -0300, Jason Gunthorpe wrote:
> The problem is we have RDMA drivers that assume SGL's have a valid
> struct page, and these hacky/wrong P2P sgls that DMABUF creates cannot
> be passed into those drivers.

RDMA drivers do not assume scatterlist have a valid struct page,
scatterlists are defined to have a valid struct page.  Any scatterlist
without a struct page is completely buggy.
Jason Gunthorpe Oct. 26, 2020, 12:26 p.m. UTC | #7
On Sat, Oct 24, 2020 at 08:48:07AM +0100, Christoph Hellwig wrote:
> On Fri, Oct 23, 2020 at 03:20:05PM -0300, Jason Gunthorpe wrote:
> > The problem is we have RDMA drivers that assume SGL's have a valid
> > struct page, and these hacky/wrong P2P sgls that DMABUF creates cannot
> > be passed into those drivers.
> 
> RDMA drivers do not assume scatterlist have a valid struct page,
> scatterlists are defined to have a valid struct page.  Any scatterlist
> without a struct page is completely buggy.

It is not just having the struct page, it needs to be a CPU accessible
one for memcpy/etc. They aren't correct with the
MEMORY_DEVICE_PCI_P2PDMA SGLs either.

Jason
Christoph Hellwig Oct. 27, 2020, 8:08 a.m. UTC | #8
On Mon, Oct 26, 2020 at 09:26:37AM -0300, Jason Gunthorpe wrote:
> On Sat, Oct 24, 2020 at 08:48:07AM +0100, Christoph Hellwig wrote:
> > On Fri, Oct 23, 2020 at 03:20:05PM -0300, Jason Gunthorpe wrote:
> > > The problem is we have RDMA drivers that assume SGL's have a valid
> > > struct page, and these hacky/wrong P2P sgls that DMABUF creates cannot
> > > be passed into those drivers.
> > 
> > RDMA drivers do not assume scatterlist have a valid struct page,
> > scatterlists are defined to have a valid struct page.  Any scatterlist
> > without a struct page is completely buggy.
> 
> It is not just having the struct page, it needs to be a CPU accessible
> one for memcpy/etc. They aren't correct with the
> MEMORY_DEVICE_PCI_P2PDMA SGLs either.

Exactly.
Xiong, Jianxin Oct. 27, 2020, 5:32 p.m. UTC | #9
> -----Original Message-----
> From: Christoph Hellwig <hch@infradead.org>
> Sent: Tuesday, October 27, 2020 1:08 AM
> To: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Christoph Hellwig <hch@infradead.org>; Daniel Vetter <daniel@ffwll.ch>; Xiong, Jianxin <jianxin.xiong@intel.com>; linux-
> rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Leon Romanovsky <leon@kernel.org>; Doug Ledford <dledford@redhat.com>;
> Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig <christian.koenig@amd.com>
> Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> On Mon, Oct 26, 2020 at 09:26:37AM -0300, Jason Gunthorpe wrote:
> > On Sat, Oct 24, 2020 at 08:48:07AM +0100, Christoph Hellwig wrote:
> > > On Fri, Oct 23, 2020 at 03:20:05PM -0300, Jason Gunthorpe wrote:
> > > > The problem is we have RDMA drivers that assume SGL's have a valid
> > > > struct page, and these hacky/wrong P2P sgls that DMABUF creates
> > > > cannot be passed into those drivers.
> > >
> > > RDMA drivers do not assume scatterlist have a valid struct page,
> > > scatterlists are defined to have a valid struct page.  Any
> > > scatterlist without a struct page is completely buggy.
> >
> > It is not just having the struct page, it needs to be a CPU accessible
> > one for memcpy/etc. They aren't correct with the
> > MEMORY_DEVICE_PCI_P2PDMA SGLs either.
> 
> Exactly.

In the function ib_umem_dmabuf_sgt_slice() (part of this patch) we could generate
a dma address array instead of filling the scatterlist 'umem->sg_head'. The array
would be handled similar to 'umem_odp->dma_list'. With such change, the RDMA
drivers wouldn't see incorrectly formed scatterlist. The check for dma_virt_ops here
wouldn't be needed either.

Would such proposal address the concern here?

-Jianxin
Jason Gunthorpe Oct. 27, 2020, 7:51 p.m. UTC | #10
On Tue, Oct 27, 2020 at 05:32:26PM +0000, Xiong, Jianxin wrote:
> > On Mon, Oct 26, 2020 at 09:26:37AM -0300, Jason Gunthorpe wrote:
> > > On Sat, Oct 24, 2020 at 08:48:07AM +0100, Christoph Hellwig wrote:
> > > > On Fri, Oct 23, 2020 at 03:20:05PM -0300, Jason Gunthorpe wrote:
> > > > > The problem is we have RDMA drivers that assume SGL's have a valid
> > > > > struct page, and these hacky/wrong P2P sgls that DMABUF creates
> > > > > cannot be passed into those drivers.
> > > >
> > > > RDMA drivers do not assume scatterlist have a valid struct page,
> > > > scatterlists are defined to have a valid struct page.  Any
> > > > scatterlist without a struct page is completely buggy.
> > >
> > > It is not just having the struct page, it needs to be a CPU accessible
> > > one for memcpy/etc. They aren't correct with the
> > > MEMORY_DEVICE_PCI_P2PDMA SGLs either.
> > 
> > Exactly.
> 
> In the function ib_umem_dmabuf_sgt_slice() (part of this patch) we could generate
> a dma address array instead of filling the scatterlist
> 'umem->sg_head'. 

I don't think we should change the format, the SGL comes out of the
dmabuf and all the umem code is able to process it like that. Adding
another datastructure just for this one case is going to be trouble.

Ultimately I'd like to see some 'dma only sgl', CH has been talking
about this for a while. When we have that settled just change
everything connected to umem

I think in the meantime the answer for this patch is drivers just
can't call these APIs and use the struct page side, just like they
can't call the DMA buf API and use the struct page side..

Jason
Jason Gunthorpe Oct. 27, 2020, 8 p.m. UTC | #11
On Fri, Oct 23, 2020 at 09:39:58AM -0700, Jianxin Xiong wrote:
> +/*
> + * Generate a new dma sg list from a sub range of an existing dma sg list.
> + * Both the input and output have their entries page aligned.
> + */
> +static int ib_umem_dmabuf_sgt_slice(struct sg_table *sgt, u64 offset,
> +				    u64 length, struct sg_table *new_sgt)
> +{
> +	struct scatterlist *sg, *new_sg;
> +	u64 start, end, off, addr, len;
> +	unsigned int new_nents;
> +	int err;
> +	int i;
> +
> +	start = ALIGN_DOWN(offset, PAGE_SIZE);
> +	end = ALIGN(offset + length, PAGE_SIZE);
> +
> +	offset = start;
> +	length = end - start;
> +	new_nents = 0;
> +	for_each_sgtable_dma_sg(sgt, sg, i) {
> +		len = sg_dma_len(sg);
> +		off = min(len, offset);
> +		len -= off;
> +		len = min(len, length);
> +		if (len)
> +			new_nents++;
> +		length -= len;
> +		offset -= off;
> +	}
> +
> +	err = sg_alloc_table(new_sgt, new_nents, GFP_KERNEL);
> +	if (err)
> +		return err;

I would really rather not allocate an entirely new table just to take
a slice of an existing SGT. Ideally the expoter API from DMA buf would
prepare the SGL slice properly instead of always giving a whole
buffer.

Alternatively making some small edit to rdma_umem_for_each_dma_block()
and ib_umem_find_best_pgsz() would let it slice the SGL at runtime

You need to rebase on top of this series:

https://patchwork.kernel.org/project/linux-rdma/list/?series=370437

Which makes mlx5 use those new APIs

Jason
Xiong, Jianxin Oct. 27, 2020, 8:11 p.m. UTC | #12
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Tuesday, October 27, 2020 1:00 PM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> On Fri, Oct 23, 2020 at 09:39:58AM -0700, Jianxin Xiong wrote:
> > +/*
> > + * Generate a new dma sg list from a sub range of an existing dma sg list.
> > + * Both the input and output have their entries page aligned.
> > + */
> > +static int ib_umem_dmabuf_sgt_slice(struct sg_table *sgt, u64 offset,
> > +				    u64 length, struct sg_table *new_sgt) {
> > +	struct scatterlist *sg, *new_sg;
> > +	u64 start, end, off, addr, len;
> > +	unsigned int new_nents;
> > +	int err;
> > +	int i;
> > +
> > +	start = ALIGN_DOWN(offset, PAGE_SIZE);
> > +	end = ALIGN(offset + length, PAGE_SIZE);
> > +
> > +	offset = start;
> > +	length = end - start;
> > +	new_nents = 0;
> > +	for_each_sgtable_dma_sg(sgt, sg, i) {
> > +		len = sg_dma_len(sg);
> > +		off = min(len, offset);
> > +		len -= off;
> > +		len = min(len, length);
> > +		if (len)
> > +			new_nents++;
> > +		length -= len;
> > +		offset -= off;
> > +	}
> > +
> > +	err = sg_alloc_table(new_sgt, new_nents, GFP_KERNEL);
> > +	if (err)
> > +		return err;
> 
> I would really rather not allocate an entirely new table just to take a slice of an existing SGT. Ideally the expoter API from DMA buf would
> prepare the SGL slice properly instead of always giving a whole buffer.
> 
> Alternatively making some small edit to rdma_umem_for_each_dma_block() and ib_umem_find_best_pgsz() would let it slice the SGL at
> runtime
> 
> You need to rebase on top of this series:
> 
> https://patchwork.kernel.org/project/linux-rdma/list/?series=370437
> 
> Which makes mlx5 use those new APIs
> 
> Jason

Thanks. Will rebase and work on the runtime slicing.
Xiong, Jianxin Nov. 3, 2020, 5:36 p.m. UTC | #13
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Friday, October 23, 2020 6:45 PM
> To: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Xiong, Jianxin <jianxin.xiong@intel.com>; linux-rdma <linux-rdma@vger.kernel.org>; dri-devel <dri-devel@lists.freedesktop.org>; Leon
> Romanovsky <leon@kernel.org>; Doug Ledford <dledford@redhat.com>; Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig
> <christian.koenig@amd.com>
> Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> > > > +
> > > > +#ifdef CONFIG_DMA_VIRT_OPS
> > > > +   if (device->dma_device->dma_ops == &dma_virt_ops)
> > > > +           return ERR_PTR(-EINVAL); #endif
> > >
> > > Maybe I'm confused, but should we have this check in dma_buf_attach,
> > > or at least in dma_buf_dynamic_attach? The p2pdma functions use that
> > > too, and I can't imagine how zerocopy should work (which is like the
> > > entire point of
> > > dma-buf) when we have dma_virt_ops.
> >
> > The problem is we have RDMA drivers that assume SGL's have a valid
> > struct page, and these hacky/wrong P2P sgls that DMABUF creates cannot
> > be passed into those drivers.
> >
> > But maybe this is just a 'drivers are using it wrong' if they call
> > this function and expect struct pages..
> >
> > The check in the p2p stuff was done to avoid this too, but it was on a
> > different flow.
> 
> Yeah definitely don't call dma_buf_map_attachment and expect a page back. In practice you'll get a page back fairly often, but I don't think
> we want to bake that in, maybe we eventually get to non-hacky dma_addr_t only sgl.
> 
> What I'm wondering is whether dma_buf_attach shouldn't reject such devices directly, instead of each importer having to do that.

Come back here to see if consensus can be reached on who should do the check. My
thinking is that it could be over restrictive for dma_buf_attach to always reject 
dma_virt_ops. According to dma-buf documentation the back storage would be
moved to system area upon mapping unless p2p is requested and can be supported
by the exporter. The sg_list for system memory would have struct page present. 

-Jianxin
  
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter Nov. 3, 2020, 8:43 p.m. UTC | #14
On Tue, Nov 3, 2020 at 6:36 PM Xiong, Jianxin <jianxin.xiong@intel.com> wrote:
>
>
> > -----Original Message-----
> > From: Daniel Vetter <daniel@ffwll.ch>
> > Sent: Friday, October 23, 2020 6:45 PM
> > To: Jason Gunthorpe <jgg@ziepe.ca>
> > Cc: Xiong, Jianxin <jianxin.xiong@intel.com>; linux-rdma <linux-rdma@vger.kernel.org>; dri-devel <dri-devel@lists.freedesktop.org>; Leon
> > Romanovsky <leon@kernel.org>; Doug Ledford <dledford@redhat.com>; Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig
> > <christian.koenig@amd.com>
> > Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as user memory region
> >
> > > > > +
> > > > > +#ifdef CONFIG_DMA_VIRT_OPS
> > > > > +   if (device->dma_device->dma_ops == &dma_virt_ops)
> > > > > +           return ERR_PTR(-EINVAL); #endif
> > > >
> > > > Maybe I'm confused, but should we have this check in dma_buf_attach,
> > > > or at least in dma_buf_dynamic_attach? The p2pdma functions use that
> > > > too, and I can't imagine how zerocopy should work (which is like the
> > > > entire point of
> > > > dma-buf) when we have dma_virt_ops.
> > >
> > > The problem is we have RDMA drivers that assume SGL's have a valid
> > > struct page, and these hacky/wrong P2P sgls that DMABUF creates cannot
> > > be passed into those drivers.
> > >
> > > But maybe this is just a 'drivers are using it wrong' if they call
> > > this function and expect struct pages..
> > >
> > > The check in the p2p stuff was done to avoid this too, but it was on a
> > > different flow.
> >
> > Yeah definitely don't call dma_buf_map_attachment and expect a page back. In practice you'll get a page back fairly often, but I don't think
> > we want to bake that in, maybe we eventually get to non-hacky dma_addr_t only sgl.
> >
> > What I'm wondering is whether dma_buf_attach shouldn't reject such devices directly, instead of each importer having to do that.
>
> Come back here to see if consensus can be reached on who should do the check. My
> thinking is that it could be over restrictive for dma_buf_attach to always reject
> dma_virt_ops. According to dma-buf documentation the back storage would be
> moved to system area upon mapping unless p2p is requested and can be supported
> by the exporter. The sg_list for system memory would have struct page present.

So I'm not clear on what this dma_virt_ops stuff is for, but if it's
an entirely virtual device with cpu access, then you shouldn't do
dma_buf_map_attachment, and then peek at the struct page in the sgl.
Instead you need to use dma_buf_vmap/vunmap and
dma_buf_begin/end_cpu_access. Otherwise the coherency managed is all
potentially busted. Also note that cpu access from the kernel to
dma-buf is a rather niche feature (only some usb device drivers use
it), so expect warts.

If this is the case, then I think dma_buf_attach should check for this
and reject such imports, since that's an importer bug.

If it's otoh something rdma specific, then I guess rdma checking for this is ok.

As a third option, if it's something about the connectivity between
the importing and exporting device, then this should be checked in the
->attach callback the exporter can provide, like the p2p check. The
idea here is that for device specific remapping units (mostly found
ond SoC, and not something like a standard iommu managed by the
dma-api), some of which can even do funny stuff like rotation of 2d
images, can be access by some, but not other. And only the exporter is
aware of these restrictions.

Now I dunno which case this one here is.
-Daniel
Xiong, Jianxin Nov. 4, 2020, 12:01 a.m. UTC | #15
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Tuesday, November 03, 2020 12:43 PM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: Jason Gunthorpe <jgg@ziepe.ca>; linux-rdma <linux-rdma@vger.kernel.org>; dri-devel <dri-devel@lists.freedesktop.org>; Leon
> Romanovsky <leon@kernel.org>; Doug Ledford <dledford@redhat.com>; Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig
> <christian.koenig@amd.com>
> Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> On Tue, Nov 3, 2020 at 6:36 PM Xiong, Jianxin <jianxin.xiong@intel.com> wrote:
> >
> >
> > > -----Original Message-----
> > > From: Daniel Vetter <daniel@ffwll.ch>
> > > Sent: Friday, October 23, 2020 6:45 PM
> > > To: Jason Gunthorpe <jgg@ziepe.ca>
> > > Cc: Xiong, Jianxin <jianxin.xiong@intel.com>; linux-rdma
> > > <linux-rdma@vger.kernel.org>; dri-devel
> > > <dri-devel@lists.freedesktop.org>; Leon Romanovsky
> > > <leon@kernel.org>; Doug Ledford <dledford@redhat.com>; Vetter,
> > > Daniel <daniel.vetter@intel.com>; Christian Koenig
> > > <christian.koenig@amd.com>
> > > Subject: Re: [PATCH v6 1/4] RDMA/umem: Support importing dma-buf as
> > > user memory region
> > >
> > > > > > +
> > > > > > +#ifdef CONFIG_DMA_VIRT_OPS
> > > > > > +   if (device->dma_device->dma_ops == &dma_virt_ops)
> > > > > > +           return ERR_PTR(-EINVAL); #endif
> > > > >
> > > > > Maybe I'm confused, but should we have this check in
> > > > > dma_buf_attach, or at least in dma_buf_dynamic_attach? The
> > > > > p2pdma functions use that too, and I can't imagine how zerocopy
> > > > > should work (which is like the entire point of
> > > > > dma-buf) when we have dma_virt_ops.
> > > >
> > > > The problem is we have RDMA drivers that assume SGL's have a valid
> > > > struct page, and these hacky/wrong P2P sgls that DMABUF creates
> > > > cannot be passed into those drivers.
> > > >
> > > > But maybe this is just a 'drivers are using it wrong' if they call
> > > > this function and expect struct pages..
> > > >
> > > > The check in the p2p stuff was done to avoid this too, but it was
> > > > on a different flow.
> > >
> > > Yeah definitely don't call dma_buf_map_attachment and expect a page
> > > back. In practice you'll get a page back fairly often, but I don't think we want to bake that in, maybe we eventually get to non-hacky
> dma_addr_t only sgl.
> > >
> > > What I'm wondering is whether dma_buf_attach shouldn't reject such devices directly, instead of each importer having to do that.
> >
> > Come back here to see if consensus can be reached on who should do the
> > check. My thinking is that it could be over restrictive for
> > dma_buf_attach to always reject dma_virt_ops. According to dma-buf
> > documentation the back storage would be moved to system area upon
> > mapping unless p2p is requested and can be supported by the exporter. The sg_list for system memory would have struct page present.
> 
> So I'm not clear on what this dma_virt_ops stuff is for, but if it's an entirely virtual device with cpu access, then you shouldn't do
> dma_buf_map_attachment, and then peek at the struct page in the sgl.

This is the key, thanks for pointing that out. I was assuming the importer could
use the struct page if it exists. 

> Instead you need to use dma_buf_vmap/vunmap and dma_buf_begin/end_cpu_access. Otherwise the coherency managed is all potentially
> busted. Also note that cpu access from the kernel to dma-buf is a rather niche feature (only some usb device drivers use it), so expect warts.
> 

dma_virt_ops is a set of dma mapping operations that map page/sgl to virtual addresses
instead of dma addresses. Drivers that use dma_virt_ops would use the mapping
result for cpu access (to emulate DMA) instead of real DMA, and thus the dma mapping
returned from dma-buf is not compatible with the expectation of such drivers. If these
drivers are not allowed to peek into the struct page of the sgl, they have no way to
correctly use the sgl. In this sense I agree that drivers that use dma_virt_ops should not
call dma_buf_attach(). They should use dma_buf_vmap() et al to get cpu access. 

> If this is the case, then I think dma_buf_attach should check for this and reject such imports, since that's an importer bug.

So here we go. I will move the check to dma_buf_dynamic_attach (and dma_buf_attach
is a wrapper over that).

> 
> If it's otoh something rdma specific, then I guess rdma checking for this is ok.
> 
> As a third option, if it's something about the connectivity between the importing and exporting device, then this should be checked in the
> ->attach callback the exporter can provide, like the p2p check. The
> idea here is that for device specific remapping units (mostly found ond SoC, and not something like a standard iommu managed by the dma-
> api), some of which can even do funny stuff like rotation of 2d images, can be access by some, but not other. And only the exporter is
> aware of these restrictions.
> 
> Now I dunno which case this one here is.
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Jason Gunthorpe Nov. 5, 2020, 3:02 p.m. UTC | #16
On Tue, Nov 03, 2020 at 09:43:17PM +0100, Daniel Vetter wrote:

> > > Yeah definitely don't call dma_buf_map_attachment and expect a page back. In practice you'll get a page back fairly often, but I don't think
> > > we want to bake that in, maybe we eventually get to non-hacky dma_addr_t only sgl.
> > >
> > > What I'm wondering is whether dma_buf_attach shouldn't reject such devices directly, instead of each importer having to do that.
> >
> > Come back here to see if consensus can be reached on who should do the check. My
> > thinking is that it could be over restrictive for dma_buf_attach to always reject
> > dma_virt_ops. According to dma-buf documentation the back storage would be
> > moved to system area upon mapping unless p2p is requested and can be supported
> > by the exporter. The sg_list for system memory would have struct page present.
> 
> So I'm not clear on what this dma_virt_ops stuff is for, but if it's
> an entirely virtual device with cpu access, then you shouldn't do
> dma_buf_map_attachment, and then peek at the struct page in the sgl.

Yes, so I think the answer is it is rdma device driver error to call these
new APIs and touch the struct page side of the SGL.

After Christophs series removign dma_virt_ops we could make that more
explicit, like was done for the pci p2p case.


> As a third option, if it's something about the connectivity between
> the importing and exporting device, then this should be checked in the
> ->attach callback the exporter can provide, like the p2p check. The

Drivers doing p2p are supposed to be calling the p2p distance stuff
and p2p dma map stuff which already has these checks.

Doing p2p and skipping all that in the dma buf side we already knew
was a hacky thing.

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index ccf2670..8ab4eea 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -40,5 +40,5 @@  ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				uverbs_std_types_srq.o \
 				uverbs_std_types_wq.o \
 				uverbs_std_types_qp.o
-ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
 ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index e9fecbd..2c45525 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -2,6 +2,7 @@ 
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Cisco Systems.  All rights reserved.
  * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -43,6 +44,7 @@ 
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
+#include "umem_dmabuf.h"
 
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
@@ -269,6 +271,8 @@  void ib_umem_release(struct ib_umem *umem)
 {
 	if (!umem)
 		return;
+	if (umem->is_dmabuf)
+		return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
 	if (umem->is_odp)
 		return ib_umem_odp_release(to_ib_umem_odp(umem));
 
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
new file mode 100644
index 0000000..66b234d
--- /dev/null
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -0,0 +1,197 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
+#include <linux/dma-mapping.h>
+
+#include "uverbs.h"
+#include "umem_dmabuf.h"
+
+/*
+ * Generate a new dma sg list from a sub range of an existing dma sg list.
+ * Both the input and output have their entries page aligned.
+ */
+static int ib_umem_dmabuf_sgt_slice(struct sg_table *sgt, u64 offset,
+				    u64 length, struct sg_table *new_sgt)
+{
+	struct scatterlist *sg, *new_sg;
+	u64 start, end, off, addr, len;
+	unsigned int new_nents;
+	int err;
+	int i;
+
+	start = ALIGN_DOWN(offset, PAGE_SIZE);
+	end = ALIGN(offset + length, PAGE_SIZE);
+
+	offset = start;
+	length = end - start;
+	new_nents = 0;
+	for_each_sgtable_dma_sg(sgt, sg, i) {
+		len = sg_dma_len(sg);
+		off = min(len, offset);
+		len -= off;
+		len = min(len, length);
+		if (len)
+			new_nents++;
+		length -= len;
+		offset -= off;
+	}
+
+	err = sg_alloc_table(new_sgt, new_nents, GFP_KERNEL);
+	if (err)
+		return err;
+
+	offset = start;
+	length = end - start;
+	new_sg = new_sgt->sgl;
+	for_each_sgtable_dma_sg(sgt, sg, i) {
+		addr = sg_dma_address(sg);
+		len = sg_dma_len(sg);
+		off = min(len, offset);
+		addr += off;
+		len -= off;
+		len = min(len, length);
+		if (len) {
+			sg_dma_address(new_sg) = addr;
+			sg_dma_len(new_sg) = len;
+			new_sg = sg_next(new_sg);
+		}
+		length -= len;
+		offset -= off;
+	}
+
+	return 0;
+}
+
+int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	struct sg_table *sgt;
+	struct dma_fence *fence;
+	int err;
+
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
+				     DMA_BIDIRECTIONAL);
+
+	if (IS_ERR(sgt))
+		return PTR_ERR(sgt);
+
+	err = ib_umem_dmabuf_sgt_slice(sgt, umem_dmabuf->umem.address,
+				       umem_dmabuf->umem.length,
+				       &umem_dmabuf->umem.sg_head);
+	if (err) {
+		dma_buf_unmap_attachment(umem_dmabuf->attach, sgt,
+					 DMA_BIDIRECTIONAL);
+		return err;
+	}
+
+	umem_dmabuf->umem.nmap = umem_dmabuf->umem.sg_head.nents;
+	umem_dmabuf->sgt = sgt;
+
+	/*
+	 * Although the sg list is valid now, the content of the pages
+	 * may be not up-to-date. Wait for the exporter to finish
+	 * the migration.
+	 */
+	fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv);
+	if (fence)
+		dma_fence_wait(fence, false);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
+
+void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!umem_dmabuf->sgt)
+		return;
+
+	sg_free_table(&umem_dmabuf->umem.sg_head);
+	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
+				 DMA_BIDIRECTIONAL);
+	umem_dmabuf->sgt = NULL;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
+
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+				   unsigned long offset, size_t size,
+				   int fd, int access,
+				   const struct dma_buf_attach_ops *ops)
+{
+	struct dma_buf *dmabuf;
+	struct ib_umem_dmabuf *umem_dmabuf;
+	struct ib_umem *umem;
+	unsigned long end;
+	long ret;
+
+	if (check_add_overflow(offset, (unsigned long)size, &end))
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(!ops || !ops->move_notify))
+		return ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_DMA_VIRT_OPS
+	if (device->dma_device->dma_ops == &dma_virt_ops)
+		return ERR_PTR(-EINVAL);
+#endif
+
+	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
+	if (!umem_dmabuf)
+		return ERR_PTR(-ENOMEM);
+
+	umem = &umem_dmabuf->umem;
+	umem->ibdev = device;
+	umem->length = size;
+	umem->address = offset;
+	umem->iova = offset;
+	umem->writable = ib_access_writable(access);
+	umem->is_dmabuf = 1;
+
+	dmabuf = dma_buf_get(fd);
+	if (IS_ERR(dmabuf)) {
+		ret = PTR_ERR(dmabuf);
+		goto out_free_umem;
+	}
+
+	umem_dmabuf->attach = dma_buf_dynamic_attach(
+					dmabuf,
+					device->dma_device,
+					ops,
+					umem_dmabuf);
+	if (IS_ERR(umem_dmabuf->attach)) {
+		ret = PTR_ERR(umem_dmabuf->attach);
+		goto out_release_dmabuf;
+	}
+
+	return umem;
+
+out_release_dmabuf:
+	dma_buf_put(dmabuf);
+
+out_free_umem:
+	kfree(umem_dmabuf);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get);
+
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
+
+	dma_resv_lock(dmabuf->resv, NULL);
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+	dma_resv_unlock(dmabuf->resv);
+
+	dma_buf_detach(dmabuf, umem_dmabuf->attach);
+	dma_buf_put(dmabuf);
+	kfree(umem_dmabuf);
+}
diff --git a/drivers/infiniband/core/umem_dmabuf.h b/drivers/infiniband/core/umem_dmabuf.h
new file mode 100644
index 0000000..13acf55
--- /dev/null
+++ b/drivers/infiniband/core/umem_dmabuf.h
@@ -0,0 +1,11 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#ifndef UMEM_DMABUF_H
+#define UMEM_DMABUF_H
+
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
+
+#endif /* UMEM_DMABUF_H */
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 7059750..73a7b19 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -1,6 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2020 Intel Corporation.  All rights reserved.
  */
 
 #ifndef IB_UMEM_H
@@ -13,6 +14,7 @@ 
 
 struct ib_ucontext;
 struct ib_umem_odp;
+struct dma_buf_attach_ops;
 
 struct ib_umem {
 	struct ib_device       *ibdev;
@@ -22,12 +24,25 @@  struct ib_umem {
 	unsigned long		address;
 	u32 writable : 1;
 	u32 is_odp : 1;
+	u32 is_dmabuf : 1;
 	struct work_struct	work;
 	struct sg_table sg_head;
 	int             nmap;
 	unsigned int    sg_nents;
 };
 
+struct ib_umem_dmabuf {
+	struct ib_umem umem;
+	struct dma_buf_attachment *attach;
+	struct sg_table *sgt;
+	void *device_context;
+};
+
+static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
+{
+	return container_of(umem, struct ib_umem_dmabuf, umem);
+}
+
 /* Returns the offset of the umem start relative to the first page. */
 static inline int ib_umem_offset(struct ib_umem *umem)
 {
@@ -79,6 +94,12 @@  int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 				     unsigned long pgsz_bitmap,
 				     unsigned long virt);
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+				   unsigned long offset, size_t size,
+				   int fd, int access,
+				   const struct dma_buf_attach_ops *ops);
+int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf);
+void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -101,7 +122,19 @@  static inline unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 {
 	return 0;
 }
+static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+						 unsigned long offset,
+						 size_t size, int fd,
+						 int access,
+						 struct dma_buf_attach_ops *ops)
+{
+	return ERR_PTR(-EINVAL);
+}
+static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	return -EINVAL;
+}
+static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { }
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */
-
 #endif /* IB_UMEM_H */