[RFC,1/3] RDMA/umem: Support importing dma-buf as user memory region

Message ID 1587056973-101760-2-git-send-email-jianxin.xiong@intel.com (mailing list archive)
State RFC
Series RDMA: Add dma-buf support

Commit Message

Xiong, Jianxin April 16, 2020, 5:09 p.m. UTC
Dma-buf, a standard cross-driver buffer sharing mechanism, is chosen to
be the basis of a non-proprietary approach for supporting RDMA to/from
buffers allocated from device local memory (e.g. GPU VRAM).

Dma-buf is supported by mainstream GPU drivers. By using ioctl calls
via the devices under /dev/dri/, user space applications can allocate
and export GPU buffers as dma-buf objects with associated file
descriptors.
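
For example, with a driver that implements the generic DRM PRIME
interface, a buffer allocated through a driver-specific ioctl can be
exported roughly as follows (a sketch: the render node path and
gem_handle are illustrative, error handling omitted):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <drm/drm.h>

	int drm_fd = open("/dev/dri/renderD128", O_RDWR);
	struct drm_prime_handle args = {
		.handle = gem_handle,	/* GEM handle from the driver's alloc ioctl */
		.flags = DRM_CLOEXEC | DRM_RDWR,
	};

	ioctl(drm_fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args);
	/* on success, args.fd is a dma-buf file descriptor */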

In order to use the exported GPU buffers for RDMA operations, the RDMA
driver needs to be able to import dma-buf objects. This happens at the
time of memory registration. A GPU buffer is registered as a special
type of user space memory region with the dma-buf file descriptor as
an extra parameter. The uverbs API needs to be extended to allow the
extra parameter to be passed from user space to the kernel.

Implement the common code for pinning and mapping dma-buf pages and
add a config option for RDMA driver dma-buf support. The common code
is used by the new uverbs commands introduced in follow-up patches.
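
A driver's memory registration path can then call the new helper in
place of ib_umem_get() when a dma-buf fd is supplied, roughly as
follows (a sketch only; the actual uverbs plumbing is added by the
follow-up patches):

	umem = ib_umem_dmabuf_get(pd->device, start, length, dmabuf_fd,
				  access);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	/* umem->sg_head and umem->nmap now describe the mapped buffer */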

Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
---
 drivers/infiniband/Kconfig            |  10 ++++
 drivers/infiniband/core/Makefile      |   1 +
 drivers/infiniband/core/umem.c        |   3 +
 drivers/infiniband/core/umem_dmabuf.c | 100 ++++++++++++++++++++++++++++++++++
 include/rdma/ib_umem.h                |   2 +
 include/rdma/ib_umem_dmabuf.h         |  50 +++++++++++++++++
 6 files changed, 166 insertions(+)
 create mode 100644 drivers/infiniband/core/umem_dmabuf.c
 create mode 100644 include/rdma/ib_umem_dmabuf.h

Comments

Jason Gunthorpe April 16, 2020, 6:01 p.m. UTC | #1
On Thu, Apr 16, 2020 at 10:09:31AM -0700, Jianxin Xiong wrote:
> Dma-buf, a standard cross-driver buffer sharing mechanism, is chosen to
> be the basis of a non-proprietary approach for supporting RDMA to/from
> buffers allocated from device local memory (e.g. GPU VRAM).
> 
> Dma-buf is supported by mainstream GPU drivers. By using ioctl calls
> via the devices under /dev/dri/, user space applications can allocate
> and export GPU buffers as dma-buf objects with associated file
> descriptors.
> 
> In order to use the exported GPU buffers for RDMA operations, the RDMA
> driver needs to be able to import dma-buf objects. This happens at the
> time of memory registration. A GPU buffer is registered as a special
> type of user space memory region with the dma-buf file descriptor as
> an extra parameter. The uverbs API needs to be extended to allow the
> extra parameter to be passed from user space to the kernel.
> 
> Implement the common code for pinning and mapping dma-buf pages and
> add a config option for RDMA driver dma-buf support. The common code
> is used by the new uverbs commands introduced in follow-up patches.
> 
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
>  drivers/infiniband/Kconfig            |  10 ++++
>  drivers/infiniband/core/Makefile      |   1 +
>  drivers/infiniband/core/umem.c        |   3 +
>  drivers/infiniband/core/umem_dmabuf.c | 100 ++++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h                |   2 +
>  include/rdma/ib_umem_dmabuf.h         |  50 +++++++++++++++++
>  6 files changed, 166 insertions(+)
>  create mode 100644 drivers/infiniband/core/umem_dmabuf.c
>  create mode 100644 include/rdma/ib_umem_dmabuf.h
> 
> diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
> index ade8638..1dcfc59 100644
> --- a/drivers/infiniband/Kconfig
> +++ b/drivers/infiniband/Kconfig
> @@ -63,6 +63,16 @@ config INFINIBAND_ON_DEMAND_PAGING
>  	  memory regions without pinning their pages, fetching the
>  	  pages on demand instead.
>  
> +config INFINIBAND_DMABUF
> +	bool "InfiniBand dma-buf support"
> +	depends on INFINIBAND_USER_MEM

select ..some kind of DMABUF symbol...

> +	default n
> +	help
> +	  Support for dma-buf based user memory.
> +	  This allows userspace processes to register memory regions
> +	  backed by device memory exported as dma-buf, and thus
> +	  enables RDMA operations using device memory.

See remarks on the cover letter

> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long addr, size_t size,
> +				   int dmabuf_fd, int access)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf;
> +	struct sg_table *sgt;
> +	enum dma_data_direction dir;
> +	long ret;
> +
> +	if (((addr + size) < addr) ||
> +	    PAGE_ALIGN(addr + size) < (addr + size))
> +		return ERR_PTR(-EINVAL);

This math validating the user parameters can overflow in various bad
ways
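
eg an overflow-safe version using check_add_overflow() from
<linux/overflow.h> could look like this (sketch):

	unsigned long end;

	if (check_add_overflow(addr, (unsigned long)size, &end) ||
	    PAGE_ALIGN(end) < end)
		return ERR_PTR(-EINVAL);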

> +	if (!can_do_mlock())
> +		return ERR_PTR(-EPERM);

Why?

> +	umem_dmabuf->umem.ibdev = device;
> +	umem_dmabuf->umem.length = size;
> +	umem_dmabuf->umem.address = addr;
> +	umem_dmabuf->umem.writable = ib_access_writable(access);
> +	umem_dmabuf->umem.is_dmabuf = 1;
> +	umem_dmabuf->umem.owning_mm = current->mm;

Why does this need to store owning_mm?

> +	umem_dmabuf->fd = dmabuf_fd;

Doesn't need to store fd

> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +	enum dma_data_direction dir;
> +
> +	dir = umem_dmabuf->umem.writable ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> +
> +	/*
> +	 * Only use the original sgt returned from dma_buf_map_attachment(),
> +	 * otherwise the scatterlist may be freed twice due to the map caching
> +	 * mechanism.
> +	 */
> +	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, dir);
> +	dma_buf_detach(umem_dmabuf->dmabuf, umem_dmabuf->attach);
> +	dma_buf_put(umem_dmabuf->dmabuf);
> +	mmdrop(umem_dmabuf->umem.owning_mm);
> +	kfree(umem_dmabuf);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_release);

Why is this an EXPORT_SYMBOL?

> diff --git a/include/rdma/ib_umem_dmabuf.h b/include/rdma/ib_umem_dmabuf.h
> new file mode 100644
> index 0000000..e82b205
> --- /dev/null
> +++ b/include/rdma/ib_umem_dmabuf.h

This should not be a public header

> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#ifndef IB_UMEM_DMABUF_H
> +#define IB_UMEM_DMABUF_H
> +
> +#include <linux/dma-buf.h>
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +struct ib_umem_dmabuf {
> +	struct ib_umem	umem;
> +	int		fd;
> +	struct dma_buf	*dmabuf;
> +	struct dma_buf_attachment *attach;
> +	struct sg_table *sgt;

Probably the ib_umem should be changed to hold a struct sg_table. Not
clear to me why dma_buf wants to allocate this as a pointer..

Also this can be in umem_dmabuf.c

> +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
> +{
> +	return container_of(umem, struct ib_umem_dmabuf, umem);
> +}

Put in ummem_dmabuf.c

> +
> +#ifdef CONFIG_INFINIBAND_DMABUF
> +
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long addr, size_t size,
> +				   int dmabuf_fd, int access);
> +
> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> +
> +#else /* CONFIG_INFINIBAND_DMABUF */
> +
> +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +						 unsigned long addr,
> +						 size_t size, int dmabuf_fd,
> +						 int access)
> +{
> +	return ERR_PTR(-EINVAL);
> +}

This should be in the existing ib_umem.h

> +
> +static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +}

In uverbs_priv.h

Jason
Leon Romanovsky April 16, 2020, 6:02 p.m. UTC | #2
On Thu, Apr 16, 2020 at 10:09:31AM -0700, Jianxin Xiong wrote:
> Dma-buf, a standard cross-driver buffer sharing mechanism, is chosen to
> be the basis of a non-proprietary approach for supporting RDMA to/from
> buffers allocated from device local memory (e.g. GPU VRAM).

Where can I read more about the "is chosen to be the basis of a
non-proprietary approach" part of the sentence?

Especially HMM vs. dma-buf in this regard.

>
> Dma-buf is supported by mainstream GPU drivers. By using ioctl calls
> via the devices under /dev/dri/, user space applications can allocate
> and export GPU buffers as dma-buf objects with associated file
> descriptors.
>
> In order to use the exported GPU buffers for RDMA operations, the RDMA
> driver needs to be able to import dma-buf objects. This happens at the
> time of memory registration. A GPU buffer is registered as a special
> type of user space memory region with the dma-buf file descriptor as
> an extra parameter. The uverbs API needs to be extended to allow the
> extra parameter to be passed from user space to the kernel.
>
> Implement the common code for pinning and mapping dma-buf pages and
> add a config option for RDMA driver dma-buf support. The common code
> is used by the new uverbs commands introduced in follow-up patches.
>
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> ---
>  drivers/infiniband/Kconfig            |  10 ++++
>  drivers/infiniband/core/Makefile      |   1 +
>  drivers/infiniband/core/umem.c        |   3 +
>  drivers/infiniband/core/umem_dmabuf.c | 100 ++++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h                |   2 +
>  include/rdma/ib_umem_dmabuf.h         |  50 +++++++++++++++++
>  6 files changed, 166 insertions(+)
>  create mode 100644 drivers/infiniband/core/umem_dmabuf.c
>  create mode 100644 include/rdma/ib_umem_dmabuf.h
>
> diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
> index ade8638..1dcfc59 100644
> --- a/drivers/infiniband/Kconfig
> +++ b/drivers/infiniband/Kconfig
> @@ -63,6 +63,16 @@ config INFINIBAND_ON_DEMAND_PAGING
>  	  memory regions without pinning their pages, fetching the
>  	  pages on demand instead.
>
> +config INFINIBAND_DMABUF

There is no need to add extra config, it is not different from any
other verbs feature which is handled by some sort of mask.

> +	bool "InfiniBand dma-buf support"
> +	depends on INFINIBAND_USER_MEM
> +	default n
> +	help
> +	  Support for dma-buf based user memory.
> +	  This allows userspace processes to register memory regions
> +	  backed by device memory exported as dma-buf, and thus
> +	  enables RDMA operations using device memory.
> +
>  config INFINIBAND_ADDR_TRANS
>  	bool "RDMA/CM"
>  	depends on INFINIBAND
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index d1b14887..7981d0f 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -39,3 +39,4 @@ ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
>  				uverbs_std_types_async_fd.o
>  ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
>  ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
> +ib_uverbs-$(CONFIG_INFINIBAND_DMABUF) += umem_dmabuf.o
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 82455a1..54b35df 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -40,6 +40,7 @@
>  #include <linux/slab.h>
>  #include <linux/pagemap.h>
>  #include <rdma/ib_umem_odp.h>
> +#include <rdma/ib_umem_dmabuf.h>
>
>  #include "uverbs.h"
>
> @@ -317,6 +318,8 @@ void ib_umem_release(struct ib_umem *umem)
>  {
>  	if (!umem)
>  		return;
> +	if (umem->is_dmabuf)
> +		return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
>  	if (umem->is_odp)
>  		return ib_umem_odp_release(to_ib_umem_odp(umem));
>
> diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
> new file mode 100644
> index 0000000..325d44f
> --- /dev/null
> +++ b/drivers/infiniband/core/umem_dmabuf.c
> @@ -0,0 +1,100 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/sched/mm.h>
> +#include <linux/dma-mapping.h>
> +#include <rdma/ib_umem_dmabuf.h>
> +
> +#include "uverbs.h"
> +
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long addr, size_t size,
> +				   int dmabuf_fd, int access)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf;
> +	struct sg_table *sgt;
> +	enum dma_data_direction dir;
> +	long ret;
> +
> +	if (((addr + size) < addr) ||
> +	    PAGE_ALIGN(addr + size) < (addr + size))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (!can_do_mlock())
> +		return ERR_PTR(-EPERM);
> +
> +	if (access & IB_ACCESS_ON_DEMAND)
> +		return ERR_PTR(-EOPNOTSUPP);

Does dma-buf really need to prohibit ODP?

> +
> +	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> +	if (!umem_dmabuf)
> +		return ERR_PTR(-ENOMEM);
> +
> +	umem_dmabuf->umem.ibdev = device;
> +	umem_dmabuf->umem.length = size;
> +	umem_dmabuf->umem.address = addr;
> +	umem_dmabuf->umem.writable = ib_access_writable(access);
> +	umem_dmabuf->umem.is_dmabuf = 1;
> +	umem_dmabuf->umem.owning_mm = current->mm;
> +	mmgrab(umem_dmabuf->umem.owning_mm);
> +
> +	umem_dmabuf->fd = dmabuf_fd;
> +	umem_dmabuf->dmabuf = dma_buf_get(umem_dmabuf->fd);
> +	if (IS_ERR(umem_dmabuf->dmabuf)) {
> +		ret = PTR_ERR(umem_dmabuf->dmabuf);
> +		goto out_free_umem;
> +	}
> +
> +	umem_dmabuf->attach = dma_buf_attach(umem_dmabuf->dmabuf,
> +					     device->dma_device);
> +	if (IS_ERR(umem_dmabuf->attach)) {
> +		ret = PTR_ERR(umem_dmabuf->attach);
> +		goto out_release_dmabuf;
> +	}
> +
> +	dir = umem_dmabuf->umem.writable ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> +	sgt = dma_buf_map_attachment(umem_dmabuf->attach, dir);
> +	if (IS_ERR(sgt)) {
> +		ret = PTR_ERR(sgt);
> +		goto out_detach_dmabuf;
> +	}
> +
> +	umem_dmabuf->sgt = sgt;
> +	umem_dmabuf->umem.sg_head = *sgt;
> +	umem_dmabuf->umem.nmap = sgt->nents;
> +	return &umem_dmabuf->umem;
> +
> +out_detach_dmabuf:
> +	dma_buf_detach(umem_dmabuf->dmabuf, umem_dmabuf->attach);
> +
> +out_release_dmabuf:
> +	dma_buf_put(umem_dmabuf->dmabuf);
> +
> +out_free_umem:
> +	mmdrop(umem_dmabuf->umem.owning_mm);
> +	kfree(umem_dmabuf);
> +	return ERR_PTR(ret);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> +
> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +	enum dma_data_direction dir;
> +
> +	dir = umem_dmabuf->umem.writable ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> +
> +	/*
> +	 * Only use the original sgt returned from dma_buf_map_attachment(),
> +	 * otherwise the scatterlist may be freed twice due to the map caching
> +	 * mechanism.
> +	 */
> +	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, dir);
> +	dma_buf_detach(umem_dmabuf->dmabuf, umem_dmabuf->attach);
> +	dma_buf_put(umem_dmabuf->dmabuf);
> +	mmdrop(umem_dmabuf->umem.owning_mm);
> +	kfree(umem_dmabuf);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_release);
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index e3518fd..026a3cf 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -40,6 +40,7 @@
>
>  struct ib_ucontext;
>  struct ib_umem_odp;
> +struct ib_umem_dmabuf;
>
>  struct ib_umem {
>  	struct ib_device       *ibdev;
> @@ -48,6 +49,7 @@ struct ib_umem {
>  	unsigned long		address;
>  	u32 writable : 1;
>  	u32 is_odp : 1;
> +	u32 is_dmabuf : 1;
>  	struct work_struct	work;
>  	struct sg_table sg_head;
>  	int             nmap;
> diff --git a/include/rdma/ib_umem_dmabuf.h b/include/rdma/ib_umem_dmabuf.h
> new file mode 100644
> index 0000000..e82b205
> --- /dev/null
> +++ b/include/rdma/ib_umem_dmabuf.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#ifndef IB_UMEM_DMABUF_H
> +#define IB_UMEM_DMABUF_H
> +
> +#include <linux/dma-buf.h>
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +struct ib_umem_dmabuf {
> +	struct ib_umem	umem;
> +	int		fd;
> +	struct dma_buf	*dmabuf;
> +	struct dma_buf_attachment *attach;
> +	struct sg_table *sgt;
> +};
> +
> +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
> +{
> +	return container_of(umem, struct ib_umem_dmabuf, umem);
> +}
> +
> +#ifdef CONFIG_INFINIBAND_DMABUF
> +
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long addr, size_t size,
> +				   int dmabuf_fd, int access);
> +
> +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> +
> +#else /* CONFIG_INFINIBAND_DMABUF */
> +
> +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +						 unsigned long addr,
> +						 size_t size, int dmabuf_fd,
> +						 int access)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
> +
> +static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> +{
> +}
> +
> +#endif /* CONFIG_INFINIBAND_DMABUF */
> +
> +#endif /* IB_UMEM_DMABUF_H */
> --
> 1.8.3.1
>
Jason Gunthorpe April 16, 2020, 6:04 p.m. UTC | #3
On Thu, Apr 16, 2020 at 09:02:01PM +0300, Leon Romanovsky wrote:

> > diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
> > index ade8638..1dcfc59 100644
> > --- a/drivers/infiniband/Kconfig
> > +++ b/drivers/infiniband/Kconfig
> > @@ -63,6 +63,16 @@ config INFINIBAND_ON_DEMAND_PAGING
> >  	  memory regions without pinning their pages, fetching the
> >  	  pages on demand instead.
> >
> > +config INFINIBAND_DMABUF
> 
> There is no need to add extra config, it is not different from any
> other verbs feature which is handled by some sort of mask.

That works too, but then infiniband_user_mem needs the
 depends on DMABUF || !DMABUF construct
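
ie something like this, with DMABUF standing in for whatever the
dma-buf core Kconfig symbol is (sketch):

config INFINIBAND_USER_MEM
	depends on DMABUF || !DMABUF

which is satisfiable when dma-buf is built-in or disabled, but
forbids built-in ib_uverbs combined with modular dma-buf.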

> > +	if (access & IB_ACCESS_ON_DEMAND)
> > +		return ERR_PTR(-EOPNOTSUPP);
> 
> Does dma-buf really need to prohibit ODP?

ODP fundamentally can only be applied to a mm_struct

Jason
Leon Romanovsky April 16, 2020, 6:30 p.m. UTC | #4
On Thu, Apr 16, 2020 at 03:04:07PM -0300, Jason Gunthorpe wrote:
> On Thu, Apr 16, 2020 at 09:02:01PM +0300, Leon Romanovsky wrote:
>
> > > diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
> > > index ade8638..1dcfc59 100644
> > > --- a/drivers/infiniband/Kconfig
> > > +++ b/drivers/infiniband/Kconfig
> > > +++ b/drivers/infiniband/Kconfig
> > > @@ -63,6 +63,16 @@ config INFINIBAND_ON_DEMAND_PAGING
> > >  	  memory regions without pinning their pages, fetching the
> > >  	  pages on demand instead.
> > >
> > > +config INFINIBAND_DMABUF
> >
> > There is no need to add extra config, it is not different from any
> > other verbs feature which is handled by some sort of mask.
>
> That works too, but then infiniband_user_mem needs the
>  depends on DMABUF || !DMABUF construct

IS_REACHABLE() ? :)
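
ie guard the call site instead, something like this (sketch, assuming
DMA_SHARED_BUFFER is the symbol):

	if (!IS_REACHABLE(CONFIG_DMA_SHARED_BUFFER))
		return ERR_PTR(-EOPNOTSUPP);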

>
> > > +	if (access & IB_ACCESS_ON_DEMAND)
> > > +		return ERR_PTR(-EOPNOTSUPP);
> >
> > Does dma-buf really need to prohibit ODP?
>
> ODP fundamentally can only be applied to a mm_struct

Right, I forgot about it, thanks.

>
> Jason
Xiong, Jianxin April 16, 2020, 7:41 p.m. UTC | #5
> > +config INFINIBAND_DMABUF
> > +	bool "InfiniBand dma-buf support"
> > +	depends on INFINIBAND_USER_MEM
> 
> select ..some kind of DMABUF symbol...

Good catch. Will add the dependency.

> 
> > +	default n
> > +	help
> > +	  Support for dma-buf based user memory.
> > +	  This allows userspace processes to register memory regions
> > +	  backed by device memory exported as dma-buf, and thus
> > +	  enables RDMA operations using device memory.
> 
> See remarks on the cover letter

Thanks.

> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > +				   unsigned long addr, size_t size,
> > +				   int dmabuf_fd, int access)
> > +{
> > +	struct ib_umem_dmabuf *umem_dmabuf;
> > +	struct sg_table *sgt;
> > +	enum dma_data_direction dir;
> > +	long ret;
> > +
> > +	if (((addr + size) < addr) ||
> > +	    PAGE_ALIGN(addr + size) < (addr + size))
> > +		return ERR_PTR(-EINVAL);
> 
> This math validating the user parameters can overflow in various bad ways

This is modeled after the original ib_umem_get() function. Let me see how it can be done better.

> 
> > +	if (!can_do_mlock())
> > +		return ERR_PTR(-EPERM);
> 
> Why?

Hmm. Probably not needed.

> 
> > +	umem_dmabuf->umem.ibdev = device;
> > +	umem_dmabuf->umem.length = size;
> > +	umem_dmabuf->umem.address = addr;
> > +	umem_dmabuf->umem.writable = ib_access_writable(access);
> > +	umem_dmabuf->umem.is_dmabuf = 1;
> > +	umem_dmabuf->umem.owning_mm = current->mm;
> 
> Why does this need to store owning_mm?

For the mmdrop() call. But again, the mmgrab() and mmdrop() calls are probably not needed.
 
> 
> > +	umem_dmabuf->fd = dmabuf_fd;
> 
> Doesn't need to store fd
> 

You are right. 

> > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> > +{
> > +	enum dma_data_direction dir;
> > +
> > +	dir = umem_dmabuf->umem.writable ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> > +
> > +	/*
> > +	 * Only use the original sgt returned from dma_buf_map_attachment(),
> > +	 * otherwise the scatterlist may be freed twice due to the map caching
> > +	 * mechanism.
> > +	 */
> > +	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, dir);
> > +	dma_buf_detach(umem_dmabuf->dmabuf, umem_dmabuf->attach);
> > +	dma_buf_put(umem_dmabuf->dmabuf);
> > +	mmdrop(umem_dmabuf->umem.owning_mm);
> > +	kfree(umem_dmabuf);
> > +}
> > +EXPORT_SYMBOL(ib_umem_dmabuf_release);
> 
> Why is this an EXPORT_SYMBOL?

It is called from ib_umem_release() which is in a different file.

> 
> > diff --git a/include/rdma/ib_umem_dmabuf.h b/include/rdma/ib_umem_dmabuf.h
> > new file mode 100644
> > index 0000000..e82b205
> > --- /dev/null
> > +++ b/include/rdma/ib_umem_dmabuf.h
> 
> This should not be a public header
> 

It's put there to be consistent with similar headers such as "ib_umem_odp.h". Can be changed.

> > @@ -0,0 +1,50 @@
> > +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> > +/*
> > + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> > + */
> > +
> > +#ifndef IB_UMEM_DMABUF_H
> > +#define IB_UMEM_DMABUF_H
> > +
> > +#include <linux/dma-buf.h>
> > +#include <rdma/ib_umem.h>
> > +#include <rdma/ib_verbs.h>
> > +
> > +struct ib_umem_dmabuf {
> > +	struct ib_umem	umem;
> > +	int		fd;
> > +	struct dma_buf	*dmabuf;
> > +	struct dma_buf_attachment *attach;
> > +	struct sg_table *sgt;
> 
> Probably the ib_umem should be changed to hold a struct sg_table. Not clear to me why dma_buf wants to allocate this as a pointer..

ib_umem does hold a struct sg_table. The separate pointer here is needed to avoid a double free when the dma-buf is unmapped and detached: dma-buf's internal map caching mechanism checks the value of the sgt pointer itself to determine whether the sg table needs to be freed at unmap or detach time, so passing the address of the sg_table embedded in ib_umem would lead to a double free.

> 
> Also this can be in umem_dmabuf.c
> 
> > +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
> > +{
> > +	return container_of(umem, struct ib_umem_dmabuf, umem);
> > +}
> 
> Put in ummem_dmabuf.c

Will do.

> 
> > +
> > +#ifdef CONFIG_INFINIBAND_DMABUF
> > +
> > +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > +				   unsigned long addr, size_t size,
> > +				   int dmabuf_fd, int access);
> > +
> > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
> > +
> > +#else /* CONFIG_INFINIBAND_DMABUF */
> > +
> > +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> > +						 unsigned long addr,
> > +						 size_t size, int dmabuf_fd,
> > +						 int access)
> > +{
> > +	return ERR_PTR(-EINVAL);
> > +}
> 
> This should be in the existing ib_umem.h
> 

Good. Thanks.

> > +
> > +static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> > +{
> > +}
> 
> In uverbs_priv.h
> 

Will do.

> Jason
Xiong, Jianxin April 16, 2020, 9:08 p.m. UTC | #6
> > Dma-buf, a standard cross-driver buffer sharing mechanism, is chosen
> > to be the basis of a non-proprietary approach for supporting RDMA
> > to/from buffers allocated from device local memory (e.g. GPU VRAM).
> 
> Where can I read more about the "is chosen to be the basis of a non-proprietary approach" part of the sentence?
> 
> Especially HMM vs. dma-buf in this regard.
> 

I will write up more details. Thanks.
Jason Gunthorpe April 17, 2020, 12:31 p.m. UTC | #7
On Thu, Apr 16, 2020 at 07:41:33PM +0000, Xiong, Jianxin wrote:
> > > +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
> > > +{
> > > +	enum dma_data_direction dir;
> > > +
> > > +	dir = umem_dmabuf->umem.writable ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> > > +
> > > +	/*
> > > +	 * Only use the original sgt returned from dma_buf_map_attachment(),
> > > +	 * otherwise the scatterlist may be freed twice due to the map caching
> > > +	 * mechanism.
> > > +	 */
> > > +	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, dir);
> > > +	dma_buf_detach(umem_dmabuf->dmabuf, umem_dmabuf->attach);
> > > +	dma_buf_put(umem_dmabuf->dmabuf);
> > > +	mmdrop(umem_dmabuf->umem.owning_mm);
> > > +	kfree(umem_dmabuf);
> > > +}
> > > +EXPORT_SYMBOL(ib_umem_dmabuf_release);
> > 
> > Why is this an EXPORT_SYMBOL?
> 
> It is called from ib_umem_release() which is in a different file.

export is only required for calls from outside the current module, which
this is not.

> > > diff --git a/include/rdma/ib_umem_dmabuf.h b/include/rdma/ib_umem_dmabuf.h
> > > new file mode 100644
> > > index 0000000..e82b205
> > > --- /dev/null
> > > +++ b/include/rdma/ib_umem_dmabuf.h
> > 
> > This should not be a public header
> 
> It's put there to be consistent with similar headers such as "ib_umem_odp.h". Can be changed.

ib_umem_odp.h is needed by drivers; this is not.

Jason
Xiong, Jianxin April 17, 2020, 3:49 p.m. UTC | #8
> > > > +}
> > > > +EXPORT_SYMBOL(ib_umem_dmabuf_release);
> > >
> > > Why is this an EXPORT_SYMBOL?
> >
> > It is called from ib_umem_release() which is in a different file.
> 
> export is only required for calls from outside the current module, which this is not.

Will fix it. Thanks.

> 
> > > > diff --git a/include/rdma/ib_umem_dmabuf.h b/include/rdma/ib_umem_dmabuf.h
> > > > new file mode 100644
> > > > index 0000000..e82b205
> > > > --- /dev/null
> > > > +++ b/include/rdma/ib_umem_dmabuf.h
> > >
> > > This should not be a public header
> >
> > It's put there to be consistent with similar headers such as "ib_umem_odp.h". Can be changed.
> 
> ib_umem_odp.h is needed by drivers; this is not.

In the current series it's used by the mlx5_ib driver. However, if the prototype of ib_umem_dmabuf_get() is moved to ib_umem.h as you suggested, this header is no longer needed.

> 
> Jason

Patch

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index ade8638..1dcfc59 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -63,6 +63,16 @@  config INFINIBAND_ON_DEMAND_PAGING
 	  memory regions without pinning their pages, fetching the
 	  pages on demand instead.
 
+config INFINIBAND_DMABUF
+	bool "InfiniBand dma-buf support"
+	depends on INFINIBAND_USER_MEM
+	default n
+	help
+	  Support for dma-buf based user memory.
+	  This allows userspace processes to register memory regions
+	  backed by device memory exported as dma-buf, and thus
+	  enables RDMA operations using device memory.
+
 config INFINIBAND_ADDR_TRANS
 	bool "RDMA/CM"
 	depends on INFINIBAND
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d1b14887..7981d0f 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -39,3 +39,4 @@  ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				uverbs_std_types_async_fd.o
 ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
+ib_uverbs-$(CONFIG_INFINIBAND_DMABUF) += umem_dmabuf.o
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 82455a1..54b35df 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -40,6 +40,7 @@ 
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
+#include <rdma/ib_umem_dmabuf.h>
 
 #include "uverbs.h"
 
@@ -317,6 +318,8 @@  void ib_umem_release(struct ib_umem *umem)
 {
 	if (!umem)
 		return;
+	if (umem->is_dmabuf)
+		return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
 	if (umem->is_odp)
 		return ib_umem_odp_release(to_ib_umem_odp(umem));
 
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
new file mode 100644
index 0000000..325d44f
--- /dev/null
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -0,0 +1,100 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/dma-mapping.h>
+#include <rdma/ib_umem_dmabuf.h>
+
+#include "uverbs.h"
+
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+				   unsigned long addr, size_t size,
+				   int dmabuf_fd, int access)
+{
+	struct ib_umem_dmabuf *umem_dmabuf;
+	struct sg_table *sgt;
+	enum dma_data_direction dir;
+	long ret;
+
+	if (((addr + size) < addr) ||
+	    PAGE_ALIGN(addr + size) < (addr + size))
+		return ERR_PTR(-EINVAL);
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	if (access & IB_ACCESS_ON_DEMAND)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
+	if (!umem_dmabuf)
+		return ERR_PTR(-ENOMEM);
+
+	umem_dmabuf->umem.ibdev = device;
+	umem_dmabuf->umem.length = size;
+	umem_dmabuf->umem.address = addr;
+	umem_dmabuf->umem.writable = ib_access_writable(access);
+	umem_dmabuf->umem.is_dmabuf = 1;
+	umem_dmabuf->umem.owning_mm = current->mm;
+	mmgrab(umem_dmabuf->umem.owning_mm);
+
+	umem_dmabuf->fd = dmabuf_fd;
+	umem_dmabuf->dmabuf = dma_buf_get(umem_dmabuf->fd);
+	if (IS_ERR(umem_dmabuf->dmabuf)) {
+		ret = PTR_ERR(umem_dmabuf->dmabuf);
+		goto out_free_umem;
+	}
+
+	umem_dmabuf->attach = dma_buf_attach(umem_dmabuf->dmabuf,
+					     device->dma_device);
+	if (IS_ERR(umem_dmabuf->attach)) {
+		ret = PTR_ERR(umem_dmabuf->attach);
+		goto out_release_dmabuf;
+	}
+
+	dir = umem_dmabuf->umem.writable ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
+	sgt = dma_buf_map_attachment(umem_dmabuf->attach, dir);
+	if (IS_ERR(sgt)) {
+		ret = PTR_ERR(sgt);
+		goto out_detach_dmabuf;
+	}
+
+	umem_dmabuf->sgt = sgt;
+	umem_dmabuf->umem.sg_head = *sgt;
+	umem_dmabuf->umem.nmap = sgt->nents;
+	return &umem_dmabuf->umem;
+
+out_detach_dmabuf:
+	dma_buf_detach(umem_dmabuf->dmabuf, umem_dmabuf->attach);
+
+out_release_dmabuf:
+	dma_buf_put(umem_dmabuf->dmabuf);
+
+out_free_umem:
+	mmdrop(umem_dmabuf->umem.owning_mm);
+	kfree(umem_dmabuf);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get);
+
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+	enum dma_data_direction dir;
+
+	dir = umem_dmabuf->umem.writable ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
+
+	/*
+	 * Only use the original sgt returned from dma_buf_map_attachment(),
+	 * otherwise the scatterlist may be freed twice due to the map caching
+	 * mechanism.
+	 */
+	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, dir);
+	dma_buf_detach(umem_dmabuf->dmabuf, umem_dmabuf->attach);
+	dma_buf_put(umem_dmabuf->dmabuf);
+	mmdrop(umem_dmabuf->umem.owning_mm);
+	kfree(umem_dmabuf);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_release);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index e3518fd..026a3cf 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -40,6 +40,7 @@ 
 
 struct ib_ucontext;
 struct ib_umem_odp;
+struct ib_umem_dmabuf;
 
 struct ib_umem {
 	struct ib_device       *ibdev;
@@ -48,6 +49,7 @@  struct ib_umem {
 	unsigned long		address;
 	u32 writable : 1;
 	u32 is_odp : 1;
+	u32 is_dmabuf : 1;
 	struct work_struct	work;
 	struct sg_table sg_head;
 	int             nmap;
diff --git a/include/rdma/ib_umem_dmabuf.h b/include/rdma/ib_umem_dmabuf.h
new file mode 100644
index 0000000..e82b205
--- /dev/null
+++ b/include/rdma/ib_umem_dmabuf.h
@@ -0,0 +1,50 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#ifndef IB_UMEM_DMABUF_H
+#define IB_UMEM_DMABUF_H
+
+#include <linux/dma-buf.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+
+struct ib_umem_dmabuf {
+	struct ib_umem	umem;
+	int		fd;
+	struct dma_buf	*dmabuf;
+	struct dma_buf_attachment *attach;
+	struct sg_table *sgt;
+};
+
+static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
+{
+	return container_of(umem, struct ib_umem_dmabuf, umem);
+}
+
+#ifdef CONFIG_INFINIBAND_DMABUF
+
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+				   unsigned long addr, size_t size,
+				   int dmabuf_fd, int access);
+
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
+
+#else /* CONFIG_INFINIBAND_DMABUF */
+
+static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+						 unsigned long addr,
+						 size_t size, int dmabuf_fd,
+						 int access)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+}
+
+#endif /* CONFIG_INFINIBAND_DMABUF */
+
+#endif /* IB_UMEM_DMABUF_H */