diff mbox series

[rdma-core,14/27] mlx5: Implement basic verbs operation for PD and MR over vfio

Message ID 20210720081647.1980-15-yishaih@nvidia.com (mailing list archive)
State Not Applicable
Headers show
Series Introduce mlx5 user space driver over VFIO | expand

Commit Message

Yishai Hadas July 20, 2021, 8:16 a.m. UTC
Implement basic verbs operations for PD and MR over vfio. This includes:
- PD alloc/dealloc
- MR reg/dereg.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 providers/mlx5/mlx5_ifc.h  |  76 ++++++++++++-
 providers/mlx5/mlx5_vfio.c | 273 +++++++++++++++++++++++++++++++++++++++++++++
 providers/mlx5/mlx5_vfio.h |  25 +++++
 util/util.h                |   5 +
 4 files changed, 377 insertions(+), 2 deletions(-)
diff mbox series

Patch

diff --git a/providers/mlx5/mlx5_ifc.h b/providers/mlx5/mlx5_ifc.h
index 2129779..1cbe846 100644
--- a/providers/mlx5/mlx5_ifc.h
+++ b/providers/mlx5/mlx5_ifc.h
@@ -51,6 +51,7 @@  enum {
 	MLX5_CMD_OP_QUERY_ISSI = 0x10a,
 	MLX5_CMD_OP_SET_ISSI = 0x10b,
 	MLX5_CMD_OP_CREATE_MKEY = 0x200,
+	MLX5_CMD_OP_DESTROY_MKEY = 0x202,
 	MLX5_CMD_OP_CREATE_EQ = 0x301,
 	MLX5_CMD_OP_DESTROY_EQ = 0x302,
 	MLX5_CMD_OP_CREATE_QP = 0x500,
@@ -67,6 +68,8 @@  enum {
 	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754,
 	MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755,
 	MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760,
+	MLX5_CMD_OP_ALLOC_PD = 0x800,
+	MLX5_CMD_OP_DEALLOC_PD = 0x801,
 	MLX5_CMD_OP_ALLOC_UAR = 0x802,
 	MLX5_CMD_OP_DEALLOC_UAR = 0x803,
 	MLX5_CMD_OP_ACCESS_REG = 0x805,
@@ -1380,7 +1383,8 @@  enum {
 };
 
 enum {
-	MLX5_MKC_ACCESS_MODE_KLMS  = 0x2,
+	MLX5_MKC_ACCESS_MODE_MTT = 0x1,
+	MLX5_MKC_ACCESS_MODE_KLMS = 0x2,
 };
 
 struct mlx5_ifc_mkc_bits {
@@ -1425,7 +1429,9 @@  struct mlx5_ifc_mkc_bits {
 
 	u8         translations_octword_size[0x20];
 
-	u8         reserved_at_1c0[0x1b];
+	u8         reserved_at_1c0[0x19];
+	u8         relaxed_ordering_read[0x1];
+	u8         reserved_at_1d9[0x1];
 	u8         log_page_size[0x5];
 
 	u8         reserved_at_1e0[0x20];
@@ -1467,6 +1473,28 @@  struct mlx5_ifc_create_mkey_in_bits {
 	u8         klm_pas_mtt[0][0x20];
 };
 
+struct mlx5_ifc_destroy_mkey_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_destroy_mkey_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x8];
+	u8         mkey_index[0x18];
+
+	u8         reserved_at_60[0x20];
+};
+
 struct mlx5_ifc_l2_hdr_bits {
 	u8         dmac_47_16[0x20];
 	u8         dmac_15_0[0x10];
@@ -4584,4 +4612,48 @@  struct mlx5_ifc_destroy_eq_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_alloc_pd_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_pd_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_pd_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_pd_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_at_60[0x20];
+};
+
 #endif /* MLX5_IFC_H */
diff --git a/providers/mlx5/mlx5_vfio.c b/providers/mlx5/mlx5_vfio.c
index c37358c..e85a8cc 100644
--- a/providers/mlx5/mlx5_vfio.c
+++ b/providers/mlx5/mlx5_vfio.c
@@ -33,6 +33,12 @@  enum {
 	MLX5_VFIO_CMD_VEC_IDX,
 };
 
+enum {
+	MLX5_VFIO_SUPP_MR_ACCESS_FLAGS = IBV_ACCESS_LOCAL_WRITE |
+		IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
+		IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_RELAXED_ORDERING,
+};
+
 static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx, uint16_t func_id,
 				int32_t npages, bool is_event);
 static int mlx5_vfio_reclaim_pages(struct mlx5_vfio_context *ctx, uint32_t func_id,
@@ -2191,6 +2197,268 @@  static int mlx5_vfio_setup_function(struct mlx5_vfio_context *ctx)
 	return err;
 }
 
+/*
+ * Allocate a PD: run ALLOC_PD through mlx5_vfio_cmd_exec() and store the
+ * PD number from the command output in a newly allocated struct mlx5_pd.
+ * Returns NULL if the allocation or the command fails.
+ */
+static struct ibv_pd *mlx5_vfio_alloc_pd(struct ibv_context *ibctx)
+{
+	uint32_t out[DEVX_ST_SZ_DW(alloc_pd_out)] = {0};
+	uint32_t in[DEVX_ST_SZ_DW(alloc_pd_in)] = {0};
+	struct mlx5_pd *mpd;
+
+	mpd = calloc(1, sizeof(*mpd));
+	if (!mpd)
+		return NULL;
+
+	DEVX_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
+	if (mlx5_vfio_cmd_exec(to_mvfio_ctx(ibctx), in, sizeof(in),
+			       out, sizeof(out), 0)) {
+		free(mpd);
+		return NULL;
+	}
+
+	mpd->pdn = DEVX_GET(alloc_pd_out, out, pd);
+	return &mpd->ibv_pd;
+}
+
+/* Issue DEALLOC_PD for @pd; the mlx5_pd is freed only if the command succeeds. */
+static int mlx5_vfio_dealloc_pd(struct ibv_pd *pd)
+{
+	uint32_t out[DEVX_ST_SZ_DW(dealloc_pd_out)] = {};
+	uint32_t in[DEVX_ST_SZ_DW(dealloc_pd_in)] = {};
+	struct mlx5_pd *mpd = to_mpd(pd);
+	int err;
+
+	DEVX_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD);
+	DEVX_SET(dealloc_pd_in, in, pd, mpd->pdn);
+
+	err = mlx5_vfio_cmd_exec(to_mvfio_ctx(pd->context), in, sizeof(in),
+				 out, sizeof(out), 0);
+	if (!err)
+		free(mpd);
+
+	return err;
+}
+
+static size_t calc_num_dma_blocks(uint64_t iova, size_t length,
+				   unsigned long pgsz)
+{
+	return (size_t)((align(iova + length, pgsz) -
+			 align_down(iova, pgsz)) / pgsz);
+}
+
+/* Octowords covering the pages spanned by [addr, addr + len): two pages each, rounded up. */
+static int get_octo_len(uint64_t addr, uint64_t len, int page_shift)
+{
+	uint64_t pg_sz = 1ULL << page_shift;
+	uint64_t in_page_off = addr & (pg_sz - 1);
+	int nr_pages;
+
+	nr_pages = align(len + in_page_off, pg_sz) >> page_shift;
+	return (nr_pages + 1) / 2;
+}
+
+static inline uint32_t mlx5_mkey_to_idx(uint32_t mkey)
+{
+	return mkey >> 8;
+}
+
+static inline uint32_t mlx5_idx_to_mkey(uint32_t mkey_idx)
+{
+	return mkey_idx << 8;
+}
+
+static void set_mkc_access_pd_addr_fields(void *mkc, int acc, uint64_t start_addr,
+					  struct ibv_pd *pd)
+{
+	struct mlx5_pd *mpd = to_mpd(pd);
+
+	DEVX_SET(mkc, mkc, a, !!(acc & IBV_ACCESS_REMOTE_ATOMIC));
+	DEVX_SET(mkc, mkc, rw, !!(acc & IBV_ACCESS_REMOTE_WRITE));
+	DEVX_SET(mkc, mkc, rr, !!(acc & IBV_ACCESS_REMOTE_READ));
+	DEVX_SET(mkc, mkc, lw, !!(acc & IBV_ACCESS_LOCAL_WRITE));
+	DEVX_SET(mkc, mkc, lr, 1);
+	/* Application is responsible to set based on caps */
+	DEVX_SET(mkc, mkc, relaxed_ordering_write,
+		 !!(acc & IBV_ACCESS_RELAXED_ORDERING));
+	DEVX_SET(mkc, mkc, relaxed_ordering_read,
+		 !!(acc & IBV_ACCESS_RELAXED_ORDERING));
+	DEVX_SET(mkc, mkc, pd, mpd->pdn);
+	DEVX_SET(mkc, mkc, qpn, 0xffffff);
+	DEVX_SET64(mkc, mkc, start_addr, start_addr);
+}
+
+/* Destroy @vmr's mkey, then undo its registration and free it; returns 0 or the command error. */
+static int mlx5_vfio_dereg_mr(struct verbs_mr *vmr)
+{
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(vmr->ibv_mr.context);
+	struct mlx5_vfio_mr *mr = to_mvfio_mr(&vmr->ibv_mr);
+	uint32_t in[DEVX_ST_SZ_DW(destroy_mkey_in)] = {};
+	uint32_t out[DEVX_ST_SZ_DW(destroy_mkey_out)] = {};
+	int ret;
+
+	DEVX_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
+	DEVX_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(vmr->ibv_mr.lkey));
+	ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
+	if (ret)
+		return ret;
+
+	mlx5_vfio_unregister_mem(ctx, mr->iova + mr->iova_aligned_offset,
+				 mr->iova_reg_size);
+	iset_insert_range(ctx->iova_alloc, mr->iova, mr->iova_page_size);
+	free(vmr);
+	return 0;
+}
+
+/* Fill @pas with @num_dma big-endian entries: addresses spaced @page_size
+ * apart starting at @dma_addr, each OR-ed with @access_flags.
+ */
+static void mlx5_vfio_populate_pas(uint64_t dma_addr, int num_dma, size_t page_size,
+				  __be64 *pas, uint64_t access_flags)
+{
+	int idx;
+
+	for (idx = 0; idx < num_dma; idx++, dma_addr += page_size)
+		pas[idx] = htobe64(dma_addr | access_flags);
+}
+
+static uint64_t calc_spanning_page_size(uint64_t start, uint64_t length)
+{
+	/* Compute a power-of-two page_size with the first and last byte in one page:
+	 * start & ~(page_size - 1) == (start + length - 1) & ~(page_size - 1)
+	 */
+	uint64_t diffs = start ^ (start + length - 1);
+
+	return roundup_pow_of_two(diffs + 1);
+}
+
+static struct ibv_mr *mlx5_vfio_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
+				       uint64_t hca_va, int access)
+{
+	struct mlx5_vfio_device *dev = to_mvfio_dev(pd->context->device);
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(pd->context);
+	uint32_t out[DEVX_ST_SZ_DW(create_mkey_out)] = {};
+	uint32_t mkey_index;
+	uint32_t *in;
+	int inlen, num_pas, ret;
+	struct mlx5_vfio_mr *mr;
+	struct verbs_mr *vmr;
+	int page_shift, iova_min_page_shift;
+	__be64 *pas;
+	uint8_t key;
+	void *mkc;
+	void *aligned_va;
+
+	if (!check_comp_mask(access, MLX5_VFIO_SUPP_MR_ACCESS_FLAGS)) {
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	if (((uint64_t)addr & (ctx->iova_min_page_size - 1)) !=
+	    (hca_va & (ctx->iova_min_page_size - 1))) {
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	mr = calloc(1, sizeof(*mr));
+	if (!mr) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	/* Page size that encloses the start and end of the mkey's hca_va range */
+	mr->iova_page_size = max(calc_spanning_page_size(hca_va, length),
+				 ctx->iova_min_page_size);
+
+	ret = iset_alloc_range(ctx->iova_alloc, mr->iova_page_size, &mr->iova);
+	if (ret)
+		goto end;
+
+	aligned_va = (void *)((unsigned long)addr & ~(ctx->iova_min_page_size - 1));
+	page_shift = ilog32(mr->iova_page_size - 1);
+	iova_min_page_shift = ilog32(ctx->iova_min_page_size - 1);
+	if (page_shift > iova_min_page_shift)
+		/* Ensure the low bits of the mkey VA match the low bits of the IOVA because the mkc
+		 * start_addr specifies both the wire VA and the DMA VA.
+		 */
+		mr->iova_aligned_offset = hca_va & GENMASK(page_shift - 1, iova_min_page_shift);
+
+	mr->iova_reg_size = align(length + hca_va, ctx->iova_min_page_size) -
+				  align_down(hca_va, ctx->iova_min_page_size);
+
+	assert(mr->iova_page_size >= mr->iova_aligned_offset + mr->iova_reg_size);
+	ret = mlx5_vfio_register_mem(ctx, aligned_va,
+				     mr->iova + mr->iova_aligned_offset,
+				     mr->iova_reg_size);
+
+	if (ret)
+		goto err_reg;
+
+	num_pas = 1;
+	if (page_shift > MLX5_MAX_PAGE_SHIFT) {
+		page_shift = MLX5_MAX_PAGE_SHIFT;
+		num_pas = calc_num_dma_blocks(hca_va, length, (1ULL << MLX5_MAX_PAGE_SHIFT));
+	}
+
+	inlen = DEVX_ST_SZ_BYTES(create_mkey_in) + (sizeof(*pas) * align(num_pas, 2));
+
+	in = calloc(1, inlen);
+	if (!in) {
+		errno = ENOMEM;
+		goto err_in;
+	}
+
+	pas = (__be64 *)DEVX_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+	mlx5_vfio_populate_pas(mr->iova, num_pas, (1ULL << page_shift), pas, MLX5_MTT_PRESENT);
+
+	DEVX_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+	DEVX_SET(create_mkey_in, in, pg_access, 1);
+	mkc = DEVX_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	set_mkc_access_pd_addr_fields(mkc, access, hca_va, pd);
+	DEVX_SET(mkc, mkc, free, 0);
+	DEVX_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+	DEVX_SET64(mkc, mkc, len, length);
+	DEVX_SET(mkc, mkc, bsf_octword_size, 0);
+	DEVX_SET(mkc, mkc, translations_octword_size,
+		 get_octo_len(hca_va, length, page_shift));
+	DEVX_SET(mkc, mkc, log_page_size, page_shift);
+
+	DEVX_SET(create_mkey_in, in, translations_octword_actual_size,
+		 get_octo_len(hca_va, length, page_shift));
+
+	key = atomic_fetch_add(&dev->mkey_var, 1);
+	DEVX_SET(mkc, mkc, mkey_7_0, key);
+
+	ret = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0);
+	if (ret)
+		goto err_exec;
+
+	free(in);
+	mkey_index = DEVX_GET(create_mkey_out, out, mkey_index);
+	vmr = &mr->vmr;
+	vmr->ibv_mr.lkey = key | mlx5_idx_to_mkey(mkey_index);
+	vmr->ibv_mr.rkey = vmr->ibv_mr.lkey;
+	vmr->ibv_mr.context = pd->context;
+	vmr->mr_type = IBV_MR_TYPE_MR;
+	vmr->access = access;
+	vmr->ibv_mr.handle = 0;
+
+	return &mr->vmr.ibv_mr;
+
+err_exec:
+	free(in);
+err_in:
+	mlx5_vfio_unregister_mem(ctx, mr->iova + mr->iova_aligned_offset,
+				 mr->iova_reg_size);
+err_reg:
+	iset_insert_range(ctx->iova_alloc, mr->iova, mr->iova_page_size);
+end:
+	free(mr);
+	return NULL;
+}
+
 static void mlx5_vfio_uninit_context(struct mlx5_vfio_context *ctx)
 {
 	mlx5_close_debug_file(ctx->dbg_fp);
@@ -2213,6 +2481,10 @@  static void mlx5_vfio_free_context(struct ibv_context *ibctx)
 }
 
 static const struct verbs_context_ops mlx5_vfio_common_ops = {
+	.alloc_pd = mlx5_vfio_alloc_pd,
+	.dealloc_pd = mlx5_vfio_dealloc_pd,
+	.reg_mr = mlx5_vfio_reg_mr,
+	.dereg_mr = mlx5_vfio_dereg_mr,
 	.free_context = mlx5_vfio_free_context,
 };
 
@@ -2446,6 +2718,7 @@  mlx5dv_get_vfio_device_list(struct mlx5dv_vfio_context_attr *attr)
 
 	vfio_dev->flags = attr->flags;
 	vfio_dev->page_size = sysconf(_SC_PAGESIZE);
+	atomic_init(&vfio_dev->mkey_var, 0);
 
 	list[0] = &vfio_dev->vdev.device;
 	return list;
diff --git a/providers/mlx5/mlx5_vfio.h b/providers/mlx5/mlx5_vfio.h
index 296d6d1..5311c6f 100644
--- a/providers/mlx5/mlx5_vfio.h
+++ b/providers/mlx5/mlx5_vfio.h
@@ -23,17 +23,37 @@  enum {
 	MLX5_PCI_CMD_XPORT = 7,
 };
 
+enum mlx5_ib_mtt_access_flags {
+	MLX5_MTT_READ  = (1 << 0),
+	MLX5_MTT_WRITE = (1 << 1),
+};
+
+enum {
+	MLX5_MAX_PAGE_SHIFT = 31,
+};
+
+#define MLX5_MTT_PRESENT (MLX5_MTT_READ | MLX5_MTT_WRITE)
+
 enum {
 	MLX5_VFIO_BLOCK_SIZE = 2 * 1024 * 1024,
 	MLX5_VFIO_BLOCK_NUM_PAGES = MLX5_VFIO_BLOCK_SIZE / MLX5_ADAPTER_PAGE_SIZE,
 };
 
+struct mlx5_vfio_mr {
+	struct verbs_mr vmr;
+	uint64_t iova;
+	uint64_t iova_page_size;
+	uint64_t iova_aligned_offset;
+	uint64_t iova_reg_size;
+};
+
 struct mlx5_vfio_device {
 	struct verbs_device vdev;
 	char *pci_name;
 	char vfio_path[IBV_SYSFS_PATH_MAX];
 	int page_size;
 	uint32_t flags;
+	atomic_int mkey_var;
 };
 
 #if __BYTE_ORDER == __LITTLE_ENDIAN
@@ -282,4 +302,9 @@  static inline struct mlx5_vfio_context *to_mvfio_ctx(struct ibv_context *ibctx)
 	return container_of(ibctx, struct mlx5_vfio_context, vctx.context);
 }
 
+static inline struct mlx5_vfio_mr *to_mvfio_mr(struct ibv_mr *ibmr)
+{
+	return container_of(ibmr, struct mlx5_vfio_mr, vmr.ibv_mr);
+}
+
 #endif
diff --git a/util/util.h b/util/util.h
index 2c05631..45f5065 100644
--- a/util/util.h
+++ b/util/util.h
@@ -70,6 +70,11 @@  static inline unsigned long align(unsigned long val, unsigned long align)
 	return (val + align - 1) & ~(align - 1);
 }
 
+static inline unsigned long align_down(unsigned long val, unsigned long _align)
+{
+	return val & ~(_align - 1); /* clear low bits; assumes _align is a power of two */
+}
+
 static inline uint64_t roundup_pow_of_two(uint64_t n)
 {
 	return n == 1 ? 1 : 1ULL << ilog64(n - 1);