
[06/20] RDMA/mlx5: Implement mlx5_ib_map_mr_sg_pi and mlx5_ib_alloc_mr_integrity

Message ID 1559222731-16715-7-git-send-email-maxg@mellanox.com (mailing list archive)
State Superseded
Series Introduce new API for T10-PI offload

Commit Message

Max Gurtovoy May 30, 2019, 1:25 p.m. UTC
mlx5_ib_map_mr_sg_pi() will map the DMA-mapped PI and data SG lists to the
mlx5 memory region prior to the registration operation. In the new
API, the mlx5 driver will allocate an internal memory region for the
UMR operation used to register both the PI and data SG lists. The internal
MR will use KLM mode in order to map 2 (possibly non-contiguous and
non-aligned) SG lists using 1 memory key. In the new API, each ULP will use
1 memory region for the signature operation (instead of 3 in the old API).
This memory region will have a key that is exposed to the remote server to
perform RDMA operations. The internal memory key that maps the SG lists
will stay private.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c    |   2 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  11 +++
 drivers/infiniband/hw/mlx5/mr.c      | 184 ++++++++++++++++++++++++++++++++---
 3 files changed, 186 insertions(+), 11 deletions(-)
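
For illustration, a minimal ULP-side sketch of the single-MR flow described
above. It assumes the core wrappers added earlier in this series
(ib_alloc_mr_integrity() and ib_map_mr_sg_pi(), matching the
.alloc_mr_integrity/.map_mr_sg_pi ops wired up below); QP setup, signature
attributes and completion handling are omitted:

#include <rdma/ib_verbs.h>

/*
 * Register one integrity MR covering both the data and the PI
 * (protection information) SG lists. Only mr->rkey is ever handed
 * to the remote side; the KLM-based MR that actually maps the two
 * SG lists stays internal to the mlx5 driver.
 */
static int ulp_register_pi_mr(struct ib_pd *pd,
			      struct scatterlist *data_sg, int data_nents,
			      struct scatterlist *prot_sg, int prot_nents)
{
	struct ib_mr *mr;
	int n;

	/* One MR per signature operation (instead of 3 in the old API). */
	mr = ib_alloc_mr_integrity(pd, data_nents, prot_nents);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	/* Map both DMA-mapped SG lists onto the MR in a single call. */
	n = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
			    prot_sg, prot_nents, NULL);
	if (n < data_nents + prot_nents) {
		ib_dereg_mr(mr);
		return n < 0 ? n : -EINVAL;
	}

	/* mr->rkey can now be sent to the remote side for RDMA. */
	return 0;
}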

Comments

Shamir Rabinovitch June 2, 2019, 9:13 a.m. UTC | #1
On Thu, May 30, 2019 at 04:25:17PM +0300, Max Gurtovoy wrote:
> mlx5_ib_map_mr_sg_pi() will map the PI and data dma mapped SG lists to the
> mlx5 memory region prior to the registration operation. In the new
> API, the mlx5 driver will allocate an internal memory region for the
> UMR operation to register both PI and data SG lists. The internal MR
> will use KLM mode in order to map 2 (possibly non-contiguous/non-align)
> SG lists using 1 memory key. In the new API, each ULP will use 1 memory
> region for the signature operation (instead of 3 in the old API). This
> memory region will have a key that will be exposed to remote server to
> perform RDMA operation. The internal memory key that will map the SG lists
> will stay private.
> 
> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
> Signed-off-by: Israel Rukshin <israelr@mellanox.com>
> ---
>  drivers/infiniband/hw/mlx5/main.c    |   2 +
>  drivers/infiniband/hw/mlx5/mlx5_ib.h |  11 +++
>  drivers/infiniband/hw/mlx5/mr.c      | 184 ++++++++++++++++++++++++++++++++---
>  3 files changed, 186 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
> index abac70ad5c7c..b6588cdef1cf 100644
> --- a/drivers/infiniband/hw/mlx5/main.c
> +++ b/drivers/infiniband/hw/mlx5/main.c
> @@ -6126,6 +6126,7 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
>  static const struct ib_device_ops mlx5_ib_dev_ops = {
>  	.add_gid = mlx5_ib_add_gid,
>  	.alloc_mr = mlx5_ib_alloc_mr,
> +	.alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
>  	.alloc_pd = mlx5_ib_alloc_pd,
>  	.alloc_ucontext = mlx5_ib_alloc_ucontext,
>  	.attach_mcast = mlx5_ib_mcg_attach,
> @@ -6155,6 +6156,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
>  	.get_dma_mr = mlx5_ib_get_dma_mr,
>  	.get_link_layer = mlx5_ib_port_link_layer,
>  	.map_mr_sg = mlx5_ib_map_mr_sg,
> +	.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
>  	.mmap = mlx5_ib_mmap,
>  	.modify_cq = mlx5_ib_modify_cq,
>  	.modify_device = mlx5_ib_modify_device,
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index 40eb8be482e4..07bac37c3450 100644
> --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -587,6 +587,9 @@ struct mlx5_ib_mr {
>  	void			*descs;
>  	dma_addr_t		desc_map;
>  	int			ndescs;
> +	int			data_length;
> +	int			meta_ndescs;
> +	int			meta_length;
>  	int			max_descs;
>  	int			desc_size;
>  	int			access_mode;
> @@ -605,6 +608,7 @@ struct mlx5_ib_mr {
>  	int			access_flags; /* Needed for rereg MR */
>  
>  	struct mlx5_ib_mr      *parent;
> +	struct mlx5_ib_mr      *pi_mr; /* Needed for IB_MR_TYPE_INTEGRITY */
>  	atomic_t		num_leaf_free;
>  	wait_queue_head_t       q_leaf_free;
>  	struct mlx5_async_work  cb_work;
> @@ -1148,8 +1152,15 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
>  int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
>  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  			       u32 max_num_sg, struct ib_udata *udata);
> +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
> +					 u32 max_num_sg,
> +					 u32 max_num_meta_sg);
>  int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
>  		      unsigned int *sg_offset);
> +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
> +			 int data_sg_nents, unsigned int *data_sg_offset,
> +			 struct scatterlist *meta_sg, int meta_sg_nents,
> +			 unsigned int *meta_sg_offset);
>  int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
>  			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
>  			const struct ib_mad_hdr *in, size_t in_mad_size,
> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> index 5f09699fab98..6820d80c6a7f 100644
> --- a/drivers/infiniband/hw/mlx5/mr.c
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -1639,16 +1639,22 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
>  
>  int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
>  {
> -	dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
> +	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
> +
> +	if (ibmr->type == IB_MR_TYPE_INTEGRITY)
> +		dereg_mr(to_mdev(mmr->pi_mr->ibmr.device), mmr->pi_mr);
> +
> +	dereg_mr(to_mdev(ibmr->device), mmr);
> +
>  	return 0;
>  }
>  
> -struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
> -			       u32 max_num_sg, struct ib_udata *udata)
> +static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
> +				u32 max_num_sg, u32 max_num_meta_sg)
>  {
>  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
>  	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
> -	int ndescs = ALIGN(max_num_sg, 4);
> +	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
>  	struct mlx5_ib_mr *mr;
>  	void *mkc;
>  	u32 *in;
> @@ -1670,8 +1676,72 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  	MLX5_SET(mkc, mkc, qpn, 0xffffff);
>  	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
>  
> +	mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
> +
> +	err = mlx5_alloc_priv_descs(pd->device, mr,
> +				    ndescs, sizeof(struct mlx5_klm));
> +	if (err)
> +		goto err_free_in;
> +	mr->desc_size = sizeof(struct mlx5_klm);
> +	mr->max_descs = ndescs;
> +
> +	MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3);
> +	MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7);
> +	MLX5_SET(mkc, mkc, umr_en, 1);
> +
> +	mr->ibmr.pd = pd;
> +	mr->ibmr.device = pd->device;
> +	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
> +	if (err)
> +		goto err_priv_descs;
> +
> +	mr->mmkey.type = MLX5_MKEY_MR;
> +	mr->ibmr.lkey = mr->mmkey.key;
> +	mr->ibmr.rkey = mr->mmkey.key;
> +	mr->umem = NULL;
> +	kfree(in);
> +
> +	return mr;
> +
> +err_priv_descs:
> +	mlx5_free_priv_descs(mr);
> +err_free_in:
> +	kfree(in);
> +err_free:
> +	kfree(mr);
> +	return ERR_PTR(err);
> +}
> +
> +static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
> +					enum ib_mr_type mr_type, u32 max_num_sg,
> +					u32 max_num_meta_sg)
> +{
> +	struct mlx5_ib_dev *dev = to_mdev(pd->device);
> +	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
> +	int ndescs = ALIGN(max_num_sg, 4);
> +	struct mlx5_ib_mr *mr;
> +	void *mkc;
> +	u32 *in;
> +	int err;
> +
> +	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
> +	if (!mr)
> +		return ERR_PTR(-ENOMEM);
> +
> +	in = kzalloc(inlen, GFP_KERNEL);
> +	if (!in) {
> +		err = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
> +	MLX5_SET(mkc, mkc, free, 1);
> +	MLX5_SET(mkc, mkc, qpn, 0xffffff);
> +	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
> +
>  	if (mr_type == IB_MR_TYPE_MEM_REG) {
>  		mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
> +		MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
>  		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
>  		err = mlx5_alloc_priv_descs(pd->device, mr,
>  					    ndescs, sizeof(struct mlx5_mtt));
> @@ -1682,6 +1752,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  		mr->max_descs = ndescs;
>  	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
>  		mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
> +		MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
>  
>  		err = mlx5_alloc_priv_descs(pd->device, mr,
>  					    ndescs, sizeof(struct mlx5_klm));
> @@ -1689,11 +1760,13 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  			goto err_free_in;
>  		mr->desc_size = sizeof(struct mlx5_klm);
>  		mr->max_descs = ndescs;
> -	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
> +	} else if (mr_type == IB_MR_TYPE_SIGNATURE ||
> +		   mr_type == IB_MR_TYPE_INTEGRITY) {
>  		u32 psv_index[2];
>  
>  		MLX5_SET(mkc, mkc, bsf_en, 1);
>  		MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
> +		MLX5_SET(mkc, mkc, translations_octword_size, 4);
>  		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
>  		if (!mr->sig) {
>  			err = -ENOMEM;
> @@ -1714,6 +1787,14 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  		mr->sig->sig_err_exists = false;
>  		/* Next UMR, Arm SIGERR */
>  		++mr->sig->sigerr_count;
> +		if (mr_type == IB_MR_TYPE_INTEGRITY) {
> +			mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg,
> +							max_num_meta_sg);
> +			if (IS_ERR(mr->pi_mr)) {
> +				err = PTR_ERR(mr->pi_mr);
> +				goto err_destroy_psv;
> +			}
> +		}
>  	} else {
>  		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
>  		err = -EINVAL;
> @@ -1727,7 +1808,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  	mr->ibmr.device = pd->device;
>  	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
>  	if (err)
> -		goto err_destroy_psv;
> +		goto err_free_pi_mr;
>  
>  	mr->mmkey.type = MLX5_MKEY_MR;
>  	mr->ibmr.lkey = mr->mmkey.key;
> @@ -1737,6 +1818,11 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  
>  	return &mr->ibmr;
>  
> +err_free_pi_mr:
> +	if (mr->pi_mr) {
> +		dereg_mr(to_mdev(mr->pi_mr->ibmr.device), mr->pi_mr);
> +		mr->pi_mr = NULL;
> +	}
>  err_destroy_psv:
>  	if (mr->sig) {
>  		if (mlx5_core_destroy_psv(dev->mdev,
> @@ -1758,6 +1844,19 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
>  	return ERR_PTR(err);
>  }
>  
> +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
> +			       u32 max_num_sg, struct ib_udata *udata)
> +{
> +	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
> +}
> +
> +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
> +					 u32 max_num_sg, u32 max_num_meta_sg)
> +{
> +	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
> +				  max_num_meta_sg);
> +}
> +
>  struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
>  			       struct ib_udata *udata)
>  {
> @@ -1890,13 +1989,16 @@ static int
>  mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
>  		   struct scatterlist *sgl,
>  		   unsigned short sg_nents,
> -		   unsigned int *sg_offset_p)
> +		   unsigned int *sg_offset_p,
> +		   struct scatterlist *meta_sgl,
> +		   unsigned short meta_sg_nents,
> +		   unsigned int *meta_sg_offset_p)
>  {
>  	struct scatterlist *sg = sgl;
>  	struct mlx5_klm *klms = mr->descs;
>  	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
>  	u32 lkey = mr->ibmr.pd->local_dma_lkey;
> -	int i;
> +	int i, j = 0;
>  
>  	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
>  	mr->ibmr.length = 0;
> @@ -1911,12 +2013,36 @@ mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
>  
>  		sg_offset = 0;
>  	}
> -	mr->ndescs = i;
>  
>  	if (sg_offset_p)
>  		*sg_offset_p = sg_offset;
>  
> -	return i;
> +	mr->ndescs = i;
> +	mr->data_length = mr->ibmr.length;
> +
> +	if (meta_sg_nents) {
> +		sg = meta_sgl;
> +		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
> +		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
> +			if (unlikely(i + j >= mr->max_descs))
> +				break;
> +			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
> +						     sg_offset);
> +			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
> +							 sg_offset);
> +			klms[i + j].key = cpu_to_be32(lkey);
> +			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
> +
> +			sg_offset = 0;
> +		}
> +		if (meta_sg_offset_p)
> +			*meta_sg_offset_p = sg_offset;
> +
> +		mr->meta_ndescs = j;
> +		mr->meta_length = mr->ibmr.length - mr->data_length;
> +	}
> +
> +	return i + j;
>  }
>  
>  static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
> @@ -1933,6 +2059,41 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
>  	return 0;
>  }
>  
> +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
> +			 int data_sg_nents, unsigned int *data_sg_offset,
> +			 struct scatterlist *meta_sg, int meta_sg_nents,
> +			 unsigned int *meta_sg_offset)
> +{
> +	struct mlx5_ib_mr *mr = to_mmr(ibmr);
> +	struct mlx5_ib_mr *pi_mr = mr->pi_mr;
> +	int n;
> +
> +	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
> +
> +	pi_mr->ndescs = 0;
> +	pi_mr->meta_ndescs = 0;
> +	pi_mr->meta_length = 0;
> +
> +	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
> +				   pi_mr->desc_size * pi_mr->max_descs,
> +				   DMA_TO_DEVICE);
> +
> +	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
> +			       meta_sg, meta_sg_nents, meta_sg_offset);
> +
> +	/* This is zero-based memory region */
> +	pi_mr->ibmr.iova = 0;

Are you aware that Yuval enabled zero-based MRs from rdma-core with the
following patch?

https://marc.info/?l=linux-rdma&m=155919637918880&w=2

> +	ibmr->length = pi_mr->ibmr.length;
> +	ibmr->iova = pi_mr->ibmr.iova;
> +	ibmr->sig_attrs->meta_length = pi_mr->meta_length;
> +
> +	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
> +				      pi_mr->desc_size * pi_mr->max_descs,
> +				      DMA_TO_DEVICE);
> +
> +	return n;
> +}
> +
>  int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
>  		      unsigned int *sg_offset)
>  {
> @@ -1946,7 +2107,8 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
>  				   DMA_TO_DEVICE);
>  
>  	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
> -		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
> +		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
> +				       NULL);
>  	else
>  		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
>  				mlx5_set_page);
> -- 
> 2.16.3
>
Max Gurtovoy June 2, 2019, 2:22 p.m. UTC | #2
On 6/2/2019 12:13 PM, Shamir Rabinovitch wrote:
> On Thu, May 30, 2019 at 04:25:17PM +0300, Max Gurtovoy wrote:
>> [...]
>> +	/* This is zero-based memory region */
>> +	pi_mr->ibmr.iova = 0;
> Are you aware that Yuval enabled zero based mr from rdma-core with the
> follow patch?
>
> https://marc.info/?l=linux-rdma&m=155919637918880&w=2

Well, AFAIU, his API uses a user-space reg-mr that is mapped to some
device->ops.reg_user_mr.

This series is kernel only. In user-land there are no MR mappings and all
the MTTs are set during reg-mr.

In the kernel we just allocate the mkey tables and update the addresses at
runtime using a UMR operation.

What is your concern here?
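
(For context, a rough sketch of the kernel-side flow referred to above: the
mkey is allocated up front and its translation entries are filled at runtime
by posting a registration work request, which mlx5 executes as a UMR. This is
the standard kernel fast-registration path, shown here only for illustration;
QP/CQ setup and completion handling are omitted.)

static int ulp_fast_reg(struct ib_qp *qp, struct ib_mr *mr,
			struct scatterlist *sg, int sg_nents)
{
	struct ib_reg_wr reg_wr = {};
	int n;

	/* Fill the MR's private descriptor table (MTTs/KLMs) on the CPU. */
	n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
	if (n < sg_nents)
		return n < 0 ? n : -EINVAL;

	/* Post the registration; the mlx5 driver turns it into a UMR WQE. */
	reg_wr.wr.opcode = IB_WR_REG_MR;
	reg_wr.mr = mr;
	reg_wr.key = mr->rkey;
	reg_wr.access = IB_ACCESS_LOCAL_WRITE |
			IB_ACCESS_REMOTE_READ |
			IB_ACCESS_REMOTE_WRITE;

	return ib_post_send(qp, &reg_wr.wr, NULL);
}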
Shamir Rabinovitch June 3, 2019, 6:51 a.m. UTC | #3
On Sun, Jun 02, 2019 at 05:22:18PM +0300, Max Gurtovoy wrote:
> 
> On 6/2/2019 12:13 PM, Shamir Rabinovitch wrote:
> > On Thu, May 30, 2019 at 04:25:17PM +0300, Max Gurtovoy wrote:
> > > [...]
> > > +	/* This is zero-based memory region */
> > > +	pi_mr->ibmr.iova = 0;
> > Are you aware that Yuval enabled zero based mr from rdma-core with the
> > follow patch?
> > 
> > https://marc.info/?l=linux-rdma&m=155919637918880&w=2
> 
> Well, AFAIU, his API uses user space reg-mr that mapped to some
> device->ops.reg_user_mr.
> 
> This series is kernel only. In user-land there is no MR mappings and all the
> MTTs are set during reg-mr.
> 
> In kernel we just allocate the mkey tables and update the addresses in
> runtime using UMR operation.
> 
> what is your concern here ?
> 
> 

Reading this comment from a prior patch in this series gave the impression
that you deal with user MRs: "Also introduce new mr types IB_MR_TYPE_USER".

If we need (user) zero-based MRs, Yuval's patch (rdma-core) can help.

Just wanted to know whether you are aware of this change.
Max Gurtovoy June 3, 2019, 9:03 a.m. UTC | #4
On 6/3/2019 9:51 AM, Shamir Rabinovitch wrote:
> On Sun, Jun 02, 2019 at 05:22:18PM +0300, Max Gurtovoy wrote:
>> On 6/2/2019 12:13 PM, Shamir Rabinovitch wrote:
>>> On Thu, May 30, 2019 at 04:25:17PM +0300, Max Gurtovoy wrote:
>>>> [...]
>>>> +	/* This is zero-based memory region */
>>>> +	pi_mr->ibmr.iova = 0;
>>> Are you aware that Yuval enabled zero-based MRs from rdma-core with the
>>> following patch?
>>>
>>> https://marc.info/?l=linux-rdma&m=155919637918880&w=2
>> Well, AFAIU, his API uses a user-space reg-mr that is mapped to some
>> device->ops.reg_user_mr.
>>
>> This series is kernel only. In user-land there are no MR mappings and all the
>> MTTs are set during reg-mr.
>>
>> In the kernel we just allocate the mkey tables and update the addresses at
>> runtime using a UMR operation.
>>
>> What is your concern here?
>>
>>
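For illustration, the kernel-only flow described above could look roughly like
this from a ULP's point of view. This is a minimal sketch only:
ib_alloc_mr_integrity()/ib_map_mr_sg_pi() are the core wrappers added earlier
in this series, and their exact signatures and return convention (number of
mapped descriptors, as the mlx5 map_mr_sg_pi op in this patch returns) are
assumed here.

/*
 * Sketch only: allocate once, map per I/O.  Assumes the core wrappers
 * introduced earlier in this series and that the map call returns the
 * number of mapped descriptors, like the mlx5 op in this patch.
 */
static struct ib_mr *example_setup_pi_mr(struct ib_pd *pd,
					 struct scatterlist *data_sg, int data_nents,
					 struct scatterlist *meta_sg, int meta_nents)
{
	struct ib_mr *mr;
	int n;

	/* Allocation time: the driver only builds the internal mkey/KLM tables. */
	mr = ib_alloc_mr_integrity(pd, data_nents, meta_nents);
	if (IS_ERR(mr))
		return mr;

	/*
	 * I/O time: pack both DMA-mapped SG lists into the internal KLM-mode
	 * MR; the addresses reach the HW later via a UMR operation.
	 */
	n = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
			    meta_sg, meta_nents, NULL);
	if (n != data_nents + meta_nents) {
		ib_dereg_mr(mr);
		return ERR_PTR(-EINVAL);
	}

	return mr;
}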
> Reading this comment from a prior patch in this series gave the impression
> that you deal with user MRs: "Also introduce new mr types IB_MR_TYPE_USER".
>
> If we need (user) zero-based MRs, Yuval's patch (rdma-core) can help.
>
> Just wanted to know if you are aware of this change.

Thanks for the notification, I think we're good here from a correctness
perspective.

The MR type (IB_MR_TYPE_USER included) hints to the LLD which resources
to allocate. It also helps enforce the correct behavior (e.g. if one
tries to map an MR of type IB_MR_TYPE_USER for integrity, it will fail).

It can also enforce the correct behavior for userspace MRs (but that is
not included in this already long series of patches).
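
A rough illustration of that enforcement (hypothetical helper, not code from
this series):

/*
 * Hypothetical helper, not part of this series: a core map-for-integrity
 * path can simply reject MRs that were not allocated as integrity MRs.
 */
static int example_check_mr_type_for_pi(struct ib_mr *mr)
{
	/* e.g. an IB_MR_TYPE_USER or IB_MR_TYPE_MEM_REG MR fails here */
	if (mr->type != IB_MR_TYPE_INTEGRITY)
		return -EINVAL;

	return 0;
}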
Christoph Hellwig June 4, 2019, 7:38 a.m. UTC | #5
>  int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
>  {
> -	dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
> +	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
> +
> +	if (ibmr->type == IB_MR_TYPE_INTEGRITY)
> +		dereg_mr(to_mdev(mmr->pi_mr->ibmr.device), mmr->pi_mr);
> +
> +	dereg_mr(to_mdev(ibmr->device), mmr);

Just curious: how could the device for the PI MR be different?  In other
words, why can't this just be:

	struct mlx5_ib_mr *mmr = to_mmr(ibmr);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY)
		mmr = mmr->pi_mr;
	dereg_mr(to_mdev(ibmr->device), mmr);
Max Gurtovoy June 4, 2019, 8:36 a.m. UTC | #6
On 6/4/2019 10:38 AM, Christoph Hellwig wrote:
>>   int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
>>   {
>> -	dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
>> +	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
>> +
>> +	if (ibmr->type == IB_MR_TYPE_INTEGRITY)
>> +		dereg_mr(to_mdev(mmr->pi_mr->ibmr.device), mmr->pi_mr);
>> +
>> +	dereg_mr(to_mdev(ibmr->device), mmr);
> Just curious: how could the device for the PI MR be different?  In other
> words, why can't this just be:

The device is not different, but we need to do both dereg_mr(mdev, mmr->pi_mr)
and dereg_mr(mdev, mmr).

>
> 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
>
> 	if (ibmr->type == IB_MR_TYPE_INTEGRITY)
> 		mmr = mmr->pi_mr;
> 	dereg_mr(to_mdev(ibmr->device), mmr);
>
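
For illustration, the suggestion could be adapted to keep both deregistrations
while still doing a single device lookup. A sketch only, using the same helpers
as the patch:

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);

	/* pi_mr lives on the same device, so one to_mdev() lookup is enough */
	if (ibmr->type == IB_MR_TYPE_INTEGRITY)
		dereg_mr(dev, mmr->pi_mr);

	dereg_mr(dev, mmr);

	return 0;
}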
Sagi Grimberg June 5, 2019, 7:26 p.m. UTC | #7
Looks reasonable to me, other than the return status question I had,

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

Patch

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index abac70ad5c7c..b6588cdef1cf 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6126,6 +6126,7 @@  static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
 static const struct ib_device_ops mlx5_ib_dev_ops = {
 	.add_gid = mlx5_ib_add_gid,
 	.alloc_mr = mlx5_ib_alloc_mr,
+	.alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
 	.alloc_pd = mlx5_ib_alloc_pd,
 	.alloc_ucontext = mlx5_ib_alloc_ucontext,
 	.attach_mcast = mlx5_ib_mcg_attach,
@@ -6155,6 +6156,7 @@  static const struct ib_device_ops mlx5_ib_dev_ops = {
 	.get_dma_mr = mlx5_ib_get_dma_mr,
 	.get_link_layer = mlx5_ib_port_link_layer,
 	.map_mr_sg = mlx5_ib_map_mr_sg,
+	.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
 	.mmap = mlx5_ib_mmap,
 	.modify_cq = mlx5_ib_modify_cq,
 	.modify_device = mlx5_ib_modify_device,
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 40eb8be482e4..07bac37c3450 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -587,6 +587,9 @@  struct mlx5_ib_mr {
 	void			*descs;
 	dma_addr_t		desc_map;
 	int			ndescs;
+	int			data_length;
+	int			meta_ndescs;
+	int			meta_length;
 	int			max_descs;
 	int			desc_size;
 	int			access_mode;
@@ -605,6 +608,7 @@  struct mlx5_ib_mr {
 	int			access_flags; /* Needed for rereg MR */
 
 	struct mlx5_ib_mr      *parent;
+	struct mlx5_ib_mr      *pi_mr; /* Needed for IB_MR_TYPE_INTEGRITY */
 	atomic_t		num_leaf_free;
 	wait_queue_head_t       q_leaf_free;
 	struct mlx5_async_work  cb_work;
@@ -1148,8 +1152,15 @@  int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 			       u32 max_num_sg, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
+					 u32 max_num_sg,
+					 u32 max_num_meta_sg);
 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
+int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+			 int data_sg_nents, unsigned int *data_sg_offset,
+			 struct scatterlist *meta_sg, int meta_sg_nents,
+			 unsigned int *meta_sg_offset);
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
 			const struct ib_mad_hdr *in, size_t in_mad_size,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 5f09699fab98..6820d80c6a7f 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1639,16 +1639,22 @@  static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
-	dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
+	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
+
+	if (ibmr->type == IB_MR_TYPE_INTEGRITY)
+		dereg_mr(to_mdev(mmr->pi_mr->ibmr.device), mmr->pi_mr);
+
+	dereg_mr(to_mdev(ibmr->device), mmr);
+
 	return 0;
 }
 
-struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata)
+static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
+				u32 max_num_sg, u32 max_num_meta_sg)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	int ndescs = ALIGN(max_num_sg, 4);
+	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
 	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
@@ -1670,8 +1676,72 @@  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 
+	mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
+
+	err = mlx5_alloc_priv_descs(pd->device, mr,
+				    ndescs, sizeof(struct mlx5_klm));
+	if (err)
+		goto err_free_in;
+	mr->desc_size = sizeof(struct mlx5_klm);
+	mr->max_descs = ndescs;
+
+	MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7);
+	MLX5_SET(mkc, mkc, umr_en, 1);
+
+	mr->ibmr.pd = pd;
+	mr->ibmr.device = pd->device;
+	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
+	if (err)
+		goto err_priv_descs;
+
+	mr->mmkey.type = MLX5_MKEY_MR;
+	mr->ibmr.lkey = mr->mmkey.key;
+	mr->ibmr.rkey = mr->mmkey.key;
+	mr->umem = NULL;
+	kfree(in);
+
+	return mr;
+
+err_priv_descs:
+	mlx5_free_priv_descs(mr);
+err_free_in:
+	kfree(in);
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
+					enum ib_mr_type mr_type, u32 max_num_sg,
+					u32 max_num_meta_sg)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	int ndescs = ALIGN(max_num_sg, 4);
+	struct mlx5_ib_mr *mr;
+	void *mkc;
+	u32 *in;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, free, 1);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+
 	if (mr_type == IB_MR_TYPE_MEM_REG) {
 		mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+		MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
 		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
 		err = mlx5_alloc_priv_descs(pd->device, mr,
 					    ndescs, sizeof(struct mlx5_mtt));
@@ -1682,6 +1752,7 @@  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 		mr->max_descs = ndescs;
 	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
 		mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
+		MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
 
 		err = mlx5_alloc_priv_descs(pd->device, mr,
 					    ndescs, sizeof(struct mlx5_klm));
@@ -1689,11 +1760,13 @@  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 			goto err_free_in;
 		mr->desc_size = sizeof(struct mlx5_klm);
 		mr->max_descs = ndescs;
-	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
+	} else if (mr_type == IB_MR_TYPE_SIGNATURE ||
+		   mr_type == IB_MR_TYPE_INTEGRITY) {
 		u32 psv_index[2];
 
 		MLX5_SET(mkc, mkc, bsf_en, 1);
 		MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
+		MLX5_SET(mkc, mkc, translations_octword_size, 4);
 		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
 		if (!mr->sig) {
 			err = -ENOMEM;
@@ -1714,6 +1787,14 @@  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 		mr->sig->sig_err_exists = false;
 		/* Next UMR, Arm SIGERR */
 		++mr->sig->sigerr_count;
+		if (mr_type == IB_MR_TYPE_INTEGRITY) {
+			mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg,
+							max_num_meta_sg);
+			if (IS_ERR(mr->pi_mr)) {
+				err = PTR_ERR(mr->pi_mr);
+				goto err_destroy_psv;
+			}
+		}
 	} else {
 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
 		err = -EINVAL;
@@ -1727,7 +1808,7 @@  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 	mr->ibmr.device = pd->device;
 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
 	if (err)
-		goto err_destroy_psv;
+		goto err_free_pi_mr;
 
 	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->ibmr.lkey = mr->mmkey.key;
@@ -1737,6 +1818,11 @@  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 
 	return &mr->ibmr;
 
+err_free_pi_mr:
+	if (mr->pi_mr) {
+		dereg_mr(to_mdev(mr->pi_mr->ibmr.device), mr->pi_mr);
+		mr->pi_mr = NULL;
+	}
 err_destroy_psv:
 	if (mr->sig) {
 		if (mlx5_core_destroy_psv(dev->mdev,
@@ -1758,6 +1844,19 @@  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 	return ERR_PTR(err);
 }
 
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			       u32 max_num_sg, struct ib_udata *udata)
+{
+	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
+}
+
+struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
+					 u32 max_num_sg, u32 max_num_meta_sg)
+{
+	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
+				  max_num_meta_sg);
+}
+
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 			       struct ib_udata *udata)
 {
@@ -1890,13 +1989,16 @@  static int
 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
 		   struct scatterlist *sgl,
 		   unsigned short sg_nents,
-		   unsigned int *sg_offset_p)
+		   unsigned int *sg_offset_p,
+		   struct scatterlist *meta_sgl,
+		   unsigned short meta_sg_nents,
+		   unsigned int *meta_sg_offset_p)
 {
 	struct scatterlist *sg = sgl;
 	struct mlx5_klm *klms = mr->descs;
 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
-	int i;
+	int i, j = 0;
 
 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
 	mr->ibmr.length = 0;
@@ -1911,12 +2013,36 @@  mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
 
 		sg_offset = 0;
 	}
-	mr->ndescs = i;
 
 	if (sg_offset_p)
 		*sg_offset_p = sg_offset;
 
-	return i;
+	mr->ndescs = i;
+	mr->data_length = mr->ibmr.length;
+
+	if (meta_sg_nents) {
+		sg = meta_sgl;
+		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
+		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
+			if (unlikely(i + j >= mr->max_descs))
+				break;
+			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
+						     sg_offset);
+			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
+							 sg_offset);
+			klms[i + j].key = cpu_to_be32(lkey);
+			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
+
+			sg_offset = 0;
+		}
+		if (meta_sg_offset_p)
+			*meta_sg_offset_p = sg_offset;
+
+		mr->meta_ndescs = j;
+		mr->meta_length = mr->ibmr.length - mr->data_length;
+	}
+
+	return i + j;
 }
 
 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
@@ -1933,6 +2059,41 @@  static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
 	return 0;
 }
 
+int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+			 int data_sg_nents, unsigned int *data_sg_offset,
+			 struct scatterlist *meta_sg, int meta_sg_nents,
+			 unsigned int *meta_sg_offset)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct mlx5_ib_mr *pi_mr = mr->pi_mr;
+	int n;
+
+	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
+
+	pi_mr->ndescs = 0;
+	pi_mr->meta_ndescs = 0;
+	pi_mr->meta_length = 0;
+
+	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
+				   pi_mr->desc_size * pi_mr->max_descs,
+				   DMA_TO_DEVICE);
+
+	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
+			       meta_sg, meta_sg_nents, meta_sg_offset);
+
+	/* This is zero-based memory region */
+	pi_mr->ibmr.iova = 0;
+	ibmr->length = pi_mr->ibmr.length;
+	ibmr->iova = pi_mr->ibmr.iova;
+	ibmr->sig_attrs->meta_length = pi_mr->meta_length;
+
+	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
+				      pi_mr->desc_size * pi_mr->max_descs,
+				      DMA_TO_DEVICE);
+
+	return n;
+}
+
 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset)
 {
@@ -1946,7 +2107,8 @@  int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 				   DMA_TO_DEVICE);
 
 	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
-		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
+		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
+				       NULL);
 	else
 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
 				mlx5_set_page);