Message ID | 1550763193-14128-10-git-send-email-galpress@amazon.com |
---|---|
State | Superseded |
Series | RDMA/efa: Elastic Fabric Adapter (EFA) driver |
On 2/21/2019 9:33 AM, Gal Pressman wrote: > Add a file that implements the EFA verbs. > > Signed-off-by: Gal Pressman <galpress@amazon.com> > --- > drivers/infiniband/hw/efa/efa_verbs.c | 1891 +++++++++++++++++++++++++++++++++ > 1 file changed, 1891 insertions(+) > create mode 100644 drivers/infiniband/hw/efa/efa_verbs.c > > diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c > new file mode 100644 > index 000000000000..2bd39119afa6 > --- /dev/null > +++ b/drivers/infiniband/hw/efa/efa_verbs.c > @@ -0,0 +1,1891 @@ > +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause > +/* > + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. > + */ > + > +#include <linux/vmalloc.h> > + > +#include <rdma/ib_addr.h> > +#include <rdma/ib_umem.h> > +#include <rdma/ib_user_verbs.h> > +#include <rdma/ib_verbs.h> > + > +#include "efa.h" > + > +#define EFA_MMAP_FLAG_SHIFT 56 > + > +enum { > + EFA_MMAP_DMA_PAGE = 0, > + EFA_MMAP_IO_WC, > + EFA_MMAP_IO_NC, > +}; > + > +static void set_mmap_flag(u64 *mmap_key, u8 mmap_flag) > +{ > + *mmap_key |= (u64)mmap_flag << EFA_MMAP_FLAG_SHIFT; > +} > + > +static u8 get_mmap_flag(u64 mmap_key) > +{ > + return mmap_key >> EFA_MMAP_FLAG_SHIFT; > +} > + > +#define EFA_AENQ_ENABLED_GROUPS \ > + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ > + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) > + > +struct efa_mmap_entry { > + struct list_head list; > + void *obj; > + u64 address; > + u64 length; > + u64 key; > +}; > + > +#define EFA_PAGE_SHIFT 12 > +#define EFA_PAGE_SIZE BIT(EFA_PAGE_SHIFT) > +#define EFA_PAGE_PTR_SIZE 8 > + > +#define EFA_CHUNK_ALLOC_SIZE BIT(EFA_PAGE_SHIFT) > +#define EFA_CHUNK_PTR_SIZE sizeof(struct efa_com_ctrl_buff_info) > + > +#define EFA_PAGE_PTRS_PER_CHUNK \ > + ((EFA_CHUNK_ALLOC_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_PAGE_PTR_SIZE) > + > +#define EFA_CHUNK_USED_SIZE \ > + ((EFA_PAGE_PTRS_PER_CHUNK * EFA_PAGE_PTR_SIZE) + EFA_CHUNK_PTR_SIZE) > + > +#define EFA_SUPPORTED_ACCESS_FLAGS IB_ACCESS_LOCAL_WRITE > + > +struct pbl_chunk { > + dma_addr_t dma_addr; > + u64 *buf; > + u32 length; > +}; > + > +struct pbl_chunk_list { > + struct pbl_chunk *chunks; > + unsigned int size; > +}; > + > +struct pbl_context { > + union { > + struct { > + dma_addr_t dma_addr; > + } continuous; > + struct { > + u32 pbl_buf_size_in_pages; > + struct scatterlist *sgl; > + int sg_dma_cnt; > + struct pbl_chunk_list chunk_list; > + } indirect; > + } phys; > + u64 *pbl_buf; > + u32 pbl_buf_size_in_bytes; > + u8 physically_continuous; > +}; > + > +static inline struct efa_dev *to_edev(struct ib_device *ibdev) > +{ > + return container_of(ibdev, struct efa_dev, ibdev); > +} > + > +static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext) > +{ > + return container_of(ibucontext, struct efa_ucontext, ibucontext); > +} > + > +static inline struct efa_pd *to_epd(struct ib_pd *ibpd) > +{ > + return container_of(ibpd, struct efa_pd, ibpd); > +} > + > +static inline struct efa_mr *to_emr(struct ib_mr *ibmr) > +{ > + return container_of(ibmr, struct efa_mr, ibmr); > +} > + > +static inline struct efa_qp *to_eqp(struct ib_qp *ibqp) > +{ > + return container_of(ibqp, struct efa_qp, ibqp); > +} > + > +static inline struct efa_cq *to_ecq(struct ib_cq *ibcq) > +{ > + return container_of(ibcq, struct efa_cq, ibcq); > +} > + > +static inline struct efa_ah *to_eah(struct ib_ah *ibah) > +{ > + return container_of(ibah, struct efa_ah, ibah); > +} > + > +#define field_avail(x, fld, sz) 
(offsetof(typeof(x), fld) + \ > + sizeof(((typeof(x) *)0)->fld) <= (sz)) > + > +#define is_reserved_cleared(reserved) \ > + !memchr_inv(reserved, 0, sizeof(reserved)) > + > +static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr, > + size_t size, enum dma_data_direction dir) > +{ > + void *addr; > + > + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); > + if (!addr) > + return NULL; > + > + *dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir); > + if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) { > + efa_err(&dev->ibdev.dev, "Failed to map DMA address\n"); > + free_pages_exact(addr, size); > + return NULL; > + } > + > + return addr; > +} > + > +static void mmap_obj_entries_remove(struct efa_dev *dev, > + struct efa_ucontext *ucontext, void *obj) > +{ > + struct efa_mmap_entry *entry, *tmp; > + > + mutex_lock(&ucontext->lock); > + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { > + if (entry->obj == obj) { > + list_del(&entry->list); > + efa_dbg(&dev->ibdev.dev, > + "mmap: obj[%p] key[0x%llx] addr[0x%llX] len[0x%llX] removed\n", > + entry->obj, entry->key, entry->address, > + entry->length); > + kfree(entry); > + } > + } > + mutex_unlock(&ucontext->lock); > +} > + > +/* > + * Since we don't track munmaps, we can't know when a user stopped using his > + * mmapped buffers. > + * This should be called on dealloc_ucontext in order to drain the mmap entries > + * and free the (unmapped) DMA buffers. > + */ > +static void mmap_entries_remove_free(struct efa_dev *dev, > + struct efa_ucontext *ucontext) > +{ > + struct efa_mmap_entry *entry, *tmp; > + > + mutex_lock(&ucontext->lock); > + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { > + list_del(&entry->list); > + efa_dbg(&dev->ibdev.dev, > + "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n", > + entry->obj, entry->key, entry->address, entry->length); > + if (get_mmap_flag(entry->key) == EFA_MMAP_DMA_PAGE) > + /* DMA mapping is already gone, now free the pages */ > + free_pages_exact(phys_to_virt(entry->address), > + entry->length); > + kfree(entry); > + } > + mutex_unlock(&ucontext->lock); > +} > + > +static struct efa_mmap_entry *mmap_entry_get(struct efa_dev *dev, > + struct efa_ucontext *ucontext, > + u64 key, > + u64 len) > +{ > + struct efa_mmap_entry *entry, *tmp; > + > + mutex_lock(&ucontext->lock); > + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { > + if (entry->key == key && entry->length == len) { > + efa_dbg(&dev->ibdev.dev, > + "mmap: obj[%p] key[0x%llx] addr[0x%llX] len[0x%llX] removed\n", > + entry->obj, key, entry->address, > + entry->length); > + mutex_unlock(&ucontext->lock); > + return entry; > + } > + } > + mutex_unlock(&ucontext->lock); > + > + return NULL; > +} > + > +static void mmap_entry_insert(struct efa_dev *dev, > + struct efa_ucontext *ucontext, > + struct efa_mmap_entry *entry, > + u8 mmap_flag) > +{ > + mutex_lock(&ucontext->lock); > + entry->key = ucontext->mmap_key; > + set_mmap_flag(&entry->key, mmap_flag); > + ucontext->mmap_key += PAGE_SIZE; > + list_add_tail(&entry->list, &ucontext->pending_mmaps); > + efa_dbg(&dev->ibdev.dev, > + "mmap: obj[%p] addr[0x%llx], len[0x%llx], key[0x%llx] inserted\n", > + entry->obj, entry->address, entry->length, entry->key); > + mutex_unlock(&ucontext->lock); > +} > + > +int efa_query_device(struct ib_device *ibdev, > + struct ib_device_attr *props, > + struct ib_udata *udata) > +{ > + struct efa_com_get_device_attr_result *dev_attr; > + struct 
efa_ibv_ex_query_device_resp resp = {}; > + struct efa_dev *dev = to_edev(ibdev); > + int err; > + > + if (udata && udata->inlen && > + !ib_is_udata_cleared(udata, 0, udata->inlen)) { > + efa_err_rl(&ibdev->dev, > + "Incompatible ABI params, udata not cleared\n"); > + return -EINVAL; > + } > + > + dev_attr = &dev->dev_attr; > + > + memset(props, 0, sizeof(*props)); > + props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE; > + props->page_size_cap = dev_attr->page_size_cap; > + props->vendor_id = dev->pdev->vendor; > + props->vendor_part_id = dev->pdev->device; > + props->hw_ver = dev->pdev->subsystem_device; > + props->max_qp = dev_attr->max_qp; > + props->max_cq = dev_attr->max_cq; > + props->max_pd = dev_attr->max_pd; > + props->max_mr = dev_attr->max_mr; > + props->max_ah = dev_attr->max_ah; > + props->max_cqe = dev_attr->max_cq_depth; > + props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth, > + dev_attr->max_rq_depth); > + props->max_send_sge = dev_attr->max_sq_sge; > + props->max_recv_sge = dev_attr->max_rq_sge; > + > + if (udata && udata->outlen) { > + resp.max_sq_sge = dev_attr->max_sq_sge; > + resp.max_rq_sge = dev_attr->max_rq_sge; > + resp.max_sq_wr = dev_attr->max_sq_depth; > + resp.max_rq_wr = dev_attr->max_rq_depth; > + > + err = ib_copy_to_udata(udata, &resp, > + min(sizeof(resp), udata->outlen)); > + if (err) { > + efa_err_rl(&ibdev->dev, > + "Failed to copy udata for query_device\n"); > + return err; > + } > + } > + > + return 0; > +} > + > +int efa_query_port(struct ib_device *ibdev, u8 port, > + struct ib_port_attr *props) > +{ > + struct efa_dev *dev = to_edev(ibdev); > + > + memset(props, 0, sizeof(*props)); > + > + props->lid = 0; > + props->lmc = 1; > + props->sm_lid = 0; > + props->sm_sl = 0; > + > + props->state = IB_PORT_ACTIVE; > + props->phys_state = 5; > + props->port_cap_flags = 0; > + props->gid_tbl_len = 1; > + props->pkey_tbl_len = 1; > + props->bad_pkey_cntr = 0; > + props->qkey_viol_cntr = 0; > + props->active_speed = IB_SPEED_EDR; > + props->active_width = IB_WIDTH_4X; > + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); > + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); > + props->max_msg_sz = dev->mtu; > + props->max_vl_num = 1; > + Since you memset() props to all zeros, should you bother with initializing the zero fields? 
> + return 0; > +} > + > +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, > + int qp_attr_mask, > + struct ib_qp_init_attr *qp_init_attr) > +{ > + struct efa_dev *dev = to_edev(ibqp->device); > + struct efa_com_query_qp_params params = {}; > + struct efa_com_query_qp_result result; > + struct efa_qp *qp = to_eqp(ibqp); > + int err; > + > +#define EFA_QUERY_QP_SUPP_MASK \ > + (IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \ > + IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP) > + > + if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) { > + efa_err(&dev->ibdev.dev, > + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", > + qp_attr_mask, EFA_QUERY_QP_SUPP_MASK); > + return -EOPNOTSUPP; > + } > + > + memset(qp_attr, 0, sizeof(*qp_attr)); > + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); > + > + params.qp_handle = qp->qp_handle; > + err = efa_com_query_qp(dev->edev, ¶ms, &result); > + if (err) > + return err; > + > + qp_attr->qp_state = result.qp_state; > + qp_attr->qkey = result.qkey; > + qp_attr->sq_psn = result.sq_psn; > + qp_attr->sq_draining = result.sq_draining; > + qp_attr->port_num = 1; > + qp_attr->pkey_index = 0; > + > + qp_attr->cap.max_send_wr = qp->max_send_wr; > + qp_attr->cap.max_recv_wr = qp->max_recv_wr; > + qp_attr->cap.max_send_sge = qp->max_send_sge; > + qp_attr->cap.max_recv_sge = qp->max_recv_sge; > + qp_attr->cap.max_inline_data = qp->max_inline_data; > + > + qp_init_attr->qp_type = ibqp->qp_type; > + qp_init_attr->recv_cq = ibqp->recv_cq; > + qp_init_attr->send_cq = ibqp->send_cq; > + qp_init_attr->qp_context = ibqp->qp_context; > + qp_init_attr->cap = qp_attr->cap; > + > + return 0; > +} > + > +int efa_query_gid(struct ib_device *ibdev, u8 port, int index, > + union ib_gid *gid) > +{ > + struct efa_dev *dev = to_edev(ibdev); > + > + memcpy(gid->raw, dev->addr, sizeof(dev->addr)); > + > + return 0; > +} > + > +int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index, > + u16 *pkey) > +{ > + if (index > 0) > + return -EINVAL; > + > + *pkey = 0xffff; > + return 0; > +} > + > +static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn) > +{ > + struct efa_com_dealloc_pd_params params = { > + .pdn = pdn, > + }; > + > + return efa_com_dealloc_pd(dev->edev, ¶ms); > +} > + > +int efa_alloc_pd(struct ib_pd *ibpd, > + struct ib_ucontext *ibucontext, > + struct ib_udata *udata) > +{ > + struct efa_dev *dev = to_edev(ibpd->device); > + struct efa_ibv_alloc_pd_resp resp = {}; > + struct efa_com_alloc_pd_result result; > + struct efa_pd *pd = to_epd(ibpd); > + int err; > + > + if (!udata) { > + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); > + err = -EOPNOTSUPP; > + goto err_out; > + } > + > + if (udata->inlen && > + !ib_is_udata_cleared(udata, 0, udata->inlen)) { > + efa_err_rl(&dev->ibdev.dev, > + "Incompatible ABI params, udata not cleared\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + err = efa_com_alloc_pd(dev->edev, &result); > + if (err) > + goto err_out; > + > + pd->pdn = result.pdn; > + resp.pdn = result.pdn; > + > + if (udata->outlen) { > + err = ib_copy_to_udata(udata, &resp, > + min(sizeof(resp), udata->outlen)); > + if (err) { > + efa_err_rl(&dev->ibdev.dev, > + "Failed to copy udata for alloc_pd\n"); > + goto err_dealloc_pd; > + } > + } > + > + efa_dbg(&dev->ibdev.dev, "Allocated pd[%d]\n", pd->pdn); > + > + return 0; > + > +err_dealloc_pd: > + efa_pd_dealloc(dev, result.pdn); > +err_out: > + efa_stat_inc(dev, dev->stats.sw_stats.alloc_pd_err); > + return err; > +} > + > +void efa_dealloc_pd(struct ib_pd *ibpd) > +{ > + struct efa_dev *dev = 
to_edev(ibpd->device); > + struct efa_pd *pd = to_epd(ibpd); > + > + efa_dbg(&dev->ibdev.dev, "Dealloc pd[%d]\n", pd->pdn); > + efa_pd_dealloc(dev, pd->pdn); > +} > + > +int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle) > +{ > + struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle }; > + > + return efa_com_destroy_qp(dev->edev, ¶ms); > +} > + > +int efa_destroy_qp(struct ib_qp *ibqp) > +{ > + struct efa_dev *dev = to_edev(ibqp->pd->device); > + struct efa_qp *qp = to_eqp(ibqp); > + > + efa_dbg(&dev->ibdev.dev, "Destroy qp[%u]\n", ibqp->qp_num); > + efa_destroy_qp_handle(dev, qp->qp_handle); > + > + if (qp->rq_cpu_addr) { > + efa_dbg(&dev->ibdev.dev, > + "qp->cpu_addr[%p] freed: size[%lu], dma[%pad]\n", > + qp->rq_cpu_addr, qp->rq_size, > + &qp->rq_dma_addr); > + dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size, > + DMA_TO_DEVICE); > + } > + > + kfree(qp); > + return 0; > +} > + > +static int qp_mmap_entries_setup(struct efa_qp *qp, > + struct efa_dev *dev, > + struct efa_ucontext *ucontext, > + struct efa_com_create_qp_params *params, > + struct efa_ibv_create_qp_resp *resp) > +{ > + struct efa_mmap_entry *rq_db_entry; > + struct efa_mmap_entry *sq_db_entry; > + struct efa_mmap_entry *rq_entry; > + struct efa_mmap_entry *sq_entry; > + > + sq_db_entry = kzalloc(sizeof(*sq_db_entry), GFP_KERNEL); > + sq_entry = kzalloc(sizeof(*sq_entry), GFP_KERNEL); > + if (!sq_db_entry || !sq_entry) > + goto err_alloc; > + > + if (qp->rq_size) { > + rq_entry = kzalloc(sizeof(*rq_entry), GFP_KERNEL); > + rq_db_entry = kzalloc(sizeof(*rq_db_entry), GFP_KERNEL); > + if (!rq_entry || !rq_db_entry) > + goto err_alloc_rq; > + > + rq_db_entry->obj = qp; > + rq_entry->obj = qp; > + > + rq_entry->address = virt_to_phys(qp->rq_cpu_addr); > + rq_entry->length = qp->rq_size; > + mmap_entry_insert(dev, ucontext, rq_entry, EFA_MMAP_DMA_PAGE); > + resp->rq_mmap_key = rq_entry->key; > + resp->rq_mmap_size = qp->rq_size; > + > + rq_db_entry->address = dev->db_bar_addr + > + resp->rq_db_offset; > + rq_db_entry->length = PAGE_SIZE; > + mmap_entry_insert(dev, ucontext, rq_db_entry, EFA_MMAP_IO_NC); > + resp->rq_db_mmap_key = rq_db_entry->key; > + resp->rq_db_offset &= ~PAGE_MASK; > + } > + > + sq_db_entry->obj = qp; > + sq_entry->obj = qp; > + > + sq_db_entry->address = dev->db_bar_addr + resp->sq_db_offset; > + resp->sq_db_offset &= ~PAGE_MASK; > + sq_db_entry->length = PAGE_SIZE; > + mmap_entry_insert(dev, ucontext, sq_db_entry, EFA_MMAP_IO_NC); > + resp->sq_db_mmap_key = sq_db_entry->key; > + > + sq_entry->address = dev->mem_bar_addr + resp->llq_desc_offset; > + resp->llq_desc_offset &= ~PAGE_MASK; > + sq_entry->length = PAGE_ALIGN(params->sq_ring_size_in_bytes + > + resp->llq_desc_offset); > + mmap_entry_insert(dev, ucontext, sq_entry, EFA_MMAP_IO_WC); > + resp->llq_desc_mmap_key = sq_entry->key; > + > + return 0; > + > +err_alloc_rq: > + kfree(rq_entry); > + kfree(rq_db_entry); > +err_alloc: > + kfree(sq_entry); > + kfree(sq_db_entry); > + return -ENOMEM; > +} > + > +static int efa_qp_validate_cap(struct efa_dev *dev, > + struct ib_qp_init_attr *init_attr) > +{ > + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { > + efa_err(&dev->ibdev.dev, > + "qp: requested send wr[%u] exceeds the max[%u]\n", > + init_attr->cap.max_send_wr, > + dev->dev_attr.max_sq_depth); > + return -EINVAL; > + } > + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { > + efa_err(&dev->ibdev.dev, > + "qp: requested receive wr[%u] exceeds the max[%u]\n", > + init_attr->cap.max_recv_wr, 
> + dev->dev_attr.max_rq_depth); > + return -EINVAL; > + } > + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { > + efa_err(&dev->ibdev.dev, > + "qp: requested sge send[%u] exceeds the max[%u]\n", > + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); > + return -EINVAL; > + } > + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { > + efa_err(&dev->ibdev.dev, > + "qp: requested sge recv[%u] exceeds the max[%u]\n", > + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); > + return -EINVAL; > + } > + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { > + efa_err(&dev->ibdev.dev, > + "requested inline data[%u] exceeds the max[%u]\n", > + init_attr->cap.max_inline_data, > + dev->dev_attr.inline_buf_size); > + return -EINVAL; > + } > + Should all these efa_err() calls really be efa_dbg()s? That's a lot of log polluting for user errors. > + return 0; > +} > + > +static int efa_qp_validate_attr(struct efa_dev *dev, > + struct ib_qp_init_attr *init_attr) > +{ > + if (init_attr->qp_type != IB_QPT_DRIVER && > + init_attr->qp_type != IB_QPT_UD) { > + efa_err(&dev->ibdev.dev, > + "Unsupported qp type %d\n", init_attr->qp_type); > + return -EOPNOTSUPP; > + } > + > + if (init_attr->srq) { > + efa_err(&dev->ibdev.dev, "SRQ is not supported\n"); > + return -EOPNOTSUPP; > + } > + > + if (init_attr->create_flags) { > + efa_err(&dev->ibdev.dev, "Unsupported create flags\n"); > + return -EOPNOTSUPP; > + } > + > + return 0; > +} > + > +struct ib_qp *efa_create_qp(struct ib_pd *ibpd, > + struct ib_qp_init_attr *init_attr, > + struct ib_udata *udata) > +{ > + struct efa_com_create_qp_params create_qp_params = {}; > + struct efa_com_create_qp_result create_qp_resp; > + struct efa_dev *dev = to_edev(ibpd->device); > + struct efa_ibv_create_qp_resp resp = {}; > + struct efa_ibv_create_qp cmd = {}; > + struct efa_ucontext *ucontext; > + struct efa_qp *qp; > + int err; > + > + ucontext = ibpd->uobject ? 
to_eucontext(ibpd->uobject->context) : > + NULL; > + > + if (!udata) { > + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); > + err = -EOPNOTSUPP; > + goto err_out; > + } > + > + err = efa_qp_validate_cap(dev, init_attr); > + if (err) > + goto err_out; > + > + err = efa_qp_validate_attr(dev, init_attr); > + if (err) > + goto err_out; > + > + if (!field_avail(cmd, driver_qp_type, udata->inlen)) { > + efa_err_rl(&dev->ibdev.dev, > + "Incompatible ABI params, no input udata\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + if (udata->inlen > sizeof(cmd) && > + !ib_is_udata_cleared(udata, sizeof(cmd), > + udata->inlen - sizeof(cmd))) { > + efa_err_rl(&dev->ibdev.dev, > + "Incompatible ABI params, unknown fields in udata\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + err = ib_copy_from_udata(&cmd, udata, > + min(sizeof(cmd), udata->inlen)); > + if (err) { > + efa_err_rl(&dev->ibdev.dev, > + "Cannot copy udata for create_qp\n"); > + goto err_out; > + } > + > + if (cmd.comp_mask) { > + efa_err_rl(&dev->ibdev.dev, > + "Incompatible ABI params, unknown fields in udata\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + qp = kzalloc(sizeof(*qp), GFP_KERNEL); > + if (!qp) { > + err = -ENOMEM; > + goto err_out; > + } > + > + create_qp_params.uarn = ucontext->uarn; > + create_qp_params.pd = to_epd(ibpd)->pdn; > + > + if (init_attr->qp_type == IB_QPT_UD) { > + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; > + } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { > + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; > + } else { > + efa_err(&dev->ibdev.dev, > + "Unsupported qp type %d driver qp type %d\n", > + init_attr->qp_type, cmd.driver_qp_type); > + err = -EOPNOTSUPP; > + goto err_free_qp; > + } > + > + efa_dbg(&dev->ibdev.dev, "Create QP: qp type %d driver qp type %#x\n", > + init_attr->qp_type, cmd.driver_qp_type); > + create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; > + create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; > + create_qp_params.sq_depth = init_attr->cap.max_send_wr; > + create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; > + > + create_qp_params.rq_depth = init_attr->cap.max_recv_wr; > + create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size; > + qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); > + if (qp->rq_size) { > + qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, > + qp->rq_size, DMA_TO_DEVICE); > + if (!qp->rq_cpu_addr) { > + err = -ENOMEM; > + goto err_free_qp; > + } > + > + efa_dbg(&dev->ibdev.dev, > + "qp->cpu_addr[%p] allocated: size[%lu], dma[%pad]\n", > + qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); > + create_qp_params.rq_base_addr = qp->rq_dma_addr; > + } > + > + memset(&resp, 0, sizeof(resp)); > + err = efa_com_create_qp(dev->edev, &create_qp_params, > + &create_qp_resp); > + if (err) > + goto err_free_mapped; > + > + WARN_ON_ONCE(create_qp_resp.sq_db_offset > dev->db_bar_len); > + WARN_ON_ONCE(create_qp_resp.rq_db_offset > dev->db_bar_len); > + WARN_ON_ONCE(create_qp_resp.llq_descriptors_offset > > + dev->mem_bar_len); > + > + resp.sq_db_offset = create_qp_resp.sq_db_offset; > + resp.rq_db_offset = create_qp_resp.rq_db_offset; > + resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset; > + resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx; > + resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx; > + > + err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params, > + &resp); > + if (err) > + goto err_destroy_qp; > + > + qp->qp_handle = 
create_qp_resp.qp_handle; > + qp->ibqp.qp_num = create_qp_resp.qp_num; > + qp->ibqp.qp_type = init_attr->qp_type; > + qp->max_send_wr = init_attr->cap.max_send_wr; > + qp->max_recv_wr = init_attr->cap.max_recv_wr; > + qp->max_send_sge = init_attr->cap.max_send_sge; > + qp->max_recv_sge = init_attr->cap.max_recv_sge; > + qp->max_inline_data = init_attr->cap.max_inline_data; > + > + if (udata->outlen) { > + err = ib_copy_to_udata(udata, &resp, > + min(sizeof(resp), udata->outlen)); > + if (err) { > + efa_err_rl(&dev->ibdev.dev, > + "Failed to copy udata for qp[%u]\n", > + create_qp_resp.qp_num); > + goto err_mmap_remove; > + } > + } > + > + efa_dbg(&dev->ibdev.dev, "Created qp[%d]\n", qp->ibqp.qp_num); > + > + return &qp->ibqp; > + > +err_mmap_remove: > + mmap_obj_entries_remove(dev, ucontext, qp); > +err_destroy_qp: > + efa_destroy_qp_handle(dev, create_qp_resp.qp_handle); > +err_free_mapped: > + if (qp->rq_size) { > + dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size, > + DMA_TO_DEVICE); > + free_pages_exact(qp->rq_cpu_addr, qp->rq_size); > + } > +err_free_qp: > + kfree(qp); > +err_out: > + efa_stat_inc(dev, dev->stats.sw_stats.create_qp_err); > + return ERR_PTR(err); > +} > + > +static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp, > + struct ib_qp_attr *qp_attr, int qp_attr_mask, > + enum ib_qp_state cur_state, > + enum ib_qp_state new_state) > +{ > +#define EFA_MODIFY_QP_SUPP_MASK \ > + (IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \ > + IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN) > + > + if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) { > + efa_err(&dev->ibdev.dev, > + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", > + qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK); > + return -EOPNOTSUPP; > + } > + > + if (!ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, > + qp_attr_mask)) { > + efa_err(&dev->ibdev.dev, "Invalid modify QP parameters\n"); > + return -EINVAL; > + } > + > + if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) { > + efa_err(&dev->ibdev.dev, "Can't change port num\n"); > + return -EOPNOTSUPP; > + } > + > + if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) { > + efa_err(&dev->ibdev.dev, "Can't change pkey index\n"); > + return -EOPNOTSUPP; > + } > + > + return 0; > +} > + > +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, > + int qp_attr_mask, struct ib_udata *udata) > +{ > + struct efa_dev *dev = to_edev(ibqp->device); > + struct efa_com_modify_qp_params params = {}; > + struct efa_qp *qp = to_eqp(ibqp); > + enum ib_qp_state cur_state; > + enum ib_qp_state new_state; > + int err; > + > + if (!udata) { > + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); > + return -EOPNOTSUPP; > + } > + > + if (udata->inlen && > + !ib_is_udata_cleared(udata, 0, udata->inlen)) { > + efa_err_rl(&dev->ibdev.dev, > + "Incompatible ABI params, udata not cleared\n"); > + return -EINVAL; > + } > + > + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state : > + qp->state; > + new_state = qp_attr_mask & IB_QP_STATE ? 
qp_attr->qp_state : cur_state; > + > + err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state, > + new_state); > + if (err) > + return err; > + > + params.qp_handle = qp->qp_handle; > + > + if (qp_attr_mask & IB_QP_STATE) { > + params.modify_mask |= BIT(EFA_ADMIN_QP_STATE_BIT) | > + BIT(EFA_ADMIN_CUR_QP_STATE_BIT); > + params.cur_qp_state = qp_attr->cur_qp_state; > + params.qp_state = qp_attr->qp_state; > + } > + > + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { > + params.modify_mask |= > + BIT(EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT); > + params.sq_drained_async_notify = qp_attr->en_sqd_async_notify; > + } > + > + if (qp_attr_mask & IB_QP_QKEY) { > + params.modify_mask |= BIT(EFA_ADMIN_QKEY_BIT); > + params.qkey = qp_attr->qkey; > + } > + > + if (qp_attr_mask & IB_QP_SQ_PSN) { > + params.modify_mask |= BIT(EFA_ADMIN_SQ_PSN_BIT); > + params.sq_psn = qp_attr->sq_psn; > + } > + > + err = efa_com_modify_qp(dev->edev, ¶ms); > + if (err) > + return err; > + > + qp->state = new_state; > + > + return 0; > +} > + > +static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx) > +{ > + struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx }; > + > + return efa_com_destroy_cq(dev->edev, ¶ms); > +} > + > +int efa_destroy_cq(struct ib_cq *ibcq) > +{ > + struct efa_dev *dev = to_edev(ibcq->device); > + struct efa_cq *cq = to_ecq(ibcq); > + > + efa_dbg(&dev->ibdev.dev, > + "Destroy cq[%d] virt[%p] freed: size[%lu], dma[%pad]\n", > + cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); > + > + efa_destroy_cq_idx(dev, cq->cq_idx); > + > + dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, > + DMA_FROM_DEVICE); > + > + kfree(cq); > + return 0; > +} > + > +static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, > + struct efa_ibv_create_cq_resp *resp) > +{ > + struct efa_mmap_entry *cq_entry; > + > + cq_entry = kzalloc(sizeof(*cq_entry), GFP_KERNEL); > + if (!cq_entry) > + return -ENOMEM; > + > + cq_entry->obj = cq; > + > + cq_entry->address = virt_to_phys(cq->cpu_addr); > + cq_entry->length = cq->size; > + mmap_entry_insert(dev, cq->ucontext, cq_entry, EFA_MMAP_DMA_PAGE); > + resp->q_mmap_key = cq_entry->key; > + resp->q_mmap_size = cq_entry->length; > + > + return 0; > +} > + > +static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, > + int vector, struct ib_ucontext *ibucontext, > + struct ib_udata *udata) > +{ > + struct efa_ibv_create_cq_resp resp = {}; > + struct efa_com_create_cq_params params; > + struct efa_com_create_cq_result result; > + struct efa_dev *dev = to_edev(ibdev); > + struct efa_ibv_create_cq cmd = {}; > + struct efa_cq *cq; > + int err; > + > + efa_dbg(&ibdev->dev, "create_cq entries %d udata %p\n", entries, udata); > + > + if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { > + efa_err(&ibdev->dev, > + "cq: requested entries[%u] non-positive or greater than max[%u]\n", > + entries, dev->dev_attr.max_cq_depth); > + err = -EINVAL; > + goto err_out; > + } > + > + if (!udata) { > + efa_err_rl(&ibdev->dev, "udata is NULL\n"); > + err = -EOPNOTSUPP; > + goto err_out; > + } > + > + if (!field_avail(cmd, num_sub_cqs, udata->inlen)) { > + efa_err_rl(&ibdev->dev, > + "Incompatible ABI params, no input udata\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + if (udata->inlen > sizeof(cmd) && > + !ib_is_udata_cleared(udata, sizeof(cmd), > + udata->inlen - sizeof(cmd))) { > + efa_err_rl(&ibdev->dev, > + "Incompatible ABI params, unknown fields in udata\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + err = 
ib_copy_from_udata(&cmd, udata, > + min(sizeof(cmd), udata->inlen)); > + if (err) { > + efa_err_rl(&ibdev->dev, > + "Cannot copy udata for create_cq\n"); > + goto err_out; > + } > + > + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) { > + efa_err_rl(&ibdev->dev, > + "Incompatible ABI params, unknown fields in udata\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + if (!cmd.cq_entry_size) { > + efa_err(&ibdev->dev, > + "Invalid entry size [%u]\n", cmd.cq_entry_size); > + err = -EINVAL; > + goto err_out; > + } > + > + if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { > + efa_err(&ibdev->dev, > + "Invalid number of sub cqs[%u] expected[%u]\n", > + cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); > + err = -EINVAL; > + goto err_out; > + } > + > + cq = kzalloc(sizeof(*cq), GFP_KERNEL); > + if (!cq) { > + err = -ENOMEM; > + goto err_out; > + } > + > + memset(&resp, 0, sizeof(resp)); > + cq->ucontext = to_eucontext(ibucontext); > + cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); > + cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, > + DMA_FROM_DEVICE); > + if (!cq->cpu_addr) { > + err = -ENOMEM; > + goto err_free_cq; > + } > + > + params.uarn = cq->ucontext->uarn; > + params.cq_depth = entries; > + params.dma_addr = cq->dma_addr; > + params.entry_size_in_bytes = cmd.cq_entry_size; > + params.num_sub_cqs = cmd.num_sub_cqs; > + err = efa_com_create_cq(dev->edev, ¶ms, &result); > + if (err) > + goto err_free_mapped; > + > + resp.cq_idx = result.cq_idx; > + cq->cq_idx = result.cq_idx; > + cq->ibcq.cqe = result.actual_depth; > + WARN_ON_ONCE(entries != result.actual_depth); > + > + err = cq_mmap_entries_setup(dev, cq, &resp); > + if (err) { > + efa_err(&ibdev->dev, > + "Could not setup cq[%u] mmap entries\n", cq->cq_idx); > + goto err_destroy_cq; > + } > + > + if (udata->outlen) { > + err = ib_copy_to_udata(udata, &resp, > + min(sizeof(resp), udata->outlen)); > + if (err) { > + efa_err_rl(&ibdev->dev, > + "Failed to copy udata for create_cq\n"); > + goto err_mmap_remove; > + } > + } > + > + efa_dbg(&ibdev->dev, > + "Created cq[%d], cq depth[%u]. 
dma[%pad] virt[%p]\n", > + cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr); > + > + return &cq->ibcq; > + > +err_mmap_remove: > + mmap_obj_entries_remove(dev, to_eucontext(ibucontext), cq); > +err_destroy_cq: > + efa_destroy_cq_idx(dev, cq->cq_idx); > +err_free_mapped: > + dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, > + DMA_FROM_DEVICE); > + free_pages_exact(cq->cpu_addr, cq->size); > +err_free_cq: > + kfree(cq); > +err_out: > + efa_stat_inc(dev, dev->stats.sw_stats.create_cq_err); > + return ERR_PTR(err); > +} > + > +struct ib_cq *efa_create_cq(struct ib_device *ibdev, > + const struct ib_cq_init_attr *attr, > + struct ib_ucontext *ibucontext, > + struct ib_udata *udata) > +{ > + return do_create_cq(ibdev, attr->cqe, attr->comp_vector, ibucontext, > + udata); > +} > + > +static int umem_to_page_list(struct efa_dev *dev, > + struct ib_umem *umem, > + u64 *page_list, > + u32 hp_cnt, > + u8 hp_shift) > +{ > + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); > + unsigned int page_idx = 0; > + unsigned int hp_idx = 0; > + struct scatterlist *sg; > + unsigned int entry; > + > + if (umem->page_shift != PAGE_SHIFT) { > + efa_err(&dev->ibdev.dev, > + "umem invalid page shift %d\n", umem->page_shift); > + return -EINVAL; > + } > + > + efa_dbg(&dev->ibdev.dev, "hp_cnt[%u], pages_in_hp[%u]\n", > + hp_cnt, pages_in_hp); > + > + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { > + if (unlikely(sg_dma_len(sg) != PAGE_SIZE)) { > + efa_err(&dev->ibdev.dev, > + "sg_dma_len[%u] != PAGE_SIZE[%lu]\n", > + sg_dma_len(sg), PAGE_SIZE); > + return -EINVAL; > + } > + > + if (page_idx % pages_in_hp == 0) { > + page_list[hp_idx] = sg_dma_address(sg); > + hp_idx++; > + } > + page_idx++; > + } > + > + return 0; > +} > + > +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) > +{ > + struct scatterlist *sglist; > + struct page *pg; > + int i; > + > + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); > + if (!sglist) > + return NULL; > + sg_init_table(sglist, page_cnt); > + for (i = 0; i < page_cnt; i++) { > + pg = vmalloc_to_page(buf); > + if (!pg) > + goto err; > + WARN_ON_ONCE(PageHighMem(pg)); Is this WARN_ON_ONCE() really an error that needs to be handled? 
> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); > + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > + } > + return sglist; > + > +err: > + kfree(sglist); > + return NULL; > +} > + > +/* > + * create a chunk list of physical pages dma addresses from the supplied > + * scatter gather list > + */ > +static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) > +{ > + unsigned int entry, npg_in_sg, chunk_list_size, chunk_idx, page_idx; > + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; > + int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages; > + struct scatterlist *pages_sgl = pbl->phys.indirect.sgl; > + int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; > + struct efa_com_ctrl_buff_info *ctrl_buf; > + u64 *cur_chunk_buf, *prev_chunk_buf; > + struct scatterlist *sg; > + dma_addr_t dma_addr; > + int i; > + > + /* allocate a chunk list that consists of 4KB chunks */ > + chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PAGE_PTRS_PER_CHUNK); > + > + chunk_list->size = chunk_list_size; > + chunk_list->chunks = kcalloc(chunk_list_size, > + sizeof(*chunk_list->chunks), > + GFP_KERNEL); > + if (!chunk_list->chunks) > + return -ENOMEM; > + > + efa_dbg(&dev->ibdev.dev, > + "chunk_list_size[%u] - pages[%u]\n", chunk_list_size, > + page_cnt); > + > + /* allocate chunk buffers: */ > + for (i = 0; i < chunk_list_size; i++) { > + chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_ALLOC_SIZE, > + GFP_KERNEL); > + if (!chunk_list->chunks[i].buf) > + goto chunk_list_dealloc; > + > + chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE; > + } > + chunk_list->chunks[chunk_list_size - 1].length = > + ((page_cnt % EFA_PAGE_PTRS_PER_CHUNK) * EFA_PAGE_PTR_SIZE) + > + EFA_CHUNK_PTR_SIZE; > + > + /* fill the dma addresses of sg list pages to chunks: */ > + chunk_idx = 0; > + page_idx = 0; > + cur_chunk_buf = chunk_list->chunks[0].buf; > + for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) { > + npg_in_sg = sg_dma_len(sg) >> EFA_PAGE_SHIFT; > + for (i = 0; i < npg_in_sg; i++) { > + cur_chunk_buf[page_idx++] = sg_dma_address(sg) + > + (EFA_PAGE_SIZE * i); > + > + if (page_idx == EFA_PAGE_PTRS_PER_CHUNK) { > + chunk_idx++; > + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; > + page_idx = 0; > + } > + } > + } > + > + /* map chunks to dma and fill chunks next ptrs */ > + for (i = chunk_list_size - 1; i >= 0; i--) { > + dma_addr = dma_map_single(&dev->pdev->dev, > + chunk_list->chunks[i].buf, > + chunk_list->chunks[i].length, > + DMA_TO_DEVICE); > + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { > + efa_err(&dev->ibdev.dev, > + "chunk[%u] dma_map_failed\n", i); > + goto chunk_list_unmap; > + } > + > + chunk_list->chunks[i].dma_addr = dma_addr; > + efa_dbg(&dev->ibdev.dev, > + "chunk[%u] mapped at [%pad]\n", i, &dma_addr); > + > + if (!i) > + break; > + > + prev_chunk_buf = chunk_list->chunks[i - 1].buf; > + > + ctrl_buf = (struct efa_com_ctrl_buff_info *) > + &prev_chunk_buf[EFA_PAGE_PTRS_PER_CHUNK]; > + ctrl_buf->length = chunk_list->chunks[i].length; > + > + efa_com_set_dma_addr(dma_addr, > + &ctrl_buf->address.mem_addr_high, > + &ctrl_buf->address.mem_addr_low); > + } > + > + return 0; > + > +chunk_list_unmap: > + for (; i < chunk_list_size; i++) { > + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, > + chunk_list->chunks[i].length, DMA_TO_DEVICE); > + } > +chunk_list_dealloc: > + for (i = 0; i < chunk_list_size; i++) > + kfree(chunk_list->chunks[i].buf); > + > + kfree(chunk_list->chunks); > + return -ENOMEM; > +} > + > +static void pbl_chunk_list_destroy(struct efa_dev 
*dev, struct pbl_context *pbl) > +{ > + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; > + int i; > + > + for (i = 0; i < chunk_list->size; i++) { > + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, > + chunk_list->chunks[i].length, DMA_TO_DEVICE); > + kfree(chunk_list->chunks[i].buf); > + } > + > + kfree(chunk_list->chunks); > +} > + > +/* initialize pbl continuous mode: map pbl buffer to a dma address. */ > +static int pbl_continuous_initialize(struct efa_dev *dev, > + struct pbl_context *pbl) > +{ > + dma_addr_t dma_addr; > + > + dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf, > + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); > + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { > + efa_err(&dev->ibdev.dev, "Unable to map pbl to DMA address\n"); > + return -ENOMEM; > + } > + > + pbl->phys.continuous.dma_addr = dma_addr; > + efa_dbg(&dev->ibdev.dev, > + "pbl continuous - dma_addr = %pad, size[%u]\n", > + &dma_addr, pbl->pbl_buf_size_in_bytes); > + > + return 0; > +} > + > +/* > + * initialize pbl indirect mode: > + * create a chunk list out of the dma addresses of the physical pages of > + * pbl buffer. > + */ > +static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) > +{ > + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, > + EFA_PAGE_SIZE); > + struct scatterlist *sgl; > + int sg_dma_cnt, err; > + > + sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); > + if (!sgl) > + return -ENOMEM; > + > + sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); > + if (!sg_dma_cnt) { > + err = -EINVAL; > + goto err_map; > + } > + > + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; > + pbl->phys.indirect.sgl = sgl; > + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; > + err = pbl_chunk_list_create(dev, pbl); > + if (err) { > + efa_err(&dev->ibdev.dev, > + "chunk_list creation failed[%d]\n", err); > + goto err_chunk; > + } > + > + efa_dbg(&dev->ibdev.dev, > + "pbl indirect - size[%u], chunks[%u]\n", > + pbl->pbl_buf_size_in_bytes, > + pbl->phys.indirect.chunk_list.size); > + > + return 0; > + > +err_chunk: > + dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); > +err_map: > + kfree(sgl); > + return err; > +} > + > +static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) > +{ > + pbl_chunk_list_destroy(dev, pbl); > + dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl, > + pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE); > + kfree(pbl->phys.indirect.sgl); > +} > + > +/* create a page buffer list from a mapped user memory region */ > +static int pbl_create(struct efa_dev *dev, > + struct pbl_context *pbl, > + struct ib_umem *umem, > + int hp_cnt, > + u8 hp_shift) > +{ > + int err; > + > + pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_PAGE_PTR_SIZE; > + pbl->pbl_buf = kzalloc(pbl->pbl_buf_size_in_bytes, > + GFP_KERNEL | __GFP_NOWARN); > + if (pbl->pbl_buf) { > + pbl->physically_continuous = 1; > + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, > + hp_shift); > + if (err) > + goto err_continuous; > + err = pbl_continuous_initialize(dev, pbl); > + if (err) > + goto err_continuous; > + } else { > + pbl->physically_continuous = 0; > + pbl->pbl_buf = vzalloc(pbl->pbl_buf_size_in_bytes); > + if (!pbl->pbl_buf) > + return -ENOMEM; > + > + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, > + hp_shift); > + if (err) > + goto err_indirect; > + err = pbl_indirect_initialize(dev, pbl); > + if (err) > + goto err_indirect; > + } > + > + 
efa_dbg(&dev->ibdev.dev, > + "user_pbl_created: user_pages[%u], continuous[%u]\n", > + hp_cnt, pbl->physically_continuous); > + > + return 0; > + > +err_continuous: > + kfree(pbl->pbl_buf); > + return err; > +err_indirect: > + vfree(pbl->pbl_buf); > + return err; > +} > + > +static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl) > +{ > + if (pbl->physically_continuous) { > + dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr, > + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); > + kfree(pbl->pbl_buf); > + } else { > + pbl_indirect_terminate(dev, pbl); > + vfree(pbl->pbl_buf); > + } > +} > + > +static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, > + struct efa_com_reg_mr_params *params) > +{ > + int err; > + > + params->inline_pbl = 1; > + err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, > + params->page_num, params->page_shift); > + if (err) > + return err; > + > + efa_dbg(&dev->ibdev.dev, > + "inline_pbl_array - pages[%u]\n", params->page_num); > + > + return 0; > +} > + > +static int efa_create_pbl(struct efa_dev *dev, > + struct pbl_context *pbl, > + struct efa_mr *mr, > + struct efa_com_reg_mr_params *params) > +{ > + int err; > + > + err = pbl_create(dev, pbl, mr->umem, params->page_num, > + params->page_shift); > + if (err) { > + efa_err(&dev->ibdev.dev, "Failed to create pbl[%d]\n", err); > + return err; > + } > + > + params->inline_pbl = 0; > + params->indirect = !pbl->physically_continuous; > + if (pbl->physically_continuous) { > + params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes; > + > + efa_com_set_dma_addr(pbl->phys.continuous.dma_addr, > + ¶ms->pbl.pbl.address.mem_addr_high, > + ¶ms->pbl.pbl.address.mem_addr_low); > + } else { > + params->pbl.pbl.length = > + pbl->phys.indirect.chunk_list.chunks[0].length; > + > + efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr, > + ¶ms->pbl.pbl.address.mem_addr_high, > + ¶ms->pbl.pbl.address.mem_addr_low); > + } > + > + return 0; > +} > + > +static void efa_cont_pages(struct ib_umem *umem, u64 addr, > + unsigned long max_page_shift, > + int *count, u8 *shift, u32 *ncont) > +{ > + unsigned long page_shift = umem->page_shift; > + struct scatterlist *sg; > + u64 base = ~0, p = 0; > + unsigned long tmp; > + unsigned long m; > + u64 len, pfn; > + int i = 0; > + int entry; > + > + addr = addr >> page_shift; > + tmp = (unsigned long)addr; > + m = find_first_bit(&tmp, BITS_PER_LONG); > + if (max_page_shift) > + m = min_t(unsigned long, max_page_shift - page_shift, m); > + > + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { > + len = sg_dma_len(sg) >> page_shift; > + pfn = sg_dma_address(sg) >> page_shift; > + if (base + p != pfn) { > + /* > + * If either the offset or the new > + * base are unaligned update m > + */ > + tmp = (unsigned long)(pfn | p); > + if (!IS_ALIGNED(tmp, 1 << m)) > + m = find_first_bit(&tmp, BITS_PER_LONG); > + > + base = pfn; > + p = 0; > + } > + > + p += len; > + i += len; > + } > + > + if (i) { > + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); > + *ncont = DIV_ROUND_UP(i, (1 << m)); > + } else { > + m = 0; > + *ncont = 0; > + } > + > + *shift = page_shift + m; > + *count = i; > +} > + > +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, > + u64 virt_addr, int access_flags, > + struct ib_udata *udata) > +{ > + struct efa_dev *dev = to_edev(ibpd->device); > + struct efa_com_reg_mr_params params = {}; > + struct efa_com_reg_mr_result result = {}; > + unsigned long max_page_shift; > + struct pbl_context 
pbl; > + struct efa_mr *mr; > + int inline_size; > + int npages; > + int err; > + > + if (!udata) { > + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); > + err = -EOPNOTSUPP; > + goto err_out; > + } > + > + if (udata->inlen && > + !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { > + efa_err_rl(&dev->ibdev.dev, > + "Incompatible ABI params, udata not cleared\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + if (access_flags & ~EFA_SUPPORTED_ACCESS_FLAGS) { > + efa_err(&dev->ibdev.dev, > + "Unsupported access flags[%#x], supported[%#x]\n", > + access_flags, EFA_SUPPORTED_ACCESS_FLAGS); > + err = -EOPNOTSUPP; > + goto err_out; > + } > + > + mr = kzalloc(sizeof(*mr), GFP_KERNEL); > + if (!mr) { > + err = -ENOMEM; > + goto err_out; > + } > + > + mr->umem = ib_umem_get(udata, start, length, access_flags, 0); > + if (IS_ERR(mr->umem)) { > + err = PTR_ERR(mr->umem); > + efa_err(&dev->ibdev.dev, > + "Failed to pin and map user space memory[%d]\n", err); > + goto err_free; > + } > + > + params.pd = to_epd(ibpd)->pdn; > + params.iova = virt_addr; > + params.mr_length_in_bytes = length; > + params.permissions = access_flags & 0x1; > + max_page_shift = fls64(dev->dev_attr.page_size_cap); > + > + efa_cont_pages(mr->umem, start, max_page_shift, &npages, > + ¶ms.page_shift, ¶ms.page_num); > + efa_dbg(&dev->ibdev.dev, > + "start %#llx length %#llx npages %d params.page_shift %u params.page_num %u\n", > + start, length, npages, params.page_shift, params.page_num); > + > + inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array); > + if (params.page_num <= inline_size) { > + err = efa_create_inline_pbl(dev, mr, ¶ms); > + if (err) > + goto err_unmap; > + > + err = efa_com_register_mr(dev->edev, ¶ms, &result); > + if (err) > + goto err_unmap; > + } else { > + err = efa_create_pbl(dev, &pbl, mr, ¶ms); > + if (err) > + goto err_unmap; > + > + err = efa_com_register_mr(dev->edev, ¶ms, &result); > + pbl_destroy(dev, &pbl); > + > + if (err) > + goto err_unmap; > + } > + > + mr->ibmr.lkey = result.l_key; > + mr->ibmr.rkey = result.r_key; > + mr->ibmr.length = length; > + efa_dbg(&dev->ibdev.dev, "Registered mr[%d]\n", mr->ibmr.lkey); > + > + return &mr->ibmr; > + > +err_unmap: > + ib_umem_release(mr->umem); > +err_free: > + kfree(mr); > +err_out: > + efa_stat_inc(dev, dev->stats.sw_stats.reg_mr_err); > + return ERR_PTR(err); > +} > + > +int efa_dereg_mr(struct ib_mr *ibmr) > +{ > + struct efa_dev *dev = to_edev(ibmr->device); > + struct efa_com_dereg_mr_params params; > + struct efa_mr *mr = to_emr(ibmr); > + > + efa_dbg(&dev->ibdev.dev, "Deregister mr[%d]\n", ibmr->lkey); > + > + if (mr->umem) { > + params.l_key = mr->ibmr.lkey; > + efa_com_dereg_mr(dev->edev, ¶ms); > + ib_umem_release(mr->umem); > + } > + > + kfree(mr); > + > + return 0; > +} > + > +int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num, > + struct ib_port_immutable *immutable) > +{ > + struct ib_port_attr attr; > + int err; > + > + immutable->core_cap_flags = RDMA_CORE_CAP_PROT_EFA; > + > + err = ib_query_port(ibdev, port_num, &attr); > + if (err) { > + efa_err(&ibdev->dev, "Couldn't query port err[%d]\n", err); > + return err; > + } > + > + immutable->pkey_tbl_len = attr.pkey_tbl_len; > + immutable->gid_tbl_len = attr.gid_tbl_len; > + > + return 0; > +} > + > +static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) > +{ > + struct efa_com_dealloc_uar_params params = { > + .uarn = uarn, > + }; > + > + return efa_com_dealloc_uar(dev->edev, ¶ms); > +} > + > +struct ib_ucontext *efa_alloc_ucontext(struct ib_device *ibdev, > + 
struct ib_udata *udata) > +{ > + struct efa_ibv_alloc_ucontext_resp resp = {}; > + struct efa_com_alloc_uar_result result; > + struct efa_dev *dev = to_edev(ibdev); > + struct efa_ucontext *ucontext; > + int err; > + > + /* > + * it's fine if the driver does not know all request fields, > + * we will ack input fields in our response. > + */ > + > + ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); > + if (!ucontext) { > + err = -ENOMEM; > + goto err_out; > + } > + > + err = efa_com_alloc_uar(dev->edev, &result); > + if (err) > + goto err_free_ucontext; > + > + ucontext->uarn = result.uarn; > + mutex_init(&ucontext->lock); > + INIT_LIST_HEAD(&ucontext->pending_mmaps); > + > + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; > + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; > + resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; > + resp.inline_buf_size = dev->dev_attr.inline_buf_size; > + resp.max_llq_size = dev->dev_attr.max_llq_size; > + > + if (udata && udata->outlen) { > + err = ib_copy_to_udata(udata, &resp, > + min(sizeof(resp), udata->outlen)); > + if (err) > + goto err_dealloc_uar; > + } > + > + return &ucontext->ibucontext; > + > +err_dealloc_uar: > + efa_dealloc_uar(dev, result.uarn); > +err_free_ucontext: > + kfree(ucontext); > +err_out: > + efa_stat_inc(dev, dev->stats.sw_stats.alloc_ucontext_err); > + return ERR_PTR(err); > +} > + > +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext) > +{ > + struct efa_ucontext *ucontext = to_eucontext(ibucontext); > + struct efa_dev *dev = to_edev(ibucontext->device); > + > + mmap_entries_remove_free(dev, ucontext); > + efa_dealloc_uar(dev, ucontext->uarn); > + kfree(ucontext); > + > + return 0; > +} > + > +static int __efa_mmap(struct efa_dev *dev, > + struct efa_ucontext *ucontext, > + struct vm_area_struct *vma, > + struct efa_mmap_entry *entry) > +{ > + u8 mmap_flag = get_mmap_flag(entry->key); > + u64 pfn = entry->address >> PAGE_SHIFT; > + u64 address = entry->address; > + u64 length = entry->length; > + int err; > + > + efa_dbg(&dev->ibdev.dev, > + "Mapping address[%#llx], length[%#llx], mmap_flag[%d]\n", > + address, length, mmap_flag); > + > + switch (mmap_flag) { > + case EFA_MMAP_IO_NC: > + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length, > + pgprot_noncached(vma->vm_page_prot)); > + break; > + case EFA_MMAP_IO_WC: > + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length, > + pgprot_writecombine(vma->vm_page_prot)); > + break; > + case EFA_MMAP_DMA_PAGE: > + err = rdma_user_mmap_page(&ucontext->ibucontext, vma, > + pfn_to_page(pfn), length); > + break; > + default: > + err = -EINVAL; > + } > + > + if (err) { > + efa_err(&dev->ibdev.dev, > + "Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n", > + address, length, mmap_flag, err); > + return err; > + } > + > + return 0; > +} > + > +int efa_mmap(struct ib_ucontext *ibucontext, > + struct vm_area_struct *vma) > +{ > + struct efa_ucontext *ucontext = to_eucontext(ibucontext); > + struct efa_dev *dev = to_edev(ibucontext->device); > + u64 length = vma->vm_end - vma->vm_start; > + u64 key = vma->vm_pgoff << PAGE_SHIFT; > + struct efa_mmap_entry *entry; > + > + efa_dbg(&dev->ibdev.dev, > + "start 0x%lx, end 0x%lx, length = 0x%llx, key = 0x%llx\n", > + vma->vm_start, vma->vm_end, length, key); > + > + if (length % PAGE_SIZE != 0) { > + efa_err(&dev->ibdev.dev, > + "length[0x%llX] is not page size aligned[0x%lX]\n", > + length, PAGE_SIZE); > + return -EINVAL; > + } > + > + entry = mmap_entry_get(dev, 
ucontext, key, length); > + if (!entry) { > + efa_err(&dev->ibdev.dev, > + "key[0x%llX] does not have valid entry\n", key); > + return -EINVAL; > + } > + > + return __efa_mmap(dev, ucontext, vma, entry); > +} > + > +static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) > +{ > + struct efa_com_destroy_ah_params params = { > + .ah = ah->ah, > + .pdn = to_epd(ah->ibah.pd)->pdn, > + }; > + > + return efa_com_destroy_ah(dev->edev, ¶ms); > +} > + > +struct ib_ah *efa_create_ah(struct ib_pd *ibpd, > + struct rdma_ah_attr *ah_attr, > + u32 flags, > + struct ib_udata *udata) > +{ > + struct efa_dev *dev = to_edev(ibpd->device); > + struct efa_com_create_ah_params params = {}; > + struct efa_ibv_create_ah_resp resp = {}; > + struct efa_com_create_ah_result result; > + struct efa_ah *ah; > + int err; > + > + if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) { > + efa_warn(&dev->ibdev.dev, > + "Create address handle is not supported in atomic context\n"); > + err = -EOPNOTSUPP; > + goto err_out; > + } > + > + if (!udata) { > + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); > + err = -EOPNOTSUPP; > + goto err_out; > + } > + > + if (udata->inlen && > + !ib_is_udata_cleared(udata, 0, udata->inlen)) { > + efa_err_rl(&dev->ibdev.dev, > + "Incompatiable ABI params\n"); > + err = -EINVAL; > + goto err_out; > + } > + > + ah = kzalloc(sizeof(*ah), GFP_KERNEL); > + if (!ah) { > + err = -ENOMEM; > + goto err_out; > + } > + > + memcpy(params.dest_addr, ah_attr->grh.dgid.raw, > + sizeof(params.dest_addr)); > + params.pdn = to_epd(ibpd)->pdn; > + err = efa_com_create_ah(dev->edev, ¶ms, &result); > + if (err) > + goto err_free; > + > + memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id)); > + ah->ah = result.ah; > + > + resp.efa_address_handle = result.ah; > + > + if (udata->outlen) { > + err = ib_copy_to_udata(udata, &resp, > + min(sizeof(resp), udata->outlen)); > + if (err) { > + efa_err_rl(&dev->ibdev.dev, > + "Failed to copy udata for create_ah response\n"); > + goto err_destroy_ah; > + } > + } > + efa_dbg(&dev->ibdev.dev, "Created ah[%d]\n", ah->ah); > + > + return &ah->ibah; > + > +err_destroy_ah: > + efa_ah_destroy(dev, ah); > +err_free: > + kfree(ah); > +err_out: > + efa_stat_inc(dev, dev->stats.sw_stats.create_ah_err); > + return ERR_PTR(err); > +} > + > +int efa_destroy_ah(struct ib_ah *ibah, u32 flags) > +{ > + struct efa_dev *dev = to_edev(ibah->pd->device); > + struct efa_ah *ah = to_eah(ibah); > + > + efa_dbg(&dev->ibdev.dev, "Destroy ah[%d]\n", ah->ah); > + > + if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) { > + efa_warn(&dev->ibdev.dev, > + "Destroy address handle is not supported in atomic context\n"); > + return -EOPNOTSUPP; > + } > + > + efa_ah_destroy(dev, ah); > + > + kfree(ah); > + return 0; > +} > + > +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, > + u8 port_num) > +{ > + return IB_LINK_LAYER_UNSPECIFIED; > +} > + Reviewed-by: Steve Wise <swise@opengridcomputing.com>
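
For readers skimming the thread, Steve's memset() comment above amounts to the following shape for efa_query_port(): keep only the non-zero fields and let the memset() cover the rest. This is a sketch assembled from the quoted patch (it assumes the same includes and the to_edev() helper from efa_verbs.c above), not the code as finally submitted.

```c
/* Sketch only: efa_query_port() from the quoted patch with the fields
 * that memset() already zeroes (lid, sm_lid, sm_sl, port_cap_flags,
 * bad_pkey_cntr, qkey_viol_cntr) removed, per the review comment.
 */
int efa_query_port(struct ib_device *ibdev, u8 port,
		   struct ib_port_attr *props)
{
	struct efa_dev *dev = to_edev(ibdev);

	/* Zero everything once; only non-zero attributes are set below. */
	memset(props, 0, sizeof(*props));

	props->lmc = 1;
	props->state = IB_PORT_ACTIVE;
	props->phys_state = 5;
	props->gid_tbl_len = 1;
	props->pkey_tbl_len = 1;
	props->active_speed = IB_SPEED_EDR;
	props->active_width = IB_WIDTH_4X;
	props->max_mtu = ib_mtu_int_to_enum(dev->mtu);
	props->active_mtu = ib_mtu_int_to_enum(dev->mtu);
	props->max_msg_sz = dev->mtu;
	props->max_vl_num = 1;

	return 0;
}
```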
On 26-Feb-19 23:43, Steve Wise wrote: >> +int efa_query_port(struct ib_device *ibdev, u8 port, >> + struct ib_port_attr *props) >> +{ >> + struct efa_dev *dev = to_edev(ibdev); >> + >> + memset(props, 0, sizeof(*props)); >> + >> + props->lid = 0; >> + props->lmc = 1; >> + props->sm_lid = 0; >> + props->sm_sl = 0; >> + >> + props->state = IB_PORT_ACTIVE; >> + props->phys_state = 5; >> + props->port_cap_flags = 0; >> + props->gid_tbl_len = 1; >> + props->pkey_tbl_len = 1; >> + props->bad_pkey_cntr = 0; >> + props->qkey_viol_cntr = 0; >> + props->active_speed = IB_SPEED_EDR; >> + props->active_width = IB_WIDTH_4X; >> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); >> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); >> + props->max_msg_sz = dev->mtu; >> + props->max_vl_num = 1; >> + > > > Since you memset() props to all zeros, should you bother with > initializing the zero fields? Will remove. > > >> + return 0; >> +} >> + >> +static int efa_qp_validate_cap(struct efa_dev *dev, >> + struct ib_qp_init_attr *init_attr) >> +{ >> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { >> + efa_err(&dev->ibdev.dev, >> + "qp: requested send wr[%u] exceeds the max[%u]\n", >> + init_attr->cap.max_send_wr, >> + dev->dev_attr.max_sq_depth); >> + return -EINVAL; >> + } >> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { >> + efa_err(&dev->ibdev.dev, >> + "qp: requested receive wr[%u] exceeds the max[%u]\n", >> + init_attr->cap.max_recv_wr, >> + dev->dev_attr.max_rq_depth); >> + return -EINVAL; >> + } >> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { >> + efa_err(&dev->ibdev.dev, >> + "qp: requested sge send[%u] exceeds the max[%u]\n", >> + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); >> + return -EINVAL; >> + } >> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { >> + efa_err(&dev->ibdev.dev, >> + "qp: requested sge recv[%u] exceeds the max[%u]\n", >> + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); >> + return -EINVAL; >> + } >> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { >> + efa_err(&dev->ibdev.dev, >> + "requested inline data[%u] exceeds the max[%u]\n", >> + init_attr->cap.max_inline_data, >> + dev->dev_attr.inline_buf_size); >> + return -EINVAL; >> + } >> + > > > Should all these efa_err() calls really be efa_dbg()s? That's a lot of > log polluting for user errors. Most users don't really enable debug and we want them to have an indication of what happened. is efa_warn() better? > > >> + return 0; >> +} >> + >> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) >> +{ >> + struct scatterlist *sglist; >> + struct page *pg; >> + int i; >> + >> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); >> + if (!sglist) >> + return NULL; >> + sg_init_table(sglist, page_cnt); >> + for (i = 0; i < page_cnt; i++) { >> + pg = vmalloc_to_page(buf); >> + if (!pg) >> + goto err; >> + WARN_ON_ONCE(PageHighMem(pg)); > > Is this WARN_ON_ONCE() really an error that needs to be handled? AFAIK, there is no way we can actually get a higemem page here. The WARN is here from early dev days, it should probably be removed. > > >> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); >> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); >> + } >> + return sglist; >> + >> +err: >> + kfree(sglist); >> + return NULL; >> +} >> + > > > Reviewed-by: Steve Wise <swise@opengridcomputing.com> Thanks Steve!
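
On the log-pollution question, one middle ground between efa_err() and efa_dbg() that the stock kernel already provides is rate-limited printing. The sketch below rewrites a single efa_qp_validate_cap() check with dev_warn_ratelimited(); it assumes the struct efa_dev layout from the quoted patch and is only an illustration of the trade-off being discussed, not the driver's actual helper set (efa.h, which defines the efa_* wrappers, is not part of this patch).

```c
#include <linux/device.h>	/* dev_warn_ratelimited() */
#include <rdma/ib_verbs.h>	/* struct ib_qp_init_attr */

#include "efa.h"		/* struct efa_dev (from this series) */

/* Illustration only: one representative cap check, rate limited so that
 * a misbehaving application cannot flood dmesg while the admin still
 * gets an indication of what went wrong.
 */
static int efa_qp_validate_cap(struct efa_dev *dev,
			       struct ib_qp_init_attr *init_attr)
{
	if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) {
		dev_warn_ratelimited(&dev->ibdev.dev,
				     "qp: requested send wr[%u] exceeds the max[%u]\n",
				     init_attr->cap.max_send_wr,
				     dev->dev_attr.max_sq_depth);
		return -EINVAL;
	}

	/* ... the remaining checks would follow the same pattern ... */

	return 0;
}
```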
On Wed, Feb 27, 2019 at 10:39:30AM +0200, Gal Pressman wrote: > On 26-Feb-19 23:43, Steve Wise wrote: > >> +int efa_query_port(struct ib_device *ibdev, u8 port, > >> + struct ib_port_attr *props) > >> +{ > >> + struct efa_dev *dev = to_edev(ibdev); > >> + > >> + memset(props, 0, sizeof(*props)); > >> + > >> + props->lid = 0; > >> + props->lmc = 1; > >> + props->sm_lid = 0; > >> + props->sm_sl = 0; > >> + > >> + props->state = IB_PORT_ACTIVE; > >> + props->phys_state = 5; > >> + props->port_cap_flags = 0; > >> + props->gid_tbl_len = 1; > >> + props->pkey_tbl_len = 1; > >> + props->bad_pkey_cntr = 0; > >> + props->qkey_viol_cntr = 0; > >> + props->active_speed = IB_SPEED_EDR; > >> + props->active_width = IB_WIDTH_4X; > >> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); > >> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); > >> + props->max_msg_sz = dev->mtu; > >> + props->max_vl_num = 1; > >> + > > > > > > Since you memset() props to all zeros, should you bother with > > initializing the zero fields? > > Will remove. > > > > > > >> + return 0; > >> +} > >> + > >> +static int efa_qp_validate_cap(struct efa_dev *dev, > >> + struct ib_qp_init_attr *init_attr) > >> +{ > >> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested send wr[%u] exceeds the max[%u]\n", > >> + init_attr->cap.max_send_wr, > >> + dev->dev_attr.max_sq_depth); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested receive wr[%u] exceeds the max[%u]\n", > >> + init_attr->cap.max_recv_wr, > >> + dev->dev_attr.max_rq_depth); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested sge send[%u] exceeds the max[%u]\n", > >> + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested sge recv[%u] exceeds the max[%u]\n", > >> + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { > >> + efa_err(&dev->ibdev.dev, > >> + "requested inline data[%u] exceeds the max[%u]\n", > >> + init_attr->cap.max_inline_data, > >> + dev->dev_attr.inline_buf_size); > >> + return -EINVAL; > >> + } > >> + > > > > > > Should all these efa_err() calls really be efa_dbg()s? That's a lot of > > log polluting for user errors. > > Most users don't really enable debug and we want them to have an indication of > what happened. is efa_warn() better? aren't you doing anything that your users would like to avoid - polluting dmesg? > > > > > > >> + return 0; > >> +} > >> + > >> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) > >> +{ > >> + struct scatterlist *sglist; > >> + struct page *pg; > >> + int i; > >> + > >> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); > >> + if (!sglist) > >> + return NULL; > >> + sg_init_table(sglist, page_cnt); > >> + for (i = 0; i < page_cnt; i++) { > >> + pg = vmalloc_to_page(buf); > >> + if (!pg) > >> + goto err; > >> + WARN_ON_ONCE(PageHighMem(pg)); > > > > Is this WARN_ON_ONCE() really an error that needs to be handled? > > AFAIK, there is no way we can actually get a higemem page here. > The WARN is here from early dev days, it should probably be removed. 
> > > > > > >> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); > >> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? > >> + } > >> + return sglist; > >> + > >> +err: > >> + kfree(sglist); > >> + return NULL; > >> +} > >> + > > > > > > Reviewed-by: Steve Wise <swise@opengridcomputing.com> > > Thanks Steve!
On 27-Feb-19 10:45, Leon Romanovsky wrote: > On Wed, Feb 27, 2019 at 10:39:30AM +0200, Gal Pressman wrote: >> On 26-Feb-19 23:43, Steve Wise wrote: >>>> +int efa_query_port(struct ib_device *ibdev, u8 port, >>>> + struct ib_port_attr *props) >>>> +{ >>>> + struct efa_dev *dev = to_edev(ibdev); >>>> + >>>> + memset(props, 0, sizeof(*props)); >>>> + >>>> + props->lid = 0; >>>> + props->lmc = 1; >>>> + props->sm_lid = 0; >>>> + props->sm_sl = 0; >>>> + >>>> + props->state = IB_PORT_ACTIVE; >>>> + props->phys_state = 5; >>>> + props->port_cap_flags = 0; >>>> + props->gid_tbl_len = 1; >>>> + props->pkey_tbl_len = 1; >>>> + props->bad_pkey_cntr = 0; >>>> + props->qkey_viol_cntr = 0; >>>> + props->active_speed = IB_SPEED_EDR; >>>> + props->active_width = IB_WIDTH_4X; >>>> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); >>>> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); >>>> + props->max_msg_sz = dev->mtu; >>>> + props->max_vl_num = 1; >>>> + >>> >>> >>> Since you memset() props to all zeros, should you bother with >>> initializing the zero fields? >> >> Will remove. >> >>> >>> >>>> + return 0; >>>> +} >>>> + >>>> +static int efa_qp_validate_cap(struct efa_dev *dev, >>>> + struct ib_qp_init_attr *init_attr) >>>> +{ >>>> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested send wr[%u] exceeds the max[%u]\n", >>>> + init_attr->cap.max_send_wr, >>>> + dev->dev_attr.max_sq_depth); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested receive wr[%u] exceeds the max[%u]\n", >>>> + init_attr->cap.max_recv_wr, >>>> + dev->dev_attr.max_rq_depth); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested sge send[%u] exceeds the max[%u]\n", >>>> + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested sge recv[%u] exceeds the max[%u]\n", >>>> + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "requested inline data[%u] exceeds the max[%u]\n", >>>> + init_attr->cap.max_inline_data, >>>> + dev->dev_attr.inline_buf_size); >>>> + return -EINVAL; >>>> + } >>>> + >>> >>> >>> Should all these efa_err() calls really be efa_dbg()s? That's a lot of >>> log polluting for user errors. >> >> Most users don't really enable debug and we want them to have an indication of >> what happened. is efa_warn() better? > > aren't you doing anything that your users would like to avoid - polluting dmesg? We haven't seen a case where it polluted dmesg. It's one error print of invalid parameter (which is very unlikely as our provider checks for this as well) and most applications will exit at this point if create QP failed so there shouldn't be any more prints. 
> >> >>> >>> >>>> + return 0; >>>> +} >>>> + >>>> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) >>>> +{ >>>> + struct scatterlist *sglist; >>>> + struct page *pg; >>>> + int i; >>>> + >>>> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); >>>> + if (!sglist) >>>> + return NULL; >>>> + sg_init_table(sglist, page_cnt); >>>> + for (i = 0; i < page_cnt; i++) { >>>> + pg = vmalloc_to_page(buf); >>>> + if (!pg) >>>> + goto err; >>>> + WARN_ON_ONCE(PageHighMem(pg)); >>> >>> Is this WARN_ON_ONCE() really an error that needs to be handled? >> >> AFAIK, there is no way we can actually get a higemem page here. >> The WARN is here from early dev days, it should probably be removed. >> >>> >>> >>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); >>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > > Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? EFA_PAGE_SIZE represents the device page size. > >>>> + } >>>> + return sglist; >>>> + >>>> +err: >>>> + kfree(sglist); >>>> + return NULL; >>>> +} >>>> + >>> >>> >>> Reviewed-by: Steve Wise <swise@opengridcomputing.com> >> >> Thanks Steve!
On Wed, Feb 27, 2019 at 11:06:14AM +0200, Gal Pressman wrote: > On 27-Feb-19 10:45, Leon Romanovsky wrote: > > On Wed, Feb 27, 2019 at 10:39:30AM +0200, Gal Pressman wrote: > >> On 26-Feb-19 23:43, Steve Wise wrote: > >>>> +int efa_query_port(struct ib_device *ibdev, u8 port, > >>>> + struct ib_port_attr *props) > >>>> +{ > >>>> + struct efa_dev *dev = to_edev(ibdev); > >>>> + > >>>> + memset(props, 0, sizeof(*props)); > >>>> + > >>>> + props->lid = 0; > >>>> + props->lmc = 1; > >>>> + props->sm_lid = 0; > >>>> + props->sm_sl = 0; > >>>> + > >>>> + props->state = IB_PORT_ACTIVE; > >>>> + props->phys_state = 5; > >>>> + props->port_cap_flags = 0; > >>>> + props->gid_tbl_len = 1; > >>>> + props->pkey_tbl_len = 1; > >>>> + props->bad_pkey_cntr = 0; > >>>> + props->qkey_viol_cntr = 0; > >>>> + props->active_speed = IB_SPEED_EDR; > >>>> + props->active_width = IB_WIDTH_4X; > >>>> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); > >>>> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); > >>>> + props->max_msg_sz = dev->mtu; > >>>> + props->max_vl_num = 1; > >>>> + > >>> > >>> > >>> Since you memset() props to all zeros, should you bother with > >>> initializing the zero fields? > >> > >> Will remove. > >> > >>> > >>> > >>>> + return 0; > >>>> +} > >>>> + > >>>> +static int efa_qp_validate_cap(struct efa_dev *dev, > >>>> + struct ib_qp_init_attr *init_attr) > >>>> +{ > >>>> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { > >>>> + efa_err(&dev->ibdev.dev, > >>>> + "qp: requested send wr[%u] exceeds the max[%u]\n", > >>>> + init_attr->cap.max_send_wr, > >>>> + dev->dev_attr.max_sq_depth); > >>>> + return -EINVAL; > >>>> + } > >>>> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { > >>>> + efa_err(&dev->ibdev.dev, > >>>> + "qp: requested receive wr[%u] exceeds the max[%u]\n", > >>>> + init_attr->cap.max_recv_wr, > >>>> + dev->dev_attr.max_rq_depth); > >>>> + return -EINVAL; > >>>> + } > >>>> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { > >>>> + efa_err(&dev->ibdev.dev, > >>>> + "qp: requested sge send[%u] exceeds the max[%u]\n", > >>>> + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); > >>>> + return -EINVAL; > >>>> + } > >>>> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { > >>>> + efa_err(&dev->ibdev.dev, > >>>> + "qp: requested sge recv[%u] exceeds the max[%u]\n", > >>>> + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); > >>>> + return -EINVAL; > >>>> + } > >>>> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { > >>>> + efa_err(&dev->ibdev.dev, > >>>> + "requested inline data[%u] exceeds the max[%u]\n", > >>>> + init_attr->cap.max_inline_data, > >>>> + dev->dev_attr.inline_buf_size); > >>>> + return -EINVAL; > >>>> + } > >>>> + > >>> > >>> > >>> Should all these efa_err() calls really be efa_dbg()s? That's a lot of > >>> log polluting for user errors. > >> > >> Most users don't really enable debug and we want them to have an indication of > >> what happened. is efa_warn() better? > > > > aren't you doing anything that your users would like to avoid - polluting dmesg? > > We haven't seen a case where it polluted dmesg. > It's one error print of invalid parameter (which is very unlikely as our > provider checks for this as well) and most applications will exit at this point > if create QP failed so there shouldn't be any more prints. 
> > > > >> > >>> > >>> > >>>> + return 0; > >>>> +} > >>>> + > >>>> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) > >>>> +{ > >>>> + struct scatterlist *sglist; > >>>> + struct page *pg; > >>>> + int i; > >>>> + > >>>> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); > >>>> + if (!sglist) > >>>> + return NULL; > >>>> + sg_init_table(sglist, page_cnt); > >>>> + for (i = 0; i < page_cnt; i++) { > >>>> + pg = vmalloc_to_page(buf); > >>>> + if (!pg) > >>>> + goto err; > >>>> + WARN_ON_ONCE(PageHighMem(pg)); > >>> > >>> Is this WARN_ON_ONCE() really an error that needs to be handled? > >> > >> AFAIK, there is no way we can actually get a higemem page here. > >> The WARN is here from early dev days, it should probably be removed. > >> > >>> > >>> > >>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); > >>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > > > > Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? > > EFA_PAGE_SIZE represents the device page size. So why don't you do: u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_PAGE_SIZE); instead of doing it outside of efa_vmalloc_buf_to_sg()? > > > > >>>> + } > >>>> + return sglist; > >>>> + > >>>> +err: > >>>> + kfree(sglist); > >>>> + return NULL; > >>>> +} > >>>> + > >>> > >>> > >>> Reviewed-by: Steve Wise <swise@opengridcomputing.com> > >> > >> Thanks Steve!
On 27-Feb-19 11:37, Leon Romanovsky wrote: > On Wed, Feb 27, 2019 at 11:06:14AM +0200, Gal Pressman wrote: >> On 27-Feb-19 10:45, Leon Romanovsky wrote: >>> On Wed, Feb 27, 2019 at 10:39:30AM +0200, Gal Pressman wrote: >>>> On 26-Feb-19 23:43, Steve Wise wrote: >>>>>> +int efa_query_port(struct ib_device *ibdev, u8 port, >>>>>> + struct ib_port_attr *props) >>>>>> +{ >>>>>> + struct efa_dev *dev = to_edev(ibdev); >>>>>> + >>>>>> + memset(props, 0, sizeof(*props)); >>>>>> + >>>>>> + props->lid = 0; >>>>>> + props->lmc = 1; >>>>>> + props->sm_lid = 0; >>>>>> + props->sm_sl = 0; >>>>>> + >>>>>> + props->state = IB_PORT_ACTIVE; >>>>>> + props->phys_state = 5; >>>>>> + props->port_cap_flags = 0; >>>>>> + props->gid_tbl_len = 1; >>>>>> + props->pkey_tbl_len = 1; >>>>>> + props->bad_pkey_cntr = 0; >>>>>> + props->qkey_viol_cntr = 0; >>>>>> + props->active_speed = IB_SPEED_EDR; >>>>>> + props->active_width = IB_WIDTH_4X; >>>>>> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); >>>>>> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); >>>>>> + props->max_msg_sz = dev->mtu; >>>>>> + props->max_vl_num = 1; >>>>>> + >>>>> >>>>> >>>>> Since you memset() props to all zeros, should you bother with >>>>> initializing the zero fields? >>>> >>>> Will remove. >>>> >>>>> >>>>> >>>>>> + return 0; >>>>>> +} >>>>>> + >>>>>> +static int efa_qp_validate_cap(struct efa_dev *dev, >>>>>> + struct ib_qp_init_attr *init_attr) >>>>>> +{ >>>>>> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { >>>>>> + efa_err(&dev->ibdev.dev, >>>>>> + "qp: requested send wr[%u] exceeds the max[%u]\n", >>>>>> + init_attr->cap.max_send_wr, >>>>>> + dev->dev_attr.max_sq_depth); >>>>>> + return -EINVAL; >>>>>> + } >>>>>> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { >>>>>> + efa_err(&dev->ibdev.dev, >>>>>> + "qp: requested receive wr[%u] exceeds the max[%u]\n", >>>>>> + init_attr->cap.max_recv_wr, >>>>>> + dev->dev_attr.max_rq_depth); >>>>>> + return -EINVAL; >>>>>> + } >>>>>> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { >>>>>> + efa_err(&dev->ibdev.dev, >>>>>> + "qp: requested sge send[%u] exceeds the max[%u]\n", >>>>>> + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); >>>>>> + return -EINVAL; >>>>>> + } >>>>>> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { >>>>>> + efa_err(&dev->ibdev.dev, >>>>>> + "qp: requested sge recv[%u] exceeds the max[%u]\n", >>>>>> + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); >>>>>> + return -EINVAL; >>>>>> + } >>>>>> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { >>>>>> + efa_err(&dev->ibdev.dev, >>>>>> + "requested inline data[%u] exceeds the max[%u]\n", >>>>>> + init_attr->cap.max_inline_data, >>>>>> + dev->dev_attr.inline_buf_size); >>>>>> + return -EINVAL; >>>>>> + } >>>>>> + >>>>> >>>>> >>>>> Should all these efa_err() calls really be efa_dbg()s? That's a lot of >>>>> log polluting for user errors. >>>> >>>> Most users don't really enable debug and we want them to have an indication of >>>> what happened. is efa_warn() better? >>> >>> aren't you doing anything that your users would like to avoid - polluting dmesg? >> >> We haven't seen a case where it polluted dmesg. >> It's one error print of invalid parameter (which is very unlikely as our >> provider checks for this as well) and most applications will exit at this point >> if create QP failed so there shouldn't be any more prints. 
>> >>> >>>> >>>>> >>>>> >>>>>> + return 0; >>>>>> +} >>>>>> + >>>>>> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) >>>>>> +{ >>>>>> + struct scatterlist *sglist; >>>>>> + struct page *pg; >>>>>> + int i; >>>>>> + >>>>>> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); >>>>>> + if (!sglist) >>>>>> + return NULL; >>>>>> + sg_init_table(sglist, page_cnt); >>>>>> + for (i = 0; i < page_cnt; i++) { >>>>>> + pg = vmalloc_to_page(buf); >>>>>> + if (!pg) >>>>>> + goto err; >>>>>> + WARN_ON_ONCE(PageHighMem(pg)); >>>>> >>>>> Is this WARN_ON_ONCE() really an error that needs to be handled? >>>> >>>> AFAIK, there is no way we can actually get a higemem page here. >>>> The WARN is here from early dev days, it should probably be removed. >>>> >>>>> >>>>> >>>>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); >>>>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); >>> >>> Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? >> >> EFA_PAGE_SIZE represents the device page size. > > So why don't you do: > u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_PAGE_SIZE); > > instead of doing it outside of efa_vmalloc_buf_to_sg()? I'm using 'size_in_pages' both in pbl_indirect_initialize and in efa_vmalloc_buf_to_sg so it's calculated once in the outer function. Do you suggest to make the calculation twice? > > >> >>> >>>>>> + } >>>>>> + return sglist; >>>>>> + >>>>>> +err: >>>>>> + kfree(sglist); >>>>>> + return NULL; >>>>>> +} >>>>>> + >>>>> >>>>> >>>>> Reviewed-by: Steve Wise <swise@opengridcomputing.com> >>>> >>>> Thanks Steve!
On Wed, Feb 27, 2019 at 01:39:25PM +0200, Gal Pressman wrote: > On 27-Feb-19 11:37, Leon Romanovsky wrote: > > On Wed, Feb 27, 2019 at 11:06:14AM +0200, Gal Pressman wrote: > >> On 27-Feb-19 10:45, Leon Romanovsky wrote: > >>> On Wed, Feb 27, 2019 at 10:39:30AM +0200, Gal Pressman wrote: > >>>> On 26-Feb-19 23:43, Steve Wise wrote: > >>>>>> +int efa_query_port(struct ib_device *ibdev, u8 port, > >>>>>> + struct ib_port_attr *props) > >>>>>> +{ > >>>>>> + struct efa_dev *dev = to_edev(ibdev); > >>>>>> + > >>>>>> + memset(props, 0, sizeof(*props)); > >>>>>> + > >>>>>> + props->lid = 0; > >>>>>> + props->lmc = 1; > >>>>>> + props->sm_lid = 0; > >>>>>> + props->sm_sl = 0; > >>>>>> + > >>>>>> + props->state = IB_PORT_ACTIVE; > >>>>>> + props->phys_state = 5; > >>>>>> + props->port_cap_flags = 0; > >>>>>> + props->gid_tbl_len = 1; > >>>>>> + props->pkey_tbl_len = 1; > >>>>>> + props->bad_pkey_cntr = 0; > >>>>>> + props->qkey_viol_cntr = 0; > >>>>>> + props->active_speed = IB_SPEED_EDR; > >>>>>> + props->active_width = IB_WIDTH_4X; > >>>>>> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); > >>>>>> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); > >>>>>> + props->max_msg_sz = dev->mtu; > >>>>>> + props->max_vl_num = 1; > >>>>>> + > >>>>> > >>>>> > >>>>> Since you memset() props to all zeros, should you bother with > >>>>> initializing the zero fields? > >>>> > >>>> Will remove. > >>>> > >>>>> > >>>>> > >>>>>> + return 0; > >>>>>> +} > >>>>>> + > >>>>>> +static int efa_qp_validate_cap(struct efa_dev *dev, > >>>>>> + struct ib_qp_init_attr *init_attr) > >>>>>> +{ > >>>>>> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { > >>>>>> + efa_err(&dev->ibdev.dev, > >>>>>> + "qp: requested send wr[%u] exceeds the max[%u]\n", > >>>>>> + init_attr->cap.max_send_wr, > >>>>>> + dev->dev_attr.max_sq_depth); > >>>>>> + return -EINVAL; > >>>>>> + } > >>>>>> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { > >>>>>> + efa_err(&dev->ibdev.dev, > >>>>>> + "qp: requested receive wr[%u] exceeds the max[%u]\n", > >>>>>> + init_attr->cap.max_recv_wr, > >>>>>> + dev->dev_attr.max_rq_depth); > >>>>>> + return -EINVAL; > >>>>>> + } > >>>>>> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { > >>>>>> + efa_err(&dev->ibdev.dev, > >>>>>> + "qp: requested sge send[%u] exceeds the max[%u]\n", > >>>>>> + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); > >>>>>> + return -EINVAL; > >>>>>> + } > >>>>>> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { > >>>>>> + efa_err(&dev->ibdev.dev, > >>>>>> + "qp: requested sge recv[%u] exceeds the max[%u]\n", > >>>>>> + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); > >>>>>> + return -EINVAL; > >>>>>> + } > >>>>>> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { > >>>>>> + efa_err(&dev->ibdev.dev, > >>>>>> + "requested inline data[%u] exceeds the max[%u]\n", > >>>>>> + init_attr->cap.max_inline_data, > >>>>>> + dev->dev_attr.inline_buf_size); > >>>>>> + return -EINVAL; > >>>>>> + } > >>>>>> + > >>>>> > >>>>> > >>>>> Should all these efa_err() calls really be efa_dbg()s? That's a lot of > >>>>> log polluting for user errors. > >>>> > >>>> Most users don't really enable debug and we want them to have an indication of > >>>> what happened. is efa_warn() better? > >>> > >>> aren't you doing anything that your users would like to avoid - polluting dmesg? > >> > >> We haven't seen a case where it polluted dmesg. 
> >> It's one error print of invalid parameter (which is very unlikely as our > >> provider checks for this as well) and most applications will exit at this point > >> if create QP failed so there shouldn't be any more prints. > >> > >>> > >>>> > >>>>> > >>>>> > >>>>>> + return 0; > >>>>>> +} > >>>>>> + > >>>>>> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) > >>>>>> +{ > >>>>>> + struct scatterlist *sglist; > >>>>>> + struct page *pg; > >>>>>> + int i; > >>>>>> + > >>>>>> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); > >>>>>> + if (!sglist) > >>>>>> + return NULL; > >>>>>> + sg_init_table(sglist, page_cnt); > >>>>>> + for (i = 0; i < page_cnt; i++) { > >>>>>> + pg = vmalloc_to_page(buf); > >>>>>> + if (!pg) > >>>>>> + goto err; > >>>>>> + WARN_ON_ONCE(PageHighMem(pg)); > >>>>> > >>>>> Is this WARN_ON_ONCE() really an error that needs to be handled? > >>>> > >>>> AFAIK, there is no way we can actually get a higemem page here. > >>>> The WARN is here from early dev days, it should probably be removed. > >>>> > >>>>> > >>>>> > >>>>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); > >>>>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > >>> > >>> Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? > >> > >> EFA_PAGE_SIZE represents the device page size. > > > > So why don't you do: > > u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_PAGE_SIZE); > > > > instead of doing it outside of efa_vmalloc_buf_to_sg()? > > I'm using 'size_in_pages' both in pbl_indirect_initialize and in > efa_vmalloc_buf_to_sg so it's calculated once in the outer function. Do you > suggest to make the calculation twice? I see it now, Thanks > > > > > > >> > >>> > >>>>>> + } > >>>>>> + return sglist; > >>>>>> + > >>>>>> +err: > >>>>>> + kfree(sglist); > >>>>>> + return NULL; > >>>>>> +} > >>>>>> + > >>>>> > >>>>> > >>>>> Reviewed-by: Steve Wise <swise@opengridcomputing.com> > >>>> > >>>> Thanks Steve!
> -----Original Message----- > From: Gal Pressman <galpress@amazon.com> > Sent: Wednesday, February 27, 2019 2:40 AM > To: Steve Wise <swise@opengridcomputing.com>; Jason Gunthorpe > <jgg@ziepe.ca>; Doug Ledford <dledford@redhat.com> > Cc: Yossi Leybovich <sleybo@amazon.com>; Alexander Matushevsky > <matua@amazon.com>; Leah Shalev <shalevl@amazon.com>; Dave Goodell > <goodell@amazon.com>; Brian Barrett <bbarrett@amazon.com>; linux- > rdma@vger.kernel.org; Sean Hefty <sean.hefty@intel.com>; Dennis > Dalessandro <dennis.dalessandro@intel.com>; Leon Romanovsky > <leon@kernel.org>; Christoph Hellwig <hch@infradead.org>; Parav Pandit > <parav@mellanox.com>; Sagi Grimberg <sagi@grimberg.me> > Subject: Re: [PATCH rdma-next v2 09/11] RDMA/efa: Add EFA verbs > implementation > > On 26-Feb-19 23:43, Steve Wise wrote: > >> +int efa_query_port(struct ib_device *ibdev, u8 port, > >> + struct ib_port_attr *props) > >> +{ > >> + struct efa_dev *dev = to_edev(ibdev); > >> + > >> + memset(props, 0, sizeof(*props)); > >> + > >> + props->lid = 0; > >> + props->lmc = 1; > >> + props->sm_lid = 0; > >> + props->sm_sl = 0; > >> + > >> + props->state = IB_PORT_ACTIVE; > >> + props->phys_state = 5; > >> + props->port_cap_flags = 0; > >> + props->gid_tbl_len = 1; > >> + props->pkey_tbl_len = 1; > >> + props->bad_pkey_cntr = 0; > >> + props->qkey_viol_cntr = 0; > >> + props->active_speed = IB_SPEED_EDR; > >> + props->active_width = IB_WIDTH_4X; > >> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); > >> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); > >> + props->max_msg_sz = dev->mtu; > >> + props->max_vl_num = 1; > >> + > > > > > > Since you memset() props to all zeros, should you bother with > > initializing the zero fields? > > Will remove. > > > > > > >> + return 0; > >> +} > >> + > >> +static int efa_qp_validate_cap(struct efa_dev *dev, > >> + struct ib_qp_init_attr *init_attr) > >> +{ > >> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested send wr[%u] exceeds the > max[%u]\n", > >> + init_attr->cap.max_send_wr, > >> + dev->dev_attr.max_sq_depth); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested receive wr[%u] exceeds the > max[%u]\n", > >> + init_attr->cap.max_recv_wr, > >> + dev->dev_attr.max_rq_depth); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested sge send[%u] exceeds the > max[%u]\n", > >> + init_attr->cap.max_send_sge, dev- > >dev_attr.max_sq_sge); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { > >> + efa_err(&dev->ibdev.dev, > >> + "qp: requested sge recv[%u] exceeds the > max[%u]\n", > >> + init_attr->cap.max_recv_sge, dev- > >dev_attr.max_rq_sge); > >> + return -EINVAL; > >> + } > >> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { > >> + efa_err(&dev->ibdev.dev, > >> + "requested inline data[%u] exceeds the max[%u]\n", > >> + init_attr->cap.max_inline_data, > >> + dev->dev_attr.inline_buf_size); > >> + return -EINVAL; > >> + } > >> + > > > > > > Should all these efa_err() calls really be efa_dbg()s? That's a lot of > > log polluting for user errors. > > Most users don't really enable debug and we want them to have an > indication of > what happened. is efa_warn() better? I don't know what the preference is for *warn vs *err vs *info. 
But I think in general, these sorts of errors aren't logged except for debug purposes. > > > > > > >> + return 0; > >> +} > >> + > >> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) > >> +{ > >> + struct scatterlist *sglist; > >> + struct page *pg; > >> + int i; > >> + > >> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); > >> + if (!sglist) > >> + return NULL; > >> + sg_init_table(sglist, page_cnt); > >> + for (i = 0; i < page_cnt; i++) { > >> + pg = vmalloc_to_page(buf); > >> + if (!pg) > >> + goto err; > >> + WARN_ON_ONCE(PageHighMem(pg)); > > > > Is this WARN_ON_ONCE() really an error that needs to be handled? > > AFAIK, there is no way we can actually get a higemem page here. > The WARN is here from early dev days, it should probably be removed. > > > > > > >> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); > >> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > >> + } > >> + return sglist; > >> + > >> +err: > >> + kfree(sglist); > >> + return NULL; > >> +} > >> + > > > > > > Reviewed-by: Steve Wise <swise@opengridcomputing.com> > > Thanks Steve!
>Subject: [PATCH rdma-next v2 09/11] RDMA/efa: Add EFA verbs implementation > >Add a file that implements the EFA verbs. > >Signed-off-by: Gal Pressman <galpress@amazon.com> >--- > drivers/infiniband/hw/efa/efa_verbs.c | 1891 >+++++++++++++++++++++++++++++++++ > 1 file changed, 1891 insertions(+) > create mode 100644 drivers/infiniband/hw/efa/efa_verbs.c > [...] >+int efa_query_port(struct ib_device *ibdev, u8 port, >+ struct ib_port_attr *props) >+{ >+ struct efa_dev *dev = to_edev(ibdev); >+ >+ memset(props, 0, sizeof(*props)); I don't think you need to explicitly zero initialize this struct. It's already done by core in ib_query_port(). [....] >+struct ib_qp *efa_create_qp(struct ib_pd *ibpd, >+ struct ib_qp_init_attr *init_attr, >+ struct ib_udata *udata) >+{ >+ struct efa_com_create_qp_params create_qp_params = {}; >+ struct efa_com_create_qp_result create_qp_resp; >+ struct efa_dev *dev = to_edev(ibpd->device); >+ struct efa_ibv_create_qp_resp resp = {}; >+ struct efa_ibv_create_qp cmd = {}; >+ struct efa_ucontext *ucontext; >+ struct efa_qp *qp; >+ int err; >+ >+ ucontext = ibpd->uobject ? to_eucontext(ibpd->uobject->context) : >+ NULL; >+ >+ if (!udata) { >+ efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); >+ err = -EOPNOTSUPP; >+ goto err_out; >+ } >+ >+ err = efa_qp_validate_cap(dev, init_attr); >+ if (err) >+ goto err_out; >+ >+ err = efa_qp_validate_attr(dev, init_attr); >+ if (err) >+ goto err_out; >+ >+ if (!field_avail(cmd, driver_qp_type, udata->inlen)) { >+ efa_err_rl(&dev->ibdev.dev, >+ "Incompatible ABI params, no input udata\n"); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ if (udata->inlen > sizeof(cmd) && >+ !ib_is_udata_cleared(udata, sizeof(cmd), >+ udata->inlen - sizeof(cmd))) { >+ efa_err_rl(&dev->ibdev.dev, >+ "Incompatible ABI params, unknown fields in udata\n"); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ err = ib_copy_from_udata(&cmd, udata, >+ min(sizeof(cmd), udata->inlen)); >+ if (err) { >+ efa_err_rl(&dev->ibdev.dev, >+ "Cannot copy udata for create_qp\n"); >+ goto err_out; >+ } >+ >+ if (cmd.comp_mask) { >+ efa_err_rl(&dev->ibdev.dev, >+ "Incompatible ABI params, unknown fields in udata\n"); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ qp = kzalloc(sizeof(*qp), GFP_KERNEL); >+ if (!qp) { >+ err = -ENOMEM; >+ goto err_out; >+ } >+ >+ create_qp_params.uarn = ucontext->uarn; >+ create_qp_params.pd = to_epd(ibpd)->pdn; >+ >+ if (init_attr->qp_type == IB_QPT_UD) { >+ create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; >+ } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { >+ create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; >+ } else { >+ efa_err(&dev->ibdev.dev, >+ "Unsupported qp type %d driver qp type %d\n", >+ init_attr->qp_type, cmd.driver_qp_type); >+ err = -EOPNOTSUPP; >+ goto err_free_qp; >+ } >+ >+ efa_dbg(&dev->ibdev.dev, "Create QP: qp type %d driver qp type >%#x\n", >+ init_attr->qp_type, cmd.driver_qp_type); >+ create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; >+ create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; >+ create_qp_params.sq_depth = init_attr->cap.max_send_wr; >+ create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; >+ >+ create_qp_params.rq_depth = init_attr->cap.max_recv_wr; >+ create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size; >+ qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); >+ if (qp->rq_size) { >+ qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, >+ qp->rq_size, >DMA_TO_DEVICE); >+ if (!qp->rq_cpu_addr) { >+ err = -ENOMEM; >+ goto err_free_qp; >+ } >+ 
>+ efa_dbg(&dev->ibdev.dev, >+ "qp->cpu_addr[%p] allocated: size[%lu], dma[%pad]\n", >+ qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); >+ create_qp_params.rq_base_addr = qp->rq_dma_addr; >+ } >+ >+ memset(&resp, 0, sizeof(resp)); >+ err = efa_com_create_qp(dev->edev, &create_qp_params, >+ &create_qp_resp); >+ if (err) >+ goto err_free_mapped; >+ >+ WARN_ON_ONCE(create_qp_resp.sq_db_offset > dev->db_bar_len); >+ WARN_ON_ONCE(create_qp_resp.rq_db_offset > dev->db_bar_len); >+ WARN_ON_ONCE(create_qp_resp.llq_descriptors_offset > >+ dev->mem_bar_len); I saw many instances of WARN or WARN_ON_ONCE in the driver. Is this what you really want? Perhaps a pr_ or dev_ warn variant? [...] >+static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, >+ int vector, struct ib_ucontext *ibucontext, >+ struct ib_udata *udata) >+{ >+ struct efa_ibv_create_cq_resp resp = {}; >+ struct efa_com_create_cq_params params; >+ struct efa_com_create_cq_result result; >+ struct efa_dev *dev = to_edev(ibdev); >+ struct efa_ibv_create_cq cmd = {}; >+ struct efa_cq *cq; >+ int err; >+ >+ efa_dbg(&ibdev->dev, "create_cq entries %d udata %p\n", entries, >udata); >+ >+ if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { >+ efa_err(&ibdev->dev, >+ "cq: requested entries[%u] non-positive or greater than >max[%u]\n", >+ entries, dev->dev_attr.max_cq_depth); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ if (!udata) { >+ efa_err_rl(&ibdev->dev, "udata is NULL\n"); >+ err = -EOPNOTSUPP; >+ goto err_out; >+ } >+ >+ if (!field_avail(cmd, num_sub_cqs, udata->inlen)) { >+ efa_err_rl(&ibdev->dev, >+ "Incompatible ABI params, no input udata\n"); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ if (udata->inlen > sizeof(cmd) && >+ !ib_is_udata_cleared(udata, sizeof(cmd), >+ udata->inlen - sizeof(cmd))) { >+ efa_err_rl(&ibdev->dev, >+ "Incompatible ABI params, unknown fields in udata\n"); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ err = ib_copy_from_udata(&cmd, udata, >+ min(sizeof(cmd), udata->inlen)); >+ if (err) { >+ efa_err_rl(&ibdev->dev, >+ "Cannot copy udata for create_cq\n"); >+ goto err_out; >+ } >+ >+ if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) { >+ efa_err_rl(&ibdev->dev, >+ "Incompatible ABI params, unknown fields in udata\n"); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ if (!cmd.cq_entry_size) { >+ efa_err(&ibdev->dev, >+ "Invalid entry size [%u]\n", cmd.cq_entry_size); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { >+ efa_err(&ibdev->dev, >+ "Invalid number of sub cqs[%u] expected[%u]\n", >+ cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); >+ err = -EINVAL; >+ goto err_out; >+ } >+ >+ cq = kzalloc(sizeof(*cq), GFP_KERNEL); >+ if (!cq) { >+ err = -ENOMEM; >+ goto err_out; >+ } >+ >+ memset(&resp, 0, sizeof(resp)); This was already initialized. Few instances of these.. >+ >+/* >+ * initialize pbl indirect mode: >+ * create a chunk list out of the dma addresses of the physical pages of >+ * pbl buffer. 
>+ */ >+static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) >+{ >+ u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, >+ EFA_PAGE_SIZE); >+ struct scatterlist *sgl; >+ int sg_dma_cnt, err; >+ >+ sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); >+ if (!sgl) >+ return -ENOMEM; >+ >+ sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, >DMA_TO_DEVICE); >+ if (!sg_dma_cnt) { >+ err = -EINVAL; >+ goto err_map; >+ } >+ >+ pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; >+ pbl->phys.indirect.sgl = sgl; >+ pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; >+ err = pbl_chunk_list_create(dev, pbl); >+ if (err) { >+ efa_err(&dev->ibdev.dev, >+ "chunk_list creation failed[%d]\n", err); >+ goto err_chunk; >+ } Seems like you have many error prints in this driver. Do you really want this to show up in dmesg or just the critical ones? Consider dev or pr_dbg? Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com>
On 27-Feb-19 16:05, Steve Wise wrote: > > >> -----Original Message----- >> From: Gal Pressman <galpress@amazon.com> >> Sent: Wednesday, February 27, 2019 2:40 AM >> To: Steve Wise <swise@opengridcomputing.com>; Jason Gunthorpe >> <jgg@ziepe.ca>; Doug Ledford <dledford@redhat.com> >> Cc: Yossi Leybovich <sleybo@amazon.com>; Alexander Matushevsky >> <matua@amazon.com>; Leah Shalev <shalevl@amazon.com>; Dave Goodell >> <goodell@amazon.com>; Brian Barrett <bbarrett@amazon.com>; linux- >> rdma@vger.kernel.org; Sean Hefty <sean.hefty@intel.com>; Dennis >> Dalessandro <dennis.dalessandro@intel.com>; Leon Romanovsky >> <leon@kernel.org>; Christoph Hellwig <hch@infradead.org>; Parav Pandit >> <parav@mellanox.com>; Sagi Grimberg <sagi@grimberg.me> >> Subject: Re: [PATCH rdma-next v2 09/11] RDMA/efa: Add EFA verbs >> implementation >> >> On 26-Feb-19 23:43, Steve Wise wrote: >>>> +int efa_query_port(struct ib_device *ibdev, u8 port, >>>> + struct ib_port_attr *props) >>>> +{ >>>> + struct efa_dev *dev = to_edev(ibdev); >>>> + >>>> + memset(props, 0, sizeof(*props)); >>>> + >>>> + props->lid = 0; >>>> + props->lmc = 1; >>>> + props->sm_lid = 0; >>>> + props->sm_sl = 0; >>>> + >>>> + props->state = IB_PORT_ACTIVE; >>>> + props->phys_state = 5; >>>> + props->port_cap_flags = 0; >>>> + props->gid_tbl_len = 1; >>>> + props->pkey_tbl_len = 1; >>>> + props->bad_pkey_cntr = 0; >>>> + props->qkey_viol_cntr = 0; >>>> + props->active_speed = IB_SPEED_EDR; >>>> + props->active_width = IB_WIDTH_4X; >>>> + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); >>>> + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); >>>> + props->max_msg_sz = dev->mtu; >>>> + props->max_vl_num = 1; >>>> + >>> >>> >>> Since you memset() props to all zeros, should you bother with >>> initializing the zero fields? >> >> Will remove. >> >>> >>> >>>> + return 0; >>>> +} >>>> + >>>> +static int efa_qp_validate_cap(struct efa_dev *dev, >>>> + struct ib_qp_init_attr *init_attr) >>>> +{ >>>> + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested send wr[%u] exceeds the >> max[%u]\n", >>>> + init_attr->cap.max_send_wr, >>>> + dev->dev_attr.max_sq_depth); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested receive wr[%u] exceeds the >> max[%u]\n", >>>> + init_attr->cap.max_recv_wr, >>>> + dev->dev_attr.max_rq_depth); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested sge send[%u] exceeds the >> max[%u]\n", >>>> + init_attr->cap.max_send_sge, dev- >>> dev_attr.max_sq_sge); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "qp: requested sge recv[%u] exceeds the >> max[%u]\n", >>>> + init_attr->cap.max_recv_sge, dev- >>> dev_attr.max_rq_sge); >>>> + return -EINVAL; >>>> + } >>>> + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { >>>> + efa_err(&dev->ibdev.dev, >>>> + "requested inline data[%u] exceeds the max[%u]\n", >>>> + init_attr->cap.max_inline_data, >>>> + dev->dev_attr.inline_buf_size); >>>> + return -EINVAL; >>>> + } >>>> + >>> >>> >>> Should all these efa_err() calls really be efa_dbg()s? That's a lot of >>> log polluting for user errors. >> >> Most users don't really enable debug and we want them to have an >> indication of >> what happened. is efa_warn() better? 
> > I don't know what the preference is for *warn vs *err vs *info. But I think in general, these sorts of errors aren't logged except for debug purposes. Will change to debug. > >> >>> >>> >>>> + return 0; >>>> +} >>>> + >>>> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) >>>> +{ >>>> + struct scatterlist *sglist; >>>> + struct page *pg; >>>> + int i; >>>> + >>>> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); >>>> + if (!sglist) >>>> + return NULL; >>>> + sg_init_table(sglist, page_cnt); >>>> + for (i = 0; i < page_cnt; i++) { >>>> + pg = vmalloc_to_page(buf); >>>> + if (!pg) >>>> + goto err; >>>> + WARN_ON_ONCE(PageHighMem(pg)); >>> >>> Is this WARN_ON_ONCE() really an error that needs to be handled? >> >> AFAIK, there is no way we can actually get a higemem page here. >> The WARN is here from early dev days, it should probably be removed. >> >>> >>> >>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); >>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); >>>> + } >>>> + return sglist; >>>> + >>>> +err: >>>> + kfree(sglist); >>>> + return NULL; >>>> +} >>>> + >>> >>> >>> Reviewed-by: Steve Wise <swise@opengridcomputing.com> >> >> Thanks Steve! >
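As a concrete sketch of that change, here is the first cap check converted from efa_err() to the driver's existing efa_dbg() helper (assuming the final revision keeps the same message text):

	if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) {
		/* User error, not a driver fault: keep it at debug level */
		efa_dbg(&dev->ibdev.dev,
			"qp: requested send wr[%u] exceeds the max[%u]\n",
			init_attr->cap.max_send_wr,
			dev->dev_attr.max_sq_depth);
		return -EINVAL;
	}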
On 28-Feb-19 00:13, Saleem, Shiraz wrote: >> Subject: [PATCH rdma-next v2 09/11] RDMA/efa: Add EFA verbs implementation >> >> Add a file that implements the EFA verbs. >> >> Signed-off-by: Gal Pressman <galpress@amazon.com> >> --- >> drivers/infiniband/hw/efa/efa_verbs.c | 1891 >> +++++++++++++++++++++++++++++++++ >> 1 file changed, 1891 insertions(+) >> create mode 100644 drivers/infiniband/hw/efa/efa_verbs.c >> > [...] >> +int efa_query_port(struct ib_device *ibdev, u8 port, >> + struct ib_port_attr *props) >> +{ >> + struct efa_dev *dev = to_edev(ibdev); >> + >> + memset(props, 0, sizeof(*props)); > I don't think you need to explicitly zero initialize this struct. It's already done by core in ib_query_port(). ACK. > > [....] > >> +struct ib_qp *efa_create_qp(struct ib_pd *ibpd, >> + struct ib_qp_init_attr *init_attr, >> + struct ib_udata *udata) >> +{ >> + struct efa_com_create_qp_params create_qp_params = {}; >> + struct efa_com_create_qp_result create_qp_resp; >> + struct efa_dev *dev = to_edev(ibpd->device); >> + struct efa_ibv_create_qp_resp resp = {}; >> + struct efa_ibv_create_qp cmd = {}; >> + struct efa_ucontext *ucontext; >> + struct efa_qp *qp; >> + int err; >> + >> + ucontext = ibpd->uobject ? to_eucontext(ibpd->uobject->context) : >> + NULL; >> + >> + if (!udata) { >> + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); >> + err = -EOPNOTSUPP; >> + goto err_out; >> + } >> + >> + err = efa_qp_validate_cap(dev, init_attr); >> + if (err) >> + goto err_out; >> + >> + err = efa_qp_validate_attr(dev, init_attr); >> + if (err) >> + goto err_out; >> + >> + if (!field_avail(cmd, driver_qp_type, udata->inlen)) { >> + efa_err_rl(&dev->ibdev.dev, >> + "Incompatible ABI params, no input udata\n"); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + if (udata->inlen > sizeof(cmd) && >> + !ib_is_udata_cleared(udata, sizeof(cmd), >> + udata->inlen - sizeof(cmd))) { >> + efa_err_rl(&dev->ibdev.dev, >> + "Incompatible ABI params, unknown fields in udata\n"); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + err = ib_copy_from_udata(&cmd, udata, >> + min(sizeof(cmd), udata->inlen)); >> + if (err) { >> + efa_err_rl(&dev->ibdev.dev, >> + "Cannot copy udata for create_qp\n"); >> + goto err_out; >> + } >> + >> + if (cmd.comp_mask) { >> + efa_err_rl(&dev->ibdev.dev, >> + "Incompatible ABI params, unknown fields in udata\n"); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + qp = kzalloc(sizeof(*qp), GFP_KERNEL); >> + if (!qp) { >> + err = -ENOMEM; >> + goto err_out; >> + } >> + >> + create_qp_params.uarn = ucontext->uarn; >> + create_qp_params.pd = to_epd(ibpd)->pdn; >> + >> + if (init_attr->qp_type == IB_QPT_UD) { >> + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; >> + } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { >> + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; >> + } else { >> + efa_err(&dev->ibdev.dev, >> + "Unsupported qp type %d driver qp type %d\n", >> + init_attr->qp_type, cmd.driver_qp_type); >> + err = -EOPNOTSUPP; >> + goto err_free_qp; >> + } >> + >> + efa_dbg(&dev->ibdev.dev, "Create QP: qp type %d driver qp type >> %#x\n", >> + init_attr->qp_type, cmd.driver_qp_type); >> + create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; >> + create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; >> + create_qp_params.sq_depth = init_attr->cap.max_send_wr; >> + create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; >> + >> + create_qp_params.rq_depth = init_attr->cap.max_recv_wr; >> + create_qp_params.rq_ring_size_in_bytes = 
cmd.rq_ring_size; >> + qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); >> + if (qp->rq_size) { >> + qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, >> + qp->rq_size, >> DMA_TO_DEVICE); >> + if (!qp->rq_cpu_addr) { >> + err = -ENOMEM; >> + goto err_free_qp; >> + } >> + >> + efa_dbg(&dev->ibdev.dev, >> + "qp->cpu_addr[%p] allocated: size[%lu], dma[%pad]\n", >> + qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); >> + create_qp_params.rq_base_addr = qp->rq_dma_addr; >> + } >> + >> + memset(&resp, 0, sizeof(resp)); >> + err = efa_com_create_qp(dev->edev, &create_qp_params, >> + &create_qp_resp); >> + if (err) >> + goto err_free_mapped; >> + >> + WARN_ON_ONCE(create_qp_resp.sq_db_offset > dev->db_bar_len); >> + WARN_ON_ONCE(create_qp_resp.rq_db_offset > dev->db_bar_len); >> + WARN_ON_ONCE(create_qp_resp.llq_descriptors_offset > >> + dev->mem_bar_len); > > I saw many instances of WARN or WARN_ON_ONCE in the driver. Is this what you > really want? Perhaps a pr_ or dev_ warn variant? This WARNs can happily retire by now :), I'll remove them. > > [...] >> +static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, >> + int vector, struct ib_ucontext *ibucontext, >> + struct ib_udata *udata) >> +{ >> + struct efa_ibv_create_cq_resp resp = {}; >> + struct efa_com_create_cq_params params; >> + struct efa_com_create_cq_result result; >> + struct efa_dev *dev = to_edev(ibdev); >> + struct efa_ibv_create_cq cmd = {}; >> + struct efa_cq *cq; >> + int err; >> + >> + efa_dbg(&ibdev->dev, "create_cq entries %d udata %p\n", entries, >> udata); >> + >> + if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { >> + efa_err(&ibdev->dev, >> + "cq: requested entries[%u] non-positive or greater than >> max[%u]\n", >> + entries, dev->dev_attr.max_cq_depth); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + if (!udata) { >> + efa_err_rl(&ibdev->dev, "udata is NULL\n"); >> + err = -EOPNOTSUPP; >> + goto err_out; >> + } >> + >> + if (!field_avail(cmd, num_sub_cqs, udata->inlen)) { >> + efa_err_rl(&ibdev->dev, >> + "Incompatible ABI params, no input udata\n"); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + if (udata->inlen > sizeof(cmd) && >> + !ib_is_udata_cleared(udata, sizeof(cmd), >> + udata->inlen - sizeof(cmd))) { >> + efa_err_rl(&ibdev->dev, >> + "Incompatible ABI params, unknown fields in udata\n"); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + err = ib_copy_from_udata(&cmd, udata, >> + min(sizeof(cmd), udata->inlen)); >> + if (err) { >> + efa_err_rl(&ibdev->dev, >> + "Cannot copy udata for create_cq\n"); >> + goto err_out; >> + } >> + >> + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) { >> + efa_err_rl(&ibdev->dev, >> + "Incompatible ABI params, unknown fields in udata\n"); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + if (!cmd.cq_entry_size) { >> + efa_err(&ibdev->dev, >> + "Invalid entry size [%u]\n", cmd.cq_entry_size); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { >> + efa_err(&ibdev->dev, >> + "Invalid number of sub cqs[%u] expected[%u]\n", >> + cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + cq = kzalloc(sizeof(*cq), GFP_KERNEL); >> + if (!cq) { >> + err = -ENOMEM; >> + goto err_out; >> + } >> + >> + memset(&resp, 0, sizeof(resp)); > > This was already initialized. Few instances of these.. ACK. 
> >> + >> +/* >> + * initialize pbl indirect mode: >> + * create a chunk list out of the dma addresses of the physical pages of >> + * pbl buffer. >> + */ >> +static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) >> +{ >> + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, >> + EFA_PAGE_SIZE); >> + struct scatterlist *sgl; >> + int sg_dma_cnt, err; >> + >> + sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); >> + if (!sgl) >> + return -ENOMEM; >> + >> + sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, >> DMA_TO_DEVICE); >> + if (!sg_dma_cnt) { >> + err = -EINVAL; >> + goto err_map; >> + } >> + >> + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; >> + pbl->phys.indirect.sgl = sgl; >> + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; >> + err = pbl_chunk_list_create(dev, pbl); >> + if (err) { >> + efa_err(&dev->ibdev.dev, >> + "chunk_list creation failed[%d]\n", err); >> + goto err_chunk; >> + } > > Seems like you have many error prints in this driver. Do you really want this to show up > in dmesg or just the critical ones? Consider dev or pr_dbg? I will audit these. > > Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com> Thanks Shiraz!
On Wed, Feb 27, 2019 at 10:13:46PM +0000, Saleem, Shiraz wrote: > >+ pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; > >+ pbl->phys.indirect.sgl = sgl; > >+ pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; > >+ err = pbl_chunk_list_create(dev, pbl); > >+ if (err) { > >+ efa_err(&dev->ibdev.dev, > >+ "chunk_list creation failed[%d]\n", err); > >+ goto err_chunk; > >+ } > > Seems like you have many error prints in this driver. Do you really want this to show up > in dmesg or just the critical ones? Consider dev or pr_dbg? As a general rule, no driver should have a path where unprivileged user space can cause unbounded dmesg prints. This is sort of a security problem. We have such a common request in drivers to do this though, I wonder if we should have a special core printer function for it that is automatically limited in some way. Jason
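A rough sketch of what such a core helper could look like; the ibdev_warn_ratelimited() name is hypothetical, not an existing API, and simply wraps the kernel's dev_warn_ratelimited():

#include <linux/device.h>
#include <rdma/ib_verbs.h>

/*
 * Hypothetical core helper: a rate-limited print keyed to the IB device,
 * so user-triggerable errors cannot flood dmesg.  Built on the existing
 * dev_warn_ratelimited().
 */
#define ibdev_warn_ratelimited(ibdev, fmt, ...) \
	dev_warn_ratelimited(&(ibdev)->dev, fmt, ##__VA_ARGS__)

/*
 * A driver could then report a user-triggerable failure such as the
 * quoted chunk_list error without risking unbounded output:
 *
 *	ibdev_warn_ratelimited(&dev->ibdev,
 *			       "chunk_list creation failed[%d]\n", err);
 */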
On Wed, Feb 27, 2019 at 11:06:14AM +0200, Gal Pressman wrote: > >>>> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) > >>>> +{ > >>>> + struct scatterlist *sglist; > >>>> + struct page *pg; > >>>> + int i; > >>>> + > >>>> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); > >>>> + if (!sglist) > >>>> + return NULL; > >>>> + sg_init_table(sglist, page_cnt); > >>>> + for (i = 0; i < page_cnt; i++) { > >>>> + pg = vmalloc_to_page(buf); > >>>> + if (!pg) > >>>> + goto err; > >>>> + WARN_ON_ONCE(PageHighMem(pg)); > >>> > >>> Is this WARN_ON_ONCE() really an error that needs to be handled? > >> > >> AFAIK, there is no way we can actually get a higemem page here. > >> The WARN is here from early dev days, it should probably be removed. > >> > >>> > >>> > >>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); > >>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > > > > Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? > > EFA_PAGE_SIZE represents the device page size. Something is wrong here then, as vmalloc_to_page returns something that is *at most* PAGE_SIZE long. You can't go around and pass that into sg_set_page with EFA_PAGE_SIZE > PAGE_SIZE. Maybe this code is wrongly assuming PAGE_SIZE > EFA_PAGE_SIZE ? What you need here is to make the scatter list out of PAGE_SIZE blocks and then use Shiraz's work to fragment it into EFA_PAGE_SIZE blocks for building your pbl. dma_map_sg should use the largest sgls possible for efficiency. Also, I think there are now several places in the kernel converting vmalloc/kmalloc regions into scatterlist tables, it probably warrents adding a shared helper in lib/scatterlist.c that does a good and proper job of this. > >>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); Yuk buf += EFA_PAGE_SIZE/sizeof(*buf); Jason
On 05-Mar-19 03:22, Jason Gunthorpe wrote: > On Wed, Feb 27, 2019 at 10:13:46PM +0000, Saleem, Shiraz wrote: > >>> + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; >>> + pbl->phys.indirect.sgl = sgl; >>> + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; >>> + err = pbl_chunk_list_create(dev, pbl); >>> + if (err) { >>> + efa_err(&dev->ibdev.dev, >>> + "chunk_list creation failed[%d]\n", err); >>> + goto err_chunk; >>> + } >> >> Seems like you have many error prints in this driver. Do you really want this to show up >> in dmesg or just the critical ones? Consider dev or pr_dbg? > > As a general rule, no driver should have a path where unprivileged > user space can cause unbounded dmesg prints. > > This is sort of a security problem. As I said, auditing prints to debug. > > We have such a common request in drivers to do this though, I wonder > if we should have a special core printer function for it that is > automatically limited in some way. > > Jason >
On 05-Mar-19 03:39, Jason Gunthorpe wrote: > On Wed, Feb 27, 2019 at 11:06:14AM +0200, Gal Pressman wrote: > >>>>>> +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) >>>>>> +{ >>>>>> + struct scatterlist *sglist; >>>>>> + struct page *pg; >>>>>> + int i; >>>>>> + >>>>>> + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); >>>>>> + if (!sglist) >>>>>> + return NULL; >>>>>> + sg_init_table(sglist, page_cnt); >>>>>> + for (i = 0; i < page_cnt; i++) { >>>>>> + pg = vmalloc_to_page(buf); >>>>>> + if (!pg) >>>>>> + goto err; >>>>>> + WARN_ON_ONCE(PageHighMem(pg)); >>>>> >>>>> Is this WARN_ON_ONCE() really an error that needs to be handled? >>>> >>>> AFAIK, there is no way we can actually get a higemem page here. >>>> The WARN is here from early dev days, it should probably be removed. >>>> >>>>> >>>>> >>>>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); >>>>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); >>> >>> Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? >> >> EFA_PAGE_SIZE represents the device page size. > > Something is wrong here then, as vmalloc_to_page returns something > that is *at most* PAGE_SIZE long. You can't go around and pass that > into sg_set_page with EFA_PAGE_SIZE > PAGE_SIZE. > > Maybe this code is wrongly assuming PAGE_SIZE > EFA_PAGE_SIZE > ? We do assume PAGE_SIZE >= EFA_PAGE_SIZE. For instance, on systems where PAGE_SIZE is 64k, we still use chunks of 4k. > > What you need here is to make the scatter list out of PAGE_SIZE blocks > and then use Shiraz's work to fragment it into EFA_PAGE_SIZE blocks > for building your pbl. dma_map_sg should use the largest sgls possible > for efficiency. > > Also, I think there are now several places in the kernel converting > vmalloc/kmalloc regions into scatterlist tables, it probably warrents > adding a shared helper in lib/scatterlist.c that does a good and > proper job of this. My search yielded six occurrences, all under drivers/media. Does this justify moving this function to scatterlist.c? > >>>>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > > Yuk > > buf += EFA_PAGE_SIZE/sizeof(*buf); Done
On Tue, Mar 05, 2019 at 03:14:34PM +0200, Gal Pressman wrote: > >>>>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); > >>>>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); > >>> > >>> Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? > >> > >> EFA_PAGE_SIZE represents the device page size. > > > > Something is wrong here then, as vmalloc_to_page returns something > > that is *at most* PAGE_SIZE long. You can't go around and pass that > > into sg_set_page with EFA_PAGE_SIZE > PAGE_SIZE. > > > > Maybe this code is wrongly assuming PAGE_SIZE > EFA_PAGE_SIZE > > ? > > We do assume PAGE_SIZE >= EFA_PAGE_SIZE. > For instance, on systems where PAGE_SIZE is 64k, we still use chunks of 4k. Hurm, BUILD_BUG_ON that, I think. > > What you need here is to make the scatter list out of PAGE_SIZE blocks > > and then use Shiraz's work to fragment it into EFA_PAGE_SIZE blocks > > for building your pbl. dma_map_sg should use the largest sgls possible > > for efficiency. > > > > Also, I think there are now several places in the kernel converting > > vmalloc/kmalloc regions into scatterlist tables, it probably warrents > > adding a shared helper in lib/scatterlist.c that does a good and > > proper job of this. > > My search yielded six occurrences, all under drivers/media. > Does this justify moving this function to scatterlist.c? I wrote one under drivers/fpga a few years ago too .. See fpga_mgr_buf_load() - this handles vmap and kmap transparently, so it somewhat more general. I think 7 is certainly enough to warrant common code. Jason
On 05-Mar-19 22:15, Jason Gunthorpe wrote: > On Tue, Mar 05, 2019 at 03:14:34PM +0200, Gal Pressman wrote: >>>>>>>> + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); >>>>>>>> + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); >>>>> >>>>> Why do you need special EFA_PAGE_SIZE? Isn't PAGE_SIZE enough for you? >>>> >>>> EFA_PAGE_SIZE represents the device page size. >>> >>> Something is wrong here then, as vmalloc_to_page returns something >>> that is *at most* PAGE_SIZE long. You can't go around and pass that >>> into sg_set_page with EFA_PAGE_SIZE > PAGE_SIZE. >>> >>> Maybe this code is wrongly assuming PAGE_SIZE > EFA_PAGE_SIZE >>> ? >> >> We do assume PAGE_SIZE >= EFA_PAGE_SIZE. >> For instance, on systems where PAGE_SIZE is 64k, we still use chunks of 4k. > > Hurm, BUILD_BUG_ON that, I think. Will do. Also, looks like there's a bug in sg_set_page(), the offset shouldn't be zero but offset_in_page(buf). > >>> What you need here is to make the scatter list out of PAGE_SIZE blocks >>> and then use Shiraz's work to fragment it into EFA_PAGE_SIZE blocks >>> for building your pbl. dma_map_sg should use the largest sgls possible >>> for efficiency. >>> >>> Also, I think there are now several places in the kernel converting >>> vmalloc/kmalloc regions into scatterlist tables, it probably warrents >>> adding a shared helper in lib/scatterlist.c that does a good and >>> proper job of this. >> >> My search yielded six occurrences, all under drivers/media. >> Does this justify moving this function to scatterlist.c? > > I wrote one under drivers/fpga a few years ago too .. > > See fpga_mgr_buf_load() - this handles vmap and kmap transparently, so > it somewhat more general. > > I think 7 is certainly enough to warrant common code. Looks like some of the drivers operate on the scatterlist itself (no chaining), while others (such as the one you cited) operate on a sg table. I wonder if we should convert all to use scatterlist or sg tables? Also, EFA is special in the sense that all sg functions use PAGE_SIZE while EFA needs EFA_PAGE_SIZE. I guess it could be solved by passing that as a parameter, but that'll probably require some changes in existing API functions (__sg_alloc_table_from_pages for example)? Would you consider keeping efa_vmalloc_buf_to_sg() as a temporary solution? The scope of this change is a bit bigger than just RDMA and I'd like to split these changes.
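Putting the fixes agreed so far together, a sketch of efa_vmalloc_buf_to_sg() with the compile-time guard, the offset_in_page() fix, and the cleaner pointer arithmetic; the EFA_PAGE_SIZE stride is kept as in the original at this point in the discussion:

static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt)
{
	struct scatterlist *sglist;
	struct page *pg;
	int i;

	/* The code assumes device pages are not larger than CPU pages. */
	BUILD_BUG_ON(EFA_PAGE_SIZE > PAGE_SIZE);

	sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL);
	if (!sglist)
		return NULL;
	sg_init_table(sglist, page_cnt);
	for (i = 0; i < page_cnt; i++) {
		pg = vmalloc_to_page(buf);
		if (!pg)
			goto err;
		/* Pass the real in-page offset instead of a hard-coded 0 */
		sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE,
			    offset_in_page(buf));
		buf += EFA_PAGE_SIZE / sizeof(*buf);
	}
	return sglist;

err:
	kfree(sglist);
	return NULL;
}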
On Wed, Mar 06, 2019 at 05:55:11PM +0200, Gal Pressman wrote:
> Looks like some of the drivers operate on the scatterlist itself (no chaining),
> while others (such as the one you cited) operate on a sg table. I wonder if we
> should convert all to use scatterlist or sg tables?

sg table is generally simpler for cases that don't need extreme efficiency

> Also, EFA is special in the sense that all sg functions use PAGE_SIZE while EFA
> needs EFA_PAGE_SIZE. I guess it could be solved by passing that as a parameter,
> but that'll probably require some changes in existing API functions
> (__sg_alloc_table_from_pages for example)?

No, you are supposed to build up the sgl in large chunks and then
fragment it into whatever the HW requires after DMA mapping.

The code as written is wrong, as the IOMMU is free to consolidate the
carefully broken up SGL into a single entry after doing dma_map.

Fragmentation for HW page size limitations must be done after
mapping to use the APIs correctly.

Shiraz is working on a function to iterate over a sgl in specified
fixed size blocks (ie EFA_PAGE_SIZE), you should work with him as it
is exactly what is needed here as well.

> Would you consider keeping efa_vmalloc_buf_to_sg() as a temporary solution?
> The scope of this change is a bit bigger than just RDMA and I'd like to split
> these changes.

You still have to make what you have right..

I'd suggest you write the full function you'd like to propose for
lib/scatterlist.c in EFA and commit to move it out later.

Jason
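For context, a sketch of how the chunk-fill loop could look once such a
fixed-block iterator exists. The rdma_for_each_block()/struct ib_block_iter
names are an assumption about the interface Shiraz's work is heading towards,
and the pbl_chunk_list/EFA_PAGE_PTRS_PER_CHUNK definitions are the ones from
this patch:

/*
 * Walk the DMA-mapped sgl in EFA_PAGE_SIZE blocks and record each block's
 * DMA address in the chunk buffers allocated by pbl_chunk_list_create().
 */
static void pbl_fill_chunks(struct pbl_chunk_list *chunk_list,
			    struct scatterlist *sgl, int sg_dma_cnt)
{
	u64 *cur_chunk_buf = chunk_list->chunks[0].buf;
	unsigned int chunk_idx = 0, page_idx = 0;
	struct ib_block_iter biter;

	rdma_for_each_block(sgl, &biter, sg_dma_cnt, EFA_PAGE_SIZE) {
		/* move to the next chunk before overrunning the current one */
		if (page_idx == EFA_PAGE_PTRS_PER_CHUNK) {
			cur_chunk_buf = chunk_list->chunks[++chunk_idx].buf;
			page_idx = 0;
		}
		cur_chunk_buf[page_idx++] = rdma_block_iter_dma_address(&biter);
	}
}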
On 06-Mar-19 20:34, Jason Gunthorpe wrote:
> On Wed, Mar 06, 2019 at 05:55:11PM +0200, Gal Pressman wrote:
>> Also, EFA is special in the sense that all sg functions use PAGE_SIZE while EFA
>> needs EFA_PAGE_SIZE. I guess it could be solved by passing that as a parameter,
>> but that'll probably require some changes in existing API functions
>> (__sg_alloc_table_from_pages for example)?
>
> No, you are supposed to build up the sgl in large chunks and then
> fragment it into whatever the HW requires after DMA mapping.

That's fine, I'll change the stride to PAGE_SIZE.

>
> The code as written is wrong, as the IOMMU is free to consolidate the
> carefully broken up SGL into a single entry after doing dma_map.
>
> Fragmentation for HW page size limitations must be done after
> mapping to use the APIs correctly.

It does.
When building the chunk list (pbl_chunk_list_create) we iterate each sg element
in EFA_PAGE_SIZE strides.

To be honest, the name EFA_PAGE_SIZE is misleading as it has nothing to do with
the device's page sizes, EFA_CHUNK_PAYLOAD_SIZE is more appropriate. I'll
rename it.

>
> Shiraz is working on a function to iterate over a sgl in specified
> fixed size blocks (ie EFA_PAGE_SIZE), you should work with him as it
> is exactly what is needed here as well.
>
>> Would you consider keeping efa_vmalloc_buf_to_sg() as a temporary solution?
>> The scope of this change is a bit bigger than just RDMA and I'd like to split
>> these changes.
>
> You still have to make what you have right..
>
> I'd suggest you write the full function you'd like to propose for
> lib/scatterlist.c in EFA and commit to move it out later.
>
> Jason
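One possible shape for that rename, shown only to make the intent concrete;
the final macro names are still up to the driver:

/* 4k is the payload granularity of a pbl chunk, not a device page size */
#define EFA_CHUNK_PAYLOAD_SHIFT 12
#define EFA_CHUNK_PAYLOAD_SIZE  BIT(EFA_CHUNK_PAYLOAD_SHIFT)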
On Thu, Mar 07, 2019 at 04:44:32PM +0200, Gal Pressman wrote:
> On 06-Mar-19 20:34, Jason Gunthorpe wrote:
> > On Wed, Mar 06, 2019 at 05:55:11PM +0200, Gal Pressman wrote:
> >> Also, EFA is special in the sense that all sg functions use PAGE_SIZE while EFA
> >> needs EFA_PAGE_SIZE. I guess it could be solved by passing that as a parameter,
> >> but that'll probably require some changes in existing API functions
> >> (__sg_alloc_table_from_pages for example)?
> >
> > No, you are supposed to build up the sgl in large chunks and then
> > fragment it into whatever the HW requires after DMA mapping.
>
> That's fine, I'll change the stride to PAGE_SIZE.
>
> >
> > The code as written is wrong, as the IOMMU is free to consolidate the
> > carefully broken up SGL into a single entry after doing dma_map.
> >
> > Fragmentation for HW page size limitations must be done after
> > mapping to use the APIs correctly.
>
> It does.
> When building the chunk list (pbl_chunk_list_create) we iterate each sg element
> in EFA_PAGE_SIZE strides.

This loop in pbl_chunk_list_create should use Shiraz's stuff, but it
is OK to update it after his stuff makes it in (noting that if Shiraz's
stuff is merged first you have to fix this).

Mind that the sg offset in that loop is not always zero though.

Jason
On 07-Mar-19 20:53, Jason Gunthorpe wrote:
> On Thu, Mar 07, 2019 at 04:44:32PM +0200, Gal Pressman wrote:
>> On 06-Mar-19 20:34, Jason Gunthorpe wrote:
>>> On Wed, Mar 06, 2019 at 05:55:11PM +0200, Gal Pressman wrote:
>>>> Also, EFA is special in the sense that all sg functions use PAGE_SIZE while EFA
>>>> needs EFA_PAGE_SIZE. I guess it could be solved by passing that as a parameter,
>>>> but that'll probably require some changes in existing API functions
>>>> (__sg_alloc_table_from_pages for example)?
>>>
>>> No, you are supposed to build up the sgl in large chunks and then
>>> fragment it into whatever the HW requires after DMA mapping.
>>
>> That's fine, I'll change the stride to PAGE_SIZE.
>>
>>>
>>> The code as written is wrong, as the IOMMU is free to consolidate the
>>> carefully broken up SGL into a single entry after doing dma_map.
>>>
>>> Fragmentation for HW page size limitations must be done after
>>> mapping to use the APIs correctly.
>>
>> It does.
>> When building the chunk list (pbl_chunk_list_create) we iterate each sg element
>> in EFA_PAGE_SIZE strides.
>
> This loop in pbl_chunk_list_create should use Shiraz's stuff, but it
> is OK to update it after his stuff makes it in (noting that if Shiraz's
> stuff is merged first you have to fix this).

Alright.

>
> Mind that the sg offset in that loop is not always zero though.

Right, thanks!

for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) {
	npg_in_sg = sg_dma_len(sg) >> EFA_PAGE_SHIFT;
	for (i = 0; i < npg_in_sg; i++) {
		cur_chunk_buf[page_idx++] = sg_dma_address(sg) +
		                            ^
		                            & ~(EFA_PAGE_SIZE - 1) seems right?

		                            (EFA_PAGE_SIZE * i);
		if (page_idx == EFA_PAGE_PTRS_PER_CHUNK) {
			chunk_idx++;
			cur_chunk_buf = chunk_list->chunks[chunk_idx].buf;
			page_idx = 0;
		}
	}
}
On Sun, Mar 10, 2019 at 04:05:25PM +0200, Gal Pressman wrote:
> On 07-Mar-19 20:53, Jason Gunthorpe wrote:
> > On Thu, Mar 07, 2019 at 04:44:32PM +0200, Gal Pressman wrote:
> >> On 06-Mar-19 20:34, Jason Gunthorpe wrote:
> >>> On Wed, Mar 06, 2019 at 05:55:11PM +0200, Gal Pressman wrote:
> >>>> Also, EFA is special in the sense that all sg functions use PAGE_SIZE while EFA
> >>>> needs EFA_PAGE_SIZE. I guess it could be solved by passing that as a parameter,
> >>>> but that'll probably require some changes in existing API functions
> >>>> (__sg_alloc_table_from_pages for example)?
> >>>
> >>> No, you are supposed to build up the sgl in large chunks and then
> >>> fragment it into whatever the HW requires after DMA mapping.
> >>
> >> That's fine, I'll change the stride to PAGE_SIZE.
> >>
> >>>
> >>> The code as written is wrong, as the IOMMU is free to consolidate the
> >>> carefully broken up SGL into a single entry after doing dma_map.
> >>>
> >>> Fragmentation for HW page size limitations must be done after
> >>> mapping to use the APIs correctly.
> >>
> >> It does.
> >> When building the chunk list (pbl_chunk_list_create) we iterate each sg element
> >> in EFA_PAGE_SIZE strides.
> >
> > This loop in pbl_chunk_list_create should use Shiraz's stuff, but it
> > is OK to update it after his stuff makes it in (noting that if Shiraz's
> > stuff is merged first you have to fix this).
>
> Alright.
>
> >
> > Mind that the sg offset in that loop is not always zero though.
>
> Right, thanks!
>
> for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) {
> 	npg_in_sg = sg_dma_len(sg) >> EFA_PAGE_SHIFT;

Don't think the above will compute properly if an offset is present.

> 	for (i = 0; i < npg_in_sg; i++) {
> 		cur_chunk_buf[page_idx++] = sg_dma_address(sg) +
> 		                            ^
> 		                            & ~(EFA_PAGE_SIZE - 1) seems right?
>
> 		                            (EFA_PAGE_SIZE * i);

Yes, certainly helpful.. But Shiraz's version is cleaner :)

Jason
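Putting both remarks together, an offset-aware version of that loop might look
something like the sketch below. It is illustrative only: the variable names
follow pbl_chunk_list_create() in this patch, the chunk bookkeeping is
rearranged slightly, and Shiraz's block iterator would replace all of it:

/*
 * Offset-aware variant of the chunk-fill loop: round the start address down
 * to an EFA_PAGE_SIZE boundary and count the blocks covering offset + length.
 */
static void pbl_fill_chunks_offset_aware(struct pbl_chunk_list *chunk_list,
					 struct scatterlist *pages_sgl,
					 int sg_dma_cnt)
{
	u64 *cur_chunk_buf = chunk_list->chunks[0].buf;
	unsigned int chunk_idx = 0, page_idx = 0;
	unsigned int npg_in_sg, entry, i;
	struct scatterlist *sg;

	for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) {
		u64 dma_addr = sg_dma_address(sg);
		u32 offset = dma_addr & (EFA_PAGE_SIZE - 1);

		/* number of EFA_PAGE_SIZE blocks covering offset + length */
		npg_in_sg = DIV_ROUND_UP(offset + sg_dma_len(sg),
					 EFA_PAGE_SIZE);
		/* hand the device an EFA_PAGE_SIZE aligned start address */
		dma_addr &= ~((u64)EFA_PAGE_SIZE - 1);

		for (i = 0; i < npg_in_sg; i++) {
			if (page_idx == EFA_PAGE_PTRS_PER_CHUNK) {
				cur_chunk_buf =
					chunk_list->chunks[++chunk_idx].buf;
				page_idx = 0;
			}
			cur_chunk_buf[page_idx++] =
				dma_addr + (u64)EFA_PAGE_SIZE * i;
		}
	}
}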
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c new file mode 100644 index 000000000000..2bd39119afa6 --- /dev/null +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -0,0 +1,1891 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include <linux/vmalloc.h> + +#include <rdma/ib_addr.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> + +#include "efa.h" + +#define EFA_MMAP_FLAG_SHIFT 56 + +enum { + EFA_MMAP_DMA_PAGE = 0, + EFA_MMAP_IO_WC, + EFA_MMAP_IO_NC, +}; + +static void set_mmap_flag(u64 *mmap_key, u8 mmap_flag) +{ + *mmap_key |= (u64)mmap_flag << EFA_MMAP_FLAG_SHIFT; +} + +static u8 get_mmap_flag(u64 mmap_key) +{ + return mmap_key >> EFA_MMAP_FLAG_SHIFT; +} + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +struct efa_mmap_entry { + struct list_head list; + void *obj; + u64 address; + u64 length; + u64 key; +}; + +#define EFA_PAGE_SHIFT 12 +#define EFA_PAGE_SIZE BIT(EFA_PAGE_SHIFT) +#define EFA_PAGE_PTR_SIZE 8 + +#define EFA_CHUNK_ALLOC_SIZE BIT(EFA_PAGE_SHIFT) +#define EFA_CHUNK_PTR_SIZE sizeof(struct efa_com_ctrl_buff_info) + +#define EFA_PAGE_PTRS_PER_CHUNK \ + ((EFA_CHUNK_ALLOC_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_PAGE_PTR_SIZE) + +#define EFA_CHUNK_USED_SIZE \ + ((EFA_PAGE_PTRS_PER_CHUNK * EFA_PAGE_PTR_SIZE) + EFA_CHUNK_PTR_SIZE) + +#define EFA_SUPPORTED_ACCESS_FLAGS IB_ACCESS_LOCAL_WRITE + +struct pbl_chunk { + dma_addr_t dma_addr; + u64 *buf; + u32 length; +}; + +struct pbl_chunk_list { + struct pbl_chunk *chunks; + unsigned int size; +}; + +struct pbl_context { + union { + struct { + dma_addr_t dma_addr; + } continuous; + struct { + u32 pbl_buf_size_in_pages; + struct scatterlist *sgl; + int sg_dma_cnt; + struct pbl_chunk_list chunk_list; + } indirect; + } phys; + u64 *pbl_buf; + u32 pbl_buf_size_in_bytes; + u8 physically_continuous; +}; + +static inline struct efa_dev *to_edev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct efa_dev, ibdev); +} + +static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct efa_ucontext, ibucontext); +} + +static inline struct efa_pd *to_epd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct efa_pd, ibpd); +} + +static inline struct efa_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct efa_mr, ibmr); +} + +static inline struct efa_qp *to_eqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct efa_qp, ibqp); +} + +static inline struct efa_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct efa_cq, ibcq); +} + +static inline struct efa_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct efa_ah, ibah); +} + +#define field_avail(x, fld, sz) (offsetof(typeof(x), fld) + \ + sizeof(((typeof(x) *)0)->fld) <= (sz)) + +#define is_reserved_cleared(reserved) \ + !memchr_inv(reserved, 0, sizeof(reserved)) + +static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr, + size_t size, enum dma_data_direction dir) +{ + void *addr; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); + if (!addr) + return NULL; + + *dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir); + if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) { + efa_err(&dev->ibdev.dev, "Failed to map DMA address\n"); + free_pages_exact(addr, size); + 
return NULL; + } + + return addr; +} + +static void mmap_obj_entries_remove(struct efa_dev *dev, + struct efa_ucontext *ucontext, void *obj) +{ + struct efa_mmap_entry *entry, *tmp; + + mutex_lock(&ucontext->lock); + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + if (entry->obj == obj) { + list_del(&entry->list); + efa_dbg(&dev->ibdev.dev, + "mmap: obj[%p] key[0x%llx] addr[0x%llX] len[0x%llX] removed\n", + entry->obj, entry->key, entry->address, + entry->length); + kfree(entry); + } + } + mutex_unlock(&ucontext->lock); +} + +/* + * Since we don't track munmaps, we can't know when a user stopped using his + * mmapped buffers. + * This should be called on dealloc_ucontext in order to drain the mmap entries + * and free the (unmapped) DMA buffers. + */ +static void mmap_entries_remove_free(struct efa_dev *dev, + struct efa_ucontext *ucontext) +{ + struct efa_mmap_entry *entry, *tmp; + + mutex_lock(&ucontext->lock); + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + list_del(&entry->list); + efa_dbg(&dev->ibdev.dev, + "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n", + entry->obj, entry->key, entry->address, entry->length); + if (get_mmap_flag(entry->key) == EFA_MMAP_DMA_PAGE) + /* DMA mapping is already gone, now free the pages */ + free_pages_exact(phys_to_virt(entry->address), + entry->length); + kfree(entry); + } + mutex_unlock(&ucontext->lock); +} + +static struct efa_mmap_entry *mmap_entry_get(struct efa_dev *dev, + struct efa_ucontext *ucontext, + u64 key, + u64 len) +{ + struct efa_mmap_entry *entry, *tmp; + + mutex_lock(&ucontext->lock); + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + if (entry->key == key && entry->length == len) { + efa_dbg(&dev->ibdev.dev, + "mmap: obj[%p] key[0x%llx] addr[0x%llX] len[0x%llX] removed\n", + entry->obj, key, entry->address, + entry->length); + mutex_unlock(&ucontext->lock); + return entry; + } + } + mutex_unlock(&ucontext->lock); + + return NULL; +} + +static void mmap_entry_insert(struct efa_dev *dev, + struct efa_ucontext *ucontext, + struct efa_mmap_entry *entry, + u8 mmap_flag) +{ + mutex_lock(&ucontext->lock); + entry->key = ucontext->mmap_key; + set_mmap_flag(&entry->key, mmap_flag); + ucontext->mmap_key += PAGE_SIZE; + list_add_tail(&entry->list, &ucontext->pending_mmaps); + efa_dbg(&dev->ibdev.dev, + "mmap: obj[%p] addr[0x%llx], len[0x%llx], key[0x%llx] inserted\n", + entry->obj, entry->address, entry->length, entry->key); + mutex_unlock(&ucontext->lock); +} + +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata) +{ + struct efa_com_get_device_attr_result *dev_attr; + struct efa_ibv_ex_query_device_resp resp = {}; + struct efa_dev *dev = to_edev(ibdev); + int err; + + if (udata && udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + efa_err_rl(&ibdev->dev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + dev_attr = &dev->dev_attr; + + memset(props, 0, sizeof(*props)); + props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE; + props->page_size_cap = dev_attr->page_size_cap; + props->vendor_id = dev->pdev->vendor; + props->vendor_part_id = dev->pdev->device; + props->hw_ver = dev->pdev->subsystem_device; + props->max_qp = dev_attr->max_qp; + props->max_cq = dev_attr->max_cq; + props->max_pd = dev_attr->max_pd; + props->max_mr = dev_attr->max_mr; + props->max_ah = dev_attr->max_ah; + props->max_cqe = dev_attr->max_cq_depth; + props->max_qp_wr = min_t(u32, 
dev_attr->max_sq_depth, + dev_attr->max_rq_depth); + props->max_send_sge = dev_attr->max_sq_sge; + props->max_recv_sge = dev_attr->max_rq_sge; + + if (udata && udata->outlen) { + resp.max_sq_sge = dev_attr->max_sq_sge; + resp.max_rq_sge = dev_attr->max_rq_sge; + resp.max_sq_wr = dev_attr->max_sq_depth; + resp.max_rq_wr = dev_attr->max_rq_depth; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + efa_err_rl(&ibdev->dev, + "Failed to copy udata for query_device\n"); + return err; + } + } + + return 0; +} + +int efa_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct efa_dev *dev = to_edev(ibdev); + + memset(props, 0, sizeof(*props)); + + props->lid = 0; + props->lmc = 1; + props->sm_lid = 0; + props->sm_sl = 0; + + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + props->port_cap_flags = 0; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + props->active_speed = IB_SPEED_EDR; + props->active_width = IB_WIDTH_4X; + props->max_mtu = ib_mtu_int_to_enum(dev->mtu); + props->active_mtu = ib_mtu_int_to_enum(dev->mtu); + props->max_msg_sz = dev->mtu; + props->max_vl_num = 1; + + return 0; +} + +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_query_qp_params params = {}; + struct efa_com_query_qp_result result; + struct efa_qp *qp = to_eqp(ibqp); + int err; + +#define EFA_QUERY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \ + IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP) + + if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) { + efa_err(&dev->ibdev.dev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_QUERY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + params.qp_handle = qp->qp_handle; + err = efa_com_query_qp(dev->edev, ¶ms, &result); + if (err) + return err; + + qp_attr->qp_state = result.qp_state; + qp_attr->qkey = result.qkey; + qp_attr->sq_psn = result.sq_psn; + qp_attr->sq_draining = result.sq_draining; + qp_attr->port_num = 1; + qp_attr->pkey_index = 0; + + qp_attr->cap.max_send_wr = qp->max_send_wr; + qp_attr->cap.max_recv_wr = qp->max_recv_wr; + qp_attr->cap.max_send_sge = qp->max_send_sge; + qp_attr->cap.max_recv_sge = qp->max_recv_sge; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->qp_context = ibqp->qp_context; + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int efa_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct efa_dev *dev = to_edev(ibdev); + + memcpy(gid->raw, dev->addr, sizeof(dev->addr)); + + return 0; +} + +int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + if (index > 0) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn) +{ + struct efa_com_dealloc_pd_params params = { + .pdn = pdn, + }; + + return efa_com_dealloc_pd(dev->edev, ¶ms); +} + +int efa_alloc_pd(struct ib_pd *ibpd, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_ibv_alloc_pd_resp resp = {}; + struct efa_com_alloc_pd_result result; + struct efa_pd *pd = 
to_epd(ibpd); + int err; + + if (!udata) { + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + efa_err_rl(&dev->ibdev.dev, + "Incompatible ABI params, udata not cleared\n"); + err = -EINVAL; + goto err_out; + } + + err = efa_com_alloc_pd(dev->edev, &result); + if (err) + goto err_out; + + pd->pdn = result.pdn; + resp.pdn = result.pdn; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + efa_err_rl(&dev->ibdev.dev, + "Failed to copy udata for alloc_pd\n"); + goto err_dealloc_pd; + } + } + + efa_dbg(&dev->ibdev.dev, "Allocated pd[%d]\n", pd->pdn); + + return 0; + +err_dealloc_pd: + efa_pd_dealloc(dev, result.pdn); +err_out: + efa_stat_inc(dev, dev->stats.sw_stats.alloc_pd_err); + return err; +} + +void efa_dealloc_pd(struct ib_pd *ibpd) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_pd *pd = to_epd(ibpd); + + efa_dbg(&dev->ibdev.dev, "Dealloc pd[%d]\n", pd->pdn); + efa_pd_dealloc(dev, pd->pdn); +} + +int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle) +{ + struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle }; + + return efa_com_destroy_qp(dev->edev, ¶ms); +} + +int efa_destroy_qp(struct ib_qp *ibqp) +{ + struct efa_dev *dev = to_edev(ibqp->pd->device); + struct efa_qp *qp = to_eqp(ibqp); + + efa_dbg(&dev->ibdev.dev, "Destroy qp[%u]\n", ibqp->qp_num); + efa_destroy_qp_handle(dev, qp->qp_handle); + + if (qp->rq_cpu_addr) { + efa_dbg(&dev->ibdev.dev, + "qp->cpu_addr[%p] freed: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, + &qp->rq_dma_addr); + dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size, + DMA_TO_DEVICE); + } + + kfree(qp); + return 0; +} + +static int qp_mmap_entries_setup(struct efa_qp *qp, + struct efa_dev *dev, + struct efa_ucontext *ucontext, + struct efa_com_create_qp_params *params, + struct efa_ibv_create_qp_resp *resp) +{ + struct efa_mmap_entry *rq_db_entry; + struct efa_mmap_entry *sq_db_entry; + struct efa_mmap_entry *rq_entry; + struct efa_mmap_entry *sq_entry; + + sq_db_entry = kzalloc(sizeof(*sq_db_entry), GFP_KERNEL); + sq_entry = kzalloc(sizeof(*sq_entry), GFP_KERNEL); + if (!sq_db_entry || !sq_entry) + goto err_alloc; + + if (qp->rq_size) { + rq_entry = kzalloc(sizeof(*rq_entry), GFP_KERNEL); + rq_db_entry = kzalloc(sizeof(*rq_db_entry), GFP_KERNEL); + if (!rq_entry || !rq_db_entry) + goto err_alloc_rq; + + rq_db_entry->obj = qp; + rq_entry->obj = qp; + + rq_entry->address = virt_to_phys(qp->rq_cpu_addr); + rq_entry->length = qp->rq_size; + mmap_entry_insert(dev, ucontext, rq_entry, EFA_MMAP_DMA_PAGE); + resp->rq_mmap_key = rq_entry->key; + resp->rq_mmap_size = qp->rq_size; + + rq_db_entry->address = dev->db_bar_addr + + resp->rq_db_offset; + rq_db_entry->length = PAGE_SIZE; + mmap_entry_insert(dev, ucontext, rq_db_entry, EFA_MMAP_IO_NC); + resp->rq_db_mmap_key = rq_db_entry->key; + resp->rq_db_offset &= ~PAGE_MASK; + } + + sq_db_entry->obj = qp; + sq_entry->obj = qp; + + sq_db_entry->address = dev->db_bar_addr + resp->sq_db_offset; + resp->sq_db_offset &= ~PAGE_MASK; + sq_db_entry->length = PAGE_SIZE; + mmap_entry_insert(dev, ucontext, sq_db_entry, EFA_MMAP_IO_NC); + resp->sq_db_mmap_key = sq_db_entry->key; + + sq_entry->address = dev->mem_bar_addr + resp->llq_desc_offset; + resp->llq_desc_offset &= ~PAGE_MASK; + sq_entry->length = PAGE_ALIGN(params->sq_ring_size_in_bytes + + resp->llq_desc_offset); + mmap_entry_insert(dev, 
ucontext, sq_entry, EFA_MMAP_IO_WC); + resp->llq_desc_mmap_key = sq_entry->key; + + return 0; + +err_alloc_rq: + kfree(rq_entry); + kfree(rq_db_entry); +err_alloc: + kfree(sq_entry); + kfree(sq_db_entry); + return -ENOMEM; +} + +static int efa_qp_validate_cap(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { + efa_err(&dev->ibdev.dev, + "qp: requested send wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_wr, + dev->dev_attr.max_sq_depth); + return -EINVAL; + } + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { + efa_err(&dev->ibdev.dev, + "qp: requested receive wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_wr, + dev->dev_attr.max_rq_depth); + return -EINVAL; + } + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { + efa_err(&dev->ibdev.dev, + "qp: requested sge send[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); + return -EINVAL; + } + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { + efa_err(&dev->ibdev.dev, + "qp: requested sge recv[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); + return -EINVAL; + } + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { + efa_err(&dev->ibdev.dev, + "requested inline data[%u] exceeds the max[%u]\n", + init_attr->cap.max_inline_data, + dev->dev_attr.inline_buf_size); + return -EINVAL; + } + + return 0; +} + +static int efa_qp_validate_attr(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->qp_type != IB_QPT_DRIVER && + init_attr->qp_type != IB_QPT_UD) { + efa_err(&dev->ibdev.dev, + "Unsupported qp type %d\n", init_attr->qp_type); + return -EOPNOTSUPP; + } + + if (init_attr->srq) { + efa_err(&dev->ibdev.dev, "SRQ is not supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags) { + efa_err(&dev->ibdev.dev, "Unsupported create flags\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +struct ib_qp *efa_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_com_create_qp_params create_qp_params = {}; + struct efa_com_create_qp_result create_qp_resp; + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_ibv_create_qp_resp resp = {}; + struct efa_ibv_create_qp cmd = {}; + struct efa_ucontext *ucontext; + struct efa_qp *qp; + int err; + + ucontext = ibpd->uobject ? 
to_eucontext(ibpd->uobject->context) : + NULL; + + if (!udata) { + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + err = efa_qp_validate_cap(dev, init_attr); + if (err) + goto err_out; + + err = efa_qp_validate_attr(dev, init_attr); + if (err) + goto err_out; + + if (!field_avail(cmd, driver_qp_type, udata->inlen)) { + efa_err_rl(&dev->ibdev.dev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + efa_err_rl(&dev->ibdev.dev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + efa_err_rl(&dev->ibdev.dev, + "Cannot copy udata for create_qp\n"); + goto err_out; + } + + if (cmd.comp_mask) { + efa_err_rl(&dev->ibdev.dev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + err = -ENOMEM; + goto err_out; + } + + create_qp_params.uarn = ucontext->uarn; + create_qp_params.pd = to_epd(ibpd)->pdn; + + if (init_attr->qp_type == IB_QPT_UD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; + } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; + } else { + efa_err(&dev->ibdev.dev, + "Unsupported qp type %d driver qp type %d\n", + init_attr->qp_type, cmd.driver_qp_type); + err = -EOPNOTSUPP; + goto err_free_qp; + } + + efa_dbg(&dev->ibdev.dev, "Create QP: qp type %d driver qp type %#x\n", + init_attr->qp_type, cmd.driver_qp_type); + create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; + create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; + create_qp_params.sq_depth = init_attr->cap.max_send_wr; + create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; + + create_qp_params.rq_depth = init_attr->cap.max_recv_wr; + create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size; + qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); + if (qp->rq_size) { + qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + if (!qp->rq_cpu_addr) { + err = -ENOMEM; + goto err_free_qp; + } + + efa_dbg(&dev->ibdev.dev, + "qp->cpu_addr[%p] allocated: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); + create_qp_params.rq_base_addr = qp->rq_dma_addr; + } + + memset(&resp, 0, sizeof(resp)); + err = efa_com_create_qp(dev->edev, &create_qp_params, + &create_qp_resp); + if (err) + goto err_free_mapped; + + WARN_ON_ONCE(create_qp_resp.sq_db_offset > dev->db_bar_len); + WARN_ON_ONCE(create_qp_resp.rq_db_offset > dev->db_bar_len); + WARN_ON_ONCE(create_qp_resp.llq_descriptors_offset > + dev->mem_bar_len); + + resp.sq_db_offset = create_qp_resp.sq_db_offset; + resp.rq_db_offset = create_qp_resp.rq_db_offset; + resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset; + resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx; + resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx; + + err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params, + &resp); + if (err) + goto err_destroy_qp; + + qp->qp_handle = create_qp_resp.qp_handle; + qp->ibqp.qp_num = create_qp_resp.qp_num; + qp->ibqp.qp_type = init_attr->qp_type; + qp->max_send_wr = init_attr->cap.max_send_wr; + qp->max_recv_wr = init_attr->cap.max_recv_wr; + qp->max_send_sge = 
init_attr->cap.max_send_sge; + qp->max_recv_sge = init_attr->cap.max_recv_sge; + qp->max_inline_data = init_attr->cap.max_inline_data; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + efa_err_rl(&dev->ibdev.dev, + "Failed to copy udata for qp[%u]\n", + create_qp_resp.qp_num); + goto err_mmap_remove; + } + } + + efa_dbg(&dev->ibdev.dev, "Created qp[%d]\n", qp->ibqp.qp_num); + + return &qp->ibqp; + +err_mmap_remove: + mmap_obj_entries_remove(dev, ucontext, qp); +err_destroy_qp: + efa_destroy_qp_handle(dev, create_qp_resp.qp_handle); +err_free_mapped: + if (qp->rq_size) { + dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size, + DMA_TO_DEVICE); + free_pages_exact(qp->rq_cpu_addr, qp->rq_size); + } +err_free_qp: + kfree(qp); +err_out: + efa_stat_inc(dev, dev->stats.sw_stats.create_qp_err); + return ERR_PTR(err); +} + +static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ +#define EFA_MODIFY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \ + IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN) + + if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) { + efa_err(&dev->ibdev.dev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask)) { + efa_err(&dev->ibdev.dev, "Invalid modify QP parameters\n"); + return -EINVAL; + } + + if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) { + efa_err(&dev->ibdev.dev, "Can't change port num\n"); + return -EOPNOTSUPP; + } + + if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) { + efa_err(&dev->ibdev.dev, "Can't change pkey index\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_modify_qp_params params = {}; + struct efa_qp *qp = to_eqp(ibqp); + enum ib_qp_state cur_state; + enum ib_qp_state new_state; + int err; + + if (!udata) { + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); + return -EOPNOTSUPP; + } + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + efa_err_rl(&dev->ibdev.dev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state : + qp->state; + new_state = qp_attr_mask & IB_QP_STATE ? 
qp_attr->qp_state : cur_state; + + err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state, + new_state); + if (err) + return err; + + params.qp_handle = qp->qp_handle; + + if (qp_attr_mask & IB_QP_STATE) { + params.modify_mask |= BIT(EFA_ADMIN_QP_STATE_BIT) | + BIT(EFA_ADMIN_CUR_QP_STATE_BIT); + params.cur_qp_state = qp_attr->cur_qp_state; + params.qp_state = qp_attr->qp_state; + } + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + params.modify_mask |= + BIT(EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT); + params.sq_drained_async_notify = qp_attr->en_sqd_async_notify; + } + + if (qp_attr_mask & IB_QP_QKEY) { + params.modify_mask |= BIT(EFA_ADMIN_QKEY_BIT); + params.qkey = qp_attr->qkey; + } + + if (qp_attr_mask & IB_QP_SQ_PSN) { + params.modify_mask |= BIT(EFA_ADMIN_SQ_PSN_BIT); + params.sq_psn = qp_attr->sq_psn; + } + + err = efa_com_modify_qp(dev->edev, ¶ms); + if (err) + return err; + + qp->state = new_state; + + return 0; +} + +static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx) +{ + struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx }; + + return efa_com_destroy_cq(dev->edev, ¶ms); +} + +int efa_destroy_cq(struct ib_cq *ibcq) +{ + struct efa_dev *dev = to_edev(ibcq->device); + struct efa_cq *cq = to_ecq(ibcq); + + efa_dbg(&dev->ibdev.dev, + "Destroy cq[%d] virt[%p] freed: size[%lu], dma[%pad]\n", + cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); + + efa_destroy_cq_idx(dev, cq->cq_idx); + + dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + + kfree(cq); + return 0; +} + +static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, + struct efa_ibv_create_cq_resp *resp) +{ + struct efa_mmap_entry *cq_entry; + + cq_entry = kzalloc(sizeof(*cq_entry), GFP_KERNEL); + if (!cq_entry) + return -ENOMEM; + + cq_entry->obj = cq; + + cq_entry->address = virt_to_phys(cq->cpu_addr); + cq_entry->length = cq->size; + mmap_entry_insert(dev, cq->ucontext, cq_entry, EFA_MMAP_DMA_PAGE); + resp->q_mmap_key = cq_entry->key; + resp->q_mmap_size = cq_entry->length; + + return 0; +} + +static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *ibucontext, + struct ib_udata *udata) +{ + struct efa_ibv_create_cq_resp resp = {}; + struct efa_com_create_cq_params params; + struct efa_com_create_cq_result result; + struct efa_dev *dev = to_edev(ibdev); + struct efa_ibv_create_cq cmd = {}; + struct efa_cq *cq; + int err; + + efa_dbg(&ibdev->dev, "create_cq entries %d udata %p\n", entries, udata); + + if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { + efa_err(&ibdev->dev, + "cq: requested entries[%u] non-positive or greater than max[%u]\n", + entries, dev->dev_attr.max_cq_depth); + err = -EINVAL; + goto err_out; + } + + if (!udata) { + efa_err_rl(&ibdev->dev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (!field_avail(cmd, num_sub_cqs, udata->inlen)) { + efa_err_rl(&ibdev->dev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { + efa_err_rl(&ibdev->dev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + efa_err_rl(&ibdev->dev, + "Cannot copy udata for create_cq\n"); + goto err_out; + } + + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) { + efa_err_rl(&ibdev->dev, + 
"Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + if (!cmd.cq_entry_size) { + efa_err(&ibdev->dev, + "Invalid entry size [%u]\n", cmd.cq_entry_size); + err = -EINVAL; + goto err_out; + } + + if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { + efa_err(&ibdev->dev, + "Invalid number of sub cqs[%u] expected[%u]\n", + cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); + err = -EINVAL; + goto err_out; + } + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + err = -ENOMEM; + goto err_out; + } + + memset(&resp, 0, sizeof(resp)); + cq->ucontext = to_eucontext(ibucontext); + cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); + cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + if (!cq->cpu_addr) { + err = -ENOMEM; + goto err_free_cq; + } + + params.uarn = cq->ucontext->uarn; + params.cq_depth = entries; + params.dma_addr = cq->dma_addr; + params.entry_size_in_bytes = cmd.cq_entry_size; + params.num_sub_cqs = cmd.num_sub_cqs; + err = efa_com_create_cq(dev->edev, ¶ms, &result); + if (err) + goto err_free_mapped; + + resp.cq_idx = result.cq_idx; + cq->cq_idx = result.cq_idx; + cq->ibcq.cqe = result.actual_depth; + WARN_ON_ONCE(entries != result.actual_depth); + + err = cq_mmap_entries_setup(dev, cq, &resp); + if (err) { + efa_err(&ibdev->dev, + "Could not setup cq[%u] mmap entries\n", cq->cq_idx); + goto err_destroy_cq; + } + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + efa_err_rl(&ibdev->dev, + "Failed to copy udata for create_cq\n"); + goto err_mmap_remove; + } + } + + efa_dbg(&ibdev->dev, + "Created cq[%d], cq depth[%u]. dma[%pad] virt[%p]\n", + cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr); + + return &cq->ibcq; + +err_mmap_remove: + mmap_obj_entries_remove(dev, to_eucontext(ibucontext), cq); +err_destroy_cq: + efa_destroy_cq_idx(dev, cq->cq_idx); +err_free_mapped: + dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + free_pages_exact(cq->cpu_addr, cq->size); +err_free_cq: + kfree(cq); +err_out: + efa_stat_inc(dev, dev->stats.sw_stats.create_cq_err); + return ERR_PTR(err); +} + +struct ib_cq *efa_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +{ + return do_create_cq(ibdev, attr->cqe, attr->comp_vector, ibucontext, + udata); +} + +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + unsigned int page_idx = 0; + unsigned int hp_idx = 0; + struct scatterlist *sg; + unsigned int entry; + + if (umem->page_shift != PAGE_SHIFT) { + efa_err(&dev->ibdev.dev, + "umem invalid page shift %d\n", umem->page_shift); + return -EINVAL; + } + + efa_dbg(&dev->ibdev.dev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + if (unlikely(sg_dma_len(sg) != PAGE_SIZE)) { + efa_err(&dev->ibdev.dev, + "sg_dma_len[%u] != PAGE_SIZE[%lu]\n", + sg_dma_len(sg), PAGE_SIZE); + return -EINVAL; + } + + if (page_idx % pages_in_hp == 0) { + page_list[hp_idx] = sg_dma_address(sg); + hp_idx++; + } + page_idx++; + } + + return 0; +} + +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) +{ + struct scatterlist *sglist; + struct page *pg; + int i; + + sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL); + if (!sglist) + return 
NULL; + sg_init_table(sglist, page_cnt); + for (i = 0; i < page_cnt; i++) { + pg = vmalloc_to_page(buf); + if (!pg) + goto err; + WARN_ON_ONCE(PageHighMem(pg)); + sg_set_page(&sglist[i], pg, EFA_PAGE_SIZE, 0); + buf = (u64 *)((u8 *)buf + EFA_PAGE_SIZE); + } + return sglist; + +err: + kfree(sglist); + return NULL; +} + +/* + * create a chunk list of physical pages dma addresses from the supplied + * scatter gather list + */ +static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) +{ + unsigned int entry, npg_in_sg, chunk_list_size, chunk_idx, page_idx; + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages; + struct scatterlist *pages_sgl = pbl->phys.indirect.sgl; + int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; + struct efa_com_ctrl_buff_info *ctrl_buf; + u64 *cur_chunk_buf, *prev_chunk_buf; + struct scatterlist *sg; + dma_addr_t dma_addr; + int i; + + /* allocate a chunk list that consists of 4KB chunks */ + chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PAGE_PTRS_PER_CHUNK); + + chunk_list->size = chunk_list_size; + chunk_list->chunks = kcalloc(chunk_list_size, + sizeof(*chunk_list->chunks), + GFP_KERNEL); + if (!chunk_list->chunks) + return -ENOMEM; + + efa_dbg(&dev->ibdev.dev, + "chunk_list_size[%u] - pages[%u]\n", chunk_list_size, + page_cnt); + + /* allocate chunk buffers: */ + for (i = 0; i < chunk_list_size; i++) { + chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_ALLOC_SIZE, + GFP_KERNEL); + if (!chunk_list->chunks[i].buf) + goto chunk_list_dealloc; + + chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE; + } + chunk_list->chunks[chunk_list_size - 1].length = + ((page_cnt % EFA_PAGE_PTRS_PER_CHUNK) * EFA_PAGE_PTR_SIZE) + + EFA_CHUNK_PTR_SIZE; + + /* fill the dma addresses of sg list pages to chunks: */ + chunk_idx = 0; + page_idx = 0; + cur_chunk_buf = chunk_list->chunks[0].buf; + for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) { + npg_in_sg = sg_dma_len(sg) >> EFA_PAGE_SHIFT; + for (i = 0; i < npg_in_sg; i++) { + cur_chunk_buf[page_idx++] = sg_dma_address(sg) + + (EFA_PAGE_SIZE * i); + + if (page_idx == EFA_PAGE_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + page_idx = 0; + } + } + } + + /* map chunks to dma and fill chunks next ptrs */ + for (i = chunk_list_size - 1; i >= 0; i--) { + dma_addr = dma_map_single(&dev->pdev->dev, + chunk_list->chunks[i].buf, + chunk_list->chunks[i].length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + efa_err(&dev->ibdev.dev, + "chunk[%u] dma_map_failed\n", i); + goto chunk_list_unmap; + } + + chunk_list->chunks[i].dma_addr = dma_addr; + efa_dbg(&dev->ibdev.dev, + "chunk[%u] mapped at [%pad]\n", i, &dma_addr); + + if (!i) + break; + + prev_chunk_buf = chunk_list->chunks[i - 1].buf; + + ctrl_buf = (struct efa_com_ctrl_buff_info *) + &prev_chunk_buf[EFA_PAGE_PTRS_PER_CHUNK]; + ctrl_buf->length = chunk_list->chunks[i].length; + + efa_com_set_dma_addr(dma_addr, + &ctrl_buf->address.mem_addr_high, + &ctrl_buf->address.mem_addr_low); + } + + return 0; + +chunk_list_unmap: + for (; i < chunk_list_size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + } +chunk_list_dealloc: + for (i = 0; i < chunk_list_size; i++) + kfree(chunk_list->chunks[i].buf); + + kfree(chunk_list->chunks); + return -ENOMEM; +} + +static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = 
&pbl->phys.indirect.chunk_list; + int i; + + for (i = 0; i < chunk_list->size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + kfree(chunk_list->chunks[i].buf); + } + + kfree(chunk_list->chunks); +} + +/* initialize pbl continuous mode: map pbl buffer to a dma address. */ +static int pbl_continuous_initialize(struct efa_dev *dev, + struct pbl_context *pbl) +{ + dma_addr_t dma_addr; + + dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + efa_err(&dev->ibdev.dev, "Unable to map pbl to DMA address\n"); + return -ENOMEM; + } + + pbl->phys.continuous.dma_addr = dma_addr; + efa_dbg(&dev->ibdev.dev, + "pbl continuous - dma_addr = %pad, size[%u]\n", + &dma_addr, pbl->pbl_buf_size_in_bytes); + + return 0; +} + +/* + * initialize pbl indirect mode: + * create a chunk list out of the dma addresses of the physical pages of + * pbl buffer. + */ +static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) +{ + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, + EFA_PAGE_SIZE); + struct scatterlist *sgl; + int sg_dma_cnt, err; + + sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); + if (!sgl) + return -ENOMEM; + + sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); + if (!sg_dma_cnt) { + err = -EINVAL; + goto err_map; + } + + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; + pbl->phys.indirect.sgl = sgl; + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; + err = pbl_chunk_list_create(dev, pbl); + if (err) { + efa_err(&dev->ibdev.dev, + "chunk_list creation failed[%d]\n", err); + goto err_chunk; + } + + efa_dbg(&dev->ibdev.dev, + "pbl indirect - size[%u], chunks[%u]\n", + pbl->pbl_buf_size_in_bytes, + pbl->phys.indirect.chunk_list.size); + + return 0; + +err_chunk: + dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); +err_map: + kfree(sgl); + return err; +} + +static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) +{ + pbl_chunk_list_destroy(dev, pbl); + dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl, + pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE); + kfree(pbl->phys.indirect.sgl); +} + +/* create a page buffer list from a mapped user memory region */ +static int pbl_create(struct efa_dev *dev, + struct pbl_context *pbl, + struct ib_umem *umem, + int hp_cnt, + u8 hp_shift) +{ + int err; + + pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_PAGE_PTR_SIZE; + pbl->pbl_buf = kzalloc(pbl->pbl_buf_size_in_bytes, + GFP_KERNEL | __GFP_NOWARN); + if (pbl->pbl_buf) { + pbl->physically_continuous = 1; + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); + if (err) + goto err_continuous; + err = pbl_continuous_initialize(dev, pbl); + if (err) + goto err_continuous; + } else { + pbl->physically_continuous = 0; + pbl->pbl_buf = vzalloc(pbl->pbl_buf_size_in_bytes); + if (!pbl->pbl_buf) + return -ENOMEM; + + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); + if (err) + goto err_indirect; + err = pbl_indirect_initialize(dev, pbl); + if (err) + goto err_indirect; + } + + efa_dbg(&dev->ibdev.dev, + "user_pbl_created: user_pages[%u], continuous[%u]\n", + hp_cnt, pbl->physically_continuous); + + return 0; + +err_continuous: + kfree(pbl->pbl_buf); + return err; +err_indirect: + vfree(pbl->pbl_buf); + return err; +} + +static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ 
+ if (pbl->physically_continuous) { + dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + kfree(pbl->pbl_buf); + } else { + pbl_indirect_terminate(dev, pbl); + vfree(pbl->pbl_buf); + } +} + +static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + + params->inline_pbl = 1; + err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, + params->page_num, params->page_shift); + if (err) + return err; + + efa_dbg(&dev->ibdev.dev, + "inline_pbl_array - pages[%u]\n", params->page_num); + + return 0; +} + +static int efa_create_pbl(struct efa_dev *dev, + struct pbl_context *pbl, + struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + + err = pbl_create(dev, pbl, mr->umem, params->page_num, + params->page_shift); + if (err) { + efa_err(&dev->ibdev.dev, "Failed to create pbl[%d]\n", err); + return err; + } + + params->inline_pbl = 0; + params->indirect = !pbl->physically_continuous; + if (pbl->physically_continuous) { + params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes; + + efa_com_set_dma_addr(pbl->phys.continuous.dma_addr, + ¶ms->pbl.pbl.address.mem_addr_high, + ¶ms->pbl.pbl.address.mem_addr_low); + } else { + params->pbl.pbl.length = + pbl->phys.indirect.chunk_list.chunks[0].length; + + efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr, + ¶ms->pbl.pbl.address.mem_addr_high, + ¶ms->pbl.pbl.address.mem_addr_low); + } + + return 0; +} + +static void efa_cont_pages(struct ib_umem *umem, u64 addr, + unsigned long max_page_shift, + int *count, u8 *shift, u32 *ncont) +{ + unsigned long page_shift = umem->page_shift; + struct scatterlist *sg; + u64 base = ~0, p = 0; + unsigned long tmp; + unsigned long m; + u64 len, pfn; + int i = 0; + int entry; + + addr = addr >> page_shift; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, BITS_PER_LONG); + if (max_page_shift) + m = min_t(unsigned long, max_page_shift - page_shift, m); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + if (base + p != pfn) { + /* + * If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; + } + + p += len; + i += len; + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + *ncont = 0; + } + + *shift = page_shift + m; + *count = i; +} + +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_com_reg_mr_params params = {}; + struct efa_com_reg_mr_result result = {}; + unsigned long max_page_shift; + struct pbl_context pbl; + struct efa_mr *mr; + int inline_size; + int npages; + int err; + + if (!udata) { + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { + efa_err_rl(&dev->ibdev.dev, + "Incompatible ABI params, udata not cleared\n"); + err = -EINVAL; + goto err_out; + } + + if (access_flags & ~EFA_SUPPORTED_ACCESS_FLAGS) { + efa_err(&dev->ibdev.dev, + "Unsupported access flags[%#x], supported[%#x]\n", + access_flags, EFA_SUPPORTED_ACCESS_FLAGS); + err = -EOPNOTSUPP; + goto 
err_out; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + + mr->umem = ib_umem_get(udata, start, length, access_flags, 0); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + efa_err(&dev->ibdev.dev, + "Failed to pin and map user space memory[%d]\n", err); + goto err_free; + } + + params.pd = to_epd(ibpd)->pdn; + params.iova = virt_addr; + params.mr_length_in_bytes = length; + params.permissions = access_flags & 0x1; + max_page_shift = fls64(dev->dev_attr.page_size_cap); + + efa_cont_pages(mr->umem, start, max_page_shift, &npages, + ¶ms.page_shift, ¶ms.page_num); + efa_dbg(&dev->ibdev.dev, + "start %#llx length %#llx npages %d params.page_shift %u params.page_num %u\n", + start, length, npages, params.page_shift, params.page_num); + + inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array); + if (params.page_num <= inline_size) { + err = efa_create_inline_pbl(dev, mr, ¶ms); + if (err) + goto err_unmap; + + err = efa_com_register_mr(dev->edev, ¶ms, &result); + if (err) + goto err_unmap; + } else { + err = efa_create_pbl(dev, &pbl, mr, ¶ms); + if (err) + goto err_unmap; + + err = efa_com_register_mr(dev->edev, ¶ms, &result); + pbl_destroy(dev, &pbl); + + if (err) + goto err_unmap; + } + + mr->ibmr.lkey = result.l_key; + mr->ibmr.rkey = result.r_key; + mr->ibmr.length = length; + efa_dbg(&dev->ibdev.dev, "Registered mr[%d]\n", mr->ibmr.lkey); + + return &mr->ibmr; + +err_unmap: + ib_umem_release(mr->umem); +err_free: + kfree(mr); +err_out: + efa_stat_inc(dev, dev->stats.sw_stats.reg_mr_err); + return ERR_PTR(err); +} + +int efa_dereg_mr(struct ib_mr *ibmr) +{ + struct efa_dev *dev = to_edev(ibmr->device); + struct efa_com_dereg_mr_params params; + struct efa_mr *mr = to_emr(ibmr); + + efa_dbg(&dev->ibdev.dev, "Deregister mr[%d]\n", ibmr->lkey); + + if (mr->umem) { + params.l_key = mr->ibmr.lkey; + efa_com_dereg_mr(dev->edev, ¶ms); + ib_umem_release(mr->umem); + } + + kfree(mr); + + return 0; +} + +int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_CAP_PROT_EFA; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) { + efa_err(&ibdev->dev, "Couldn't query port err[%d]\n", err); + return err; + } + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) +{ + struct efa_com_dealloc_uar_params params = { + .uarn = uarn, + }; + + return efa_com_dealloc_uar(dev->edev, ¶ms); +} + +struct ib_ucontext *efa_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct efa_ibv_alloc_ucontext_resp resp = {}; + struct efa_com_alloc_uar_result result; + struct efa_dev *dev = to_edev(ibdev); + struct efa_ucontext *ucontext; + int err; + + /* + * it's fine if the driver does not know all request fields, + * we will ack input fields in our response. 
+ */ + + ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); + if (!ucontext) { + err = -ENOMEM; + goto err_out; + } + + err = efa_com_alloc_uar(dev->edev, &result); + if (err) + goto err_free_ucontext; + + ucontext->uarn = result.uarn; + mutex_init(&ucontext->lock); + INIT_LIST_HEAD(&ucontext->pending_mmaps); + + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; + resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; + resp.inline_buf_size = dev->dev_attr.inline_buf_size; + resp.max_llq_size = dev->dev_attr.max_llq_size; + + if (udata && udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) + goto err_dealloc_uar; + } + + return &ucontext->ibucontext; + +err_dealloc_uar: + efa_dealloc_uar(dev, result.uarn); +err_free_ucontext: + kfree(ucontext); +err_out: + efa_stat_inc(dev, dev->stats.sw_stats.alloc_ucontext_err); + return ERR_PTR(err); +} + +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + + mmap_entries_remove_free(dev, ucontext); + efa_dealloc_uar(dev, ucontext->uarn); + kfree(ucontext); + + return 0; +} + +static int __efa_mmap(struct efa_dev *dev, + struct efa_ucontext *ucontext, + struct vm_area_struct *vma, + struct efa_mmap_entry *entry) +{ + u8 mmap_flag = get_mmap_flag(entry->key); + u64 pfn = entry->address >> PAGE_SHIFT; + u64 address = entry->address; + u64 length = entry->length; + int err; + + efa_dbg(&dev->ibdev.dev, + "Mapping address[%#llx], length[%#llx], mmap_flag[%d]\n", + address, length, mmap_flag); + + switch (mmap_flag) { + case EFA_MMAP_IO_NC: + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length, + pgprot_noncached(vma->vm_page_prot)); + break; + case EFA_MMAP_IO_WC: + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length, + pgprot_writecombine(vma->vm_page_prot)); + break; + case EFA_MMAP_DMA_PAGE: + err = rdma_user_mmap_page(&ucontext->ibucontext, vma, + pfn_to_page(pfn), length); + break; + default: + err = -EINVAL; + } + + if (err) { + efa_err(&dev->ibdev.dev, + "Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n", + address, length, mmap_flag, err); + return err; + } + + return 0; +} + +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + u64 length = vma->vm_end - vma->vm_start; + u64 key = vma->vm_pgoff << PAGE_SHIFT; + struct efa_mmap_entry *entry; + + efa_dbg(&dev->ibdev.dev, + "start 0x%lx, end 0x%lx, length = 0x%llx, key = 0x%llx\n", + vma->vm_start, vma->vm_end, length, key); + + if (length % PAGE_SIZE != 0) { + efa_err(&dev->ibdev.dev, + "length[0x%llX] is not page size aligned[0x%lX]\n", + length, PAGE_SIZE); + return -EINVAL; + } + + entry = mmap_entry_get(dev, ucontext, key, length); + if (!entry) { + efa_err(&dev->ibdev.dev, + "key[0x%llX] does not have valid entry\n", key); + return -EINVAL; + } + + return __efa_mmap(dev, ucontext, vma, entry); +} + +static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) +{ + struct efa_com_destroy_ah_params params = { + .ah = ah->ah, + .pdn = to_epd(ah->ibah.pd)->pdn, + }; + + return efa_com_destroy_ah(dev->edev, ¶ms); +} + +struct ib_ah *efa_create_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + u32 flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = 
to_edev(ibpd->device); + struct efa_com_create_ah_params params = {}; + struct efa_ibv_create_ah_resp resp = {}; + struct efa_com_create_ah_result result; + struct efa_ah *ah; + int err; + + if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) { + efa_warn(&dev->ibdev.dev, + "Create address handle is not supported in atomic context\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (!udata) { + efa_err_rl(&dev->ibdev.dev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + efa_err_rl(&dev->ibdev.dev, + "Incompatiable ABI params\n"); + err = -EINVAL; + goto err_out; + } + + ah = kzalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) { + err = -ENOMEM; + goto err_out; + } + + memcpy(params.dest_addr, ah_attr->grh.dgid.raw, + sizeof(params.dest_addr)); + params.pdn = to_epd(ibpd)->pdn; + err = efa_com_create_ah(dev->edev, ¶ms, &result); + if (err) + goto err_free; + + memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id)); + ah->ah = result.ah; + + resp.efa_address_handle = result.ah; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + efa_err_rl(&dev->ibdev.dev, + "Failed to copy udata for create_ah response\n"); + goto err_destroy_ah; + } + } + efa_dbg(&dev->ibdev.dev, "Created ah[%d]\n", ah->ah); + + return &ah->ibah; + +err_destroy_ah: + efa_ah_destroy(dev, ah); +err_free: + kfree(ah); +err_out: + efa_stat_inc(dev, dev->stats.sw_stats.create_ah_err); + return ERR_PTR(err); +} + +int efa_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct efa_dev *dev = to_edev(ibah->pd->device); + struct efa_ah *ah = to_eah(ibah); + + efa_dbg(&dev->ibdev.dev, "Destroy ah[%d]\n", ah->ah); + + if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) { + efa_warn(&dev->ibdev.dev, + "Destroy address handle is not supported in atomic context\n"); + return -EOPNOTSUPP; + } + + efa_ah_destroy(dev, ah); + + kfree(ah); + return 0; +} + +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + u8 port_num) +{ + return IB_LINK_LAYER_UNSPECIFIED; +} +
Add a file that implements the EFA verbs.

Signed-off-by: Gal Pressman <galpress@amazon.com>
---
 drivers/infiniband/hw/efa/efa_verbs.c | 1891 +++++++++++++++++++++++++++++++++
 1 file changed, 1891 insertions(+)
 create mode 100644 drivers/infiniband/hw/efa/efa_verbs.c