diff mbox series

[for-rc,4/6] RDMA/efa: Use API to get contiguous memory blocks aligned to device supported page size

Message ID 20190528124618.77918-5-galpress@amazon.com (mailing list archive)
State Superseded
Headers show
Series EFA updates 2019-05-28 | expand

Commit Message

Gal Pressman May 28, 2019, 12:46 p.m. UTC
Use the ib_umem_find_best_pgsz() and rdma_for_each_block() API when
registering an MR instead of coding it in the driver.

ib_umem_find_best_pgsz() is used to find the best suitable page size
which replaces the existing efa_cont_pages() implementation.
rdma_for_each_block() is used to iterate the umem in aligned contiguous
memory blocks.

Cc: Shiraz Saleem <shiraz.saleem@intel.com>
Reviewed-by: Firas JahJah <firasj@amazon.com>
Signed-off-by: Gal Pressman <galpress@amazon.com>
---
 drivers/infiniband/hw/efa/efa_verbs.c | 81 +++++----------------------
 1 file changed, 14 insertions(+), 67 deletions(-)

Comments

Jason Gunthorpe May 29, 2019, 4:19 p.m. UTC | #1
On Tue, May 28, 2019 at 03:46:16PM +0300, Gal Pressman wrote:
> @@ -1500,13 +1443,17 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
>  	params.iova = virt_addr;
>  	params.mr_length_in_bytes = length;
>  	params.permissions = access_flags & 0x1;
> -	max_page_shift = fls64(dev->dev_attr.page_size_cap);
>  
> -	efa_cont_pages(mr->umem, start, max_page_shift, &npages,
> -		       &params.page_shift, &params.page_num);
> +	pg_sz = ib_umem_find_best_pgsz(mr->umem,
> +				       dev->dev_attr.page_size_cap,
> +				       virt_addr);

I think this needs to check pg_sz is not zero..

Jason
Shiraz Saleem May 29, 2019, 5:01 p.m. UTC | #2
> Subject: [PATCH for-rc 4/6] RDMA/efa: Use API to get contiguous memory blocks
> aligned to device supported page size
> 
> Use the ib_umem_find_best_pgsz() and rdma_for_each_block() API when
> registering an MR instead of coding it in the driver.
> 
> ib_umem_find_best_pgsz() is used to find the best suitable page size which
> replaces the existing efa_cont_pages() implementation.
> rdma_for_each_block() is used to iterate the umem in aligned contiguous memory
> blocks.
> 
> Cc: Shiraz Saleem <shiraz.saleem@intel.com>
> Reviewed-by: Firas JahJah <firasj@amazon.com>
> Signed-off-by: Gal Pressman <galpress@amazon.com>
> ---
>  drivers/infiniband/hw/efa/efa_verbs.c | 81 +++++----------------------
>  1 file changed, 14 insertions(+), 67 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c
> b/drivers/infiniband/hw/efa/efa_verbs.c
> index 0640c2435f67..c1246c39f234 100644
> --- a/drivers/infiniband/hw/efa/efa_verbs.c
> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
> @@ -1054,21 +1054,15 @@ static int umem_to_page_list(struct efa_dev *dev,
>  			     u8 hp_shift)
>  {
>  	u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
> -	struct sg_dma_page_iter sg_iter;
> -	unsigned int page_idx = 0;
> +	struct ib_block_iter biter;
>  	unsigned int hp_idx = 0;
> 
>  	ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
>  		  hp_cnt, pages_in_hp);
> 
> -	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0)
> {
> -		if (page_idx % pages_in_hp == 0) {
> -			page_list[hp_idx] = sg_page_iter_dma_address(&sg_iter);
> -			hp_idx++;
> -		}
> -
> -		page_idx++;
> -	}
> +	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
> +			    BIT(hp_shift))
> +		page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
> 
>  	return 0;
>  }
> @@ -1402,56 +1396,6 @@ static int efa_create_pbl(struct efa_dev *dev,
>  	return 0;
>  }
> 
> -static void efa_cont_pages(struct ib_umem *umem, u64 addr,
> -			   unsigned long max_page_shift,
> -			   int *count, u8 *shift, u32 *ncont)
> -{
> -	struct scatterlist *sg;
> -	u64 base = ~0, p = 0;
> -	unsigned long tmp;
> -	unsigned long m;
> -	u64 len, pfn;
> -	int i = 0;
> -	int entry;
> -
> -	addr = addr >> PAGE_SHIFT;
> -	tmp = (unsigned long)addr;
> -	m = find_first_bit(&tmp, BITS_PER_LONG);
> -	if (max_page_shift)
> -		m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
> -
> -	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
> -		len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE);
> -		pfn = sg_dma_address(sg) >> PAGE_SHIFT;
> -		if (base + p != pfn) {
> -			/*
> -			 * If either the offset or the new
> -			 * base are unaligned update m
> -			 */
> -			tmp = (unsigned long)(pfn | p);
> -			if (!IS_ALIGNED(tmp, 1 << m))
> -				m = find_first_bit(&tmp, BITS_PER_LONG);
> -
> -			base = pfn;
> -			p = 0;
> -		}
> -
> -		p += len;
> -		i += len;
> -	}
> -
> -	if (i) {
> -		m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
> -		*ncont = DIV_ROUND_UP(i, (1 << m));
> -	} else {
> -		m = 0;
> -		*ncont = 0;
> -	}
> -
> -	*shift = PAGE_SHIFT + m;
> -	*count = i;
> -}
> -

Leon - perhaps mlx5_ib_cont_pages() can also be replaced with the new core helper?

Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com>
Shiraz Saleem May 29, 2019, 5:20 p.m. UTC | #3
> Subject: Re: [PATCH for-rc 4/6] RDMA/efa: Use API to get contiguous memory
> blocks aligned to device supported page size
> 
> On Tue, May 28, 2019 at 03:46:16PM +0300, Gal Pressman wrote:
> > @@ -1500,13 +1443,17 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64
> start, u64 length,
> >  	params.iova = virt_addr;
> >  	params.mr_length_in_bytes = length;
> >  	params.permissions = access_flags & 0x1;
> > -	max_page_shift = fls64(dev->dev_attr.page_size_cap);
> >
> > -	efa_cont_pages(mr->umem, start, max_page_shift, &npages,
> > -		       &params.page_shift, &params.page_num);
> > +	pg_sz = ib_umem_find_best_pgsz(mr->umem,
> > +				       dev->dev_attr.page_size_cap,
> > +				       virt_addr);
> 
> I think this needs to check pg_sz is not zero..
> 

What is the smallest page size this driver supports?
Gal Pressman May 29, 2019, 7:35 p.m. UTC | #4
On 29/05/2019 20:20, Saleem, Shiraz wrote:
>> Subject: Re: [PATCH for-rc 4/6] RDMA/efa: Use API to get contiguous memory
>> blocks aligned to device supported page size
>>
>> On Tue, May 28, 2019 at 03:46:16PM +0300, Gal Pressman wrote:
>>> @@ -1500,13 +1443,17 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64
>> start, u64 length,
>>>  	params.iova = virt_addr;
>>>  	params.mr_length_in_bytes = length;
>>>  	params.permissions = access_flags & 0x1;
>>> -	max_page_shift = fls64(dev->dev_attr.page_size_cap);
>>>
>>> -	efa_cont_pages(mr->umem, start, max_page_shift, &npages,
>>> -		       &params.page_shift, &params.page_num);
>>> +	pg_sz = ib_umem_find_best_pgsz(mr->umem,
>>> +				       dev->dev_attr.page_size_cap,
>>> +				       virt_addr);
>>
>> I think this needs to check pg_sz is not zero..
>>
> 
> What is the smallest page size this driver supports?

The page size capability is queried from the device, the smallest page size is
currently 4k.

Isn't PAGE_SIZE always supported? What did drivers do before
ib_umem_find_best_pgsz() existed in case they didn't support PAGE_SIZE?

I doubt there is/will be a real use-case where it matters for EFA, but I can add
the check to be on the safe side.
Jason Gunthorpe May 30, 2019, 6:33 p.m. UTC | #5
On Wed, May 29, 2019 at 10:35:58PM +0300, Gal Pressman wrote:
> On 29/05/2019 20:20, Saleem, Shiraz wrote:
> >> Subject: Re: [PATCH for-rc 4/6] RDMA/efa: Use API to get contiguous memory
> >> blocks aligned to device supported page size
> >>
> >> On Tue, May 28, 2019 at 03:46:16PM +0300, Gal Pressman wrote:
> >>> @@ -1500,13 +1443,17 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64
> >> start, u64 length,
> >>>  	params.iova = virt_addr;
> >>>  	params.mr_length_in_bytes = length;
> >>>  	params.permissions = access_flags & 0x1;
> >>> -	max_page_shift = fls64(dev->dev_attr.page_size_cap);
> >>>
> >>> -	efa_cont_pages(mr->umem, start, max_page_shift, &npages,
> >>> -		       &params.page_shift, &params.page_num);
> >>> +	pg_sz = ib_umem_find_best_pgsz(mr->umem,
> >>> +				       dev->dev_attr.page_size_cap,
> >>> +				       virt_addr);
> >>
> >> I think this needs to check pg_sz is not zero..
> >>
> > 
> > What is the smallest page size this driver supports?
> 
> The page size capability is queried from the device, the smallest page size is
> currently 4k.
> 
> Isn't PAGE_SIZE always supported? 

No, PAGE_SIZE is only supported if the device supports PAGE_SIZE or
smaller blocks.

> What did drivers do before ib_umem_find_best_pgsz() existed in case
> they didn't support PAGE_SIZE?

Most malfunctioned.

> I doubt there is/will be a real use-case where it matters for EFA,
> but I can add the check to be on the safe side.

Please, I want the reference implementations of this API to be correct
for reference by others.

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 0640c2435f67..c1246c39f234 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1054,21 +1054,15 @@  static int umem_to_page_list(struct efa_dev *dev,
 			     u8 hp_shift)
 {
 	u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
-	struct sg_dma_page_iter sg_iter;
-	unsigned int page_idx = 0;
+	struct ib_block_iter biter;
 	unsigned int hp_idx = 0;
 
 	ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
 		  hp_cnt, pages_in_hp);
 
-	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
-		if (page_idx % pages_in_hp == 0) {
-			page_list[hp_idx] = sg_page_iter_dma_address(&sg_iter);
-			hp_idx++;
-		}
-
-		page_idx++;
-	}
+	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
+			    BIT(hp_shift))
+		page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
 
 	return 0;
 }
@@ -1402,56 +1396,6 @@  static int efa_create_pbl(struct efa_dev *dev,
 	return 0;
 }
 
-static void efa_cont_pages(struct ib_umem *umem, u64 addr,
-			   unsigned long max_page_shift,
-			   int *count, u8 *shift, u32 *ncont)
-{
-	struct scatterlist *sg;
-	u64 base = ~0, p = 0;
-	unsigned long tmp;
-	unsigned long m;
-	u64 len, pfn;
-	int i = 0;
-	int entry;
-
-	addr = addr >> PAGE_SHIFT;
-	tmp = (unsigned long)addr;
-	m = find_first_bit(&tmp, BITS_PER_LONG);
-	if (max_page_shift)
-		m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
-
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE);
-		pfn = sg_dma_address(sg) >> PAGE_SHIFT;
-		if (base + p != pfn) {
-			/*
-			 * If either the offset or the new
-			 * base are unaligned update m
-			 */
-			tmp = (unsigned long)(pfn | p);
-			if (!IS_ALIGNED(tmp, 1 << m))
-				m = find_first_bit(&tmp, BITS_PER_LONG);
-
-			base = pfn;
-			p = 0;
-		}
-
-		p += len;
-		i += len;
-	}
-
-	if (i) {
-		m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
-		*ncont = DIV_ROUND_UP(i, (1 << m));
-	} else {
-		m = 0;
-		*ncont = 0;
-	}
-
-	*shift = PAGE_SHIFT + m;
-	*count = i;
-}
-
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 			 u64 virt_addr, int access_flags,
 			 struct ib_udata *udata)
@@ -1459,11 +1403,10 @@  struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	struct efa_dev *dev = to_edev(ibpd->device);
 	struct efa_com_reg_mr_params params = {};
 	struct efa_com_reg_mr_result result = {};
-	unsigned long max_page_shift;
 	struct pbl_context pbl;
+	unsigned int pg_sz;
 	struct efa_mr *mr;
 	int inline_size;
-	int npages;
 	int err;
 
 	if (udata->inlen &&
@@ -1500,13 +1443,17 @@  struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	params.iova = virt_addr;
 	params.mr_length_in_bytes = length;
 	params.permissions = access_flags & 0x1;
-	max_page_shift = fls64(dev->dev_attr.page_size_cap);
 
-	efa_cont_pages(mr->umem, start, max_page_shift, &npages,
-		       &params.page_shift, &params.page_num);
+	pg_sz = ib_umem_find_best_pgsz(mr->umem,
+				       dev->dev_attr.page_size_cap,
+				       virt_addr);
+	params.page_shift = __ffs(pg_sz);
+	params.page_num = DIV_ROUND_UP(length + (start & (pg_sz - 1)),
+				       pg_sz);
+
 	ibdev_dbg(&dev->ibdev,
-		  "start %#llx length %#llx npages %d params.page_shift %u params.page_num %u\n",
-		  start, length, npages, params.page_shift, params.page_num);
+		  "start %#llx length %#llx params.page_shift %u params.page_num %u\n",
+		  start, length, params.page_shift, params.page_num);
 
 	inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array);
 	if (params.page_num <= inline_size) {