
[rdma-next,v1,4/6] RDMA/i40iw: Use umem API to retrieve aligned DMA address

Message ID: 20190219145745.13476-5-shiraz.saleem@intel.com
State: Changes Requested
Delegated to: Jason Gunthorpe
Series: Add APIs to get contiguous memory blocks aligned to a HW supported page size

Commit Message

Shiraz Saleem Feb. 19, 2019, 2:57 p.m. UTC
Call the core helpers to retrieve the HW-aligned address to use
for the MR, within a supported i40iw page size.

Remove the i40iw code that determines when an MR is backed by 2M huge
pages, which involves checking the umem->hugetlb flag and inspecting
the VMA. The core helpers will return the 2M-aligned address if the
MR is backed by 2M pages.

Fixes: f26c7c83395b ("i40iw: Add 2MB page support")
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/i40iw/i40iw_user.h  |  5 ++++
 drivers/infiniband/hw/i40iw/i40iw_verbs.c | 49 ++++++-------------------------
 drivers/infiniband/hw/i40iw/i40iw_verbs.h |  3 +-
 3 files changed, 15 insertions(+), 42 deletions(-)
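
In code terms, the page-size selection this change moves i40iw to (taken
from the diff further below; ib_umem_find_single_pg_size() is the helper
proposed earlier in this series, not an existing upstream API) is roughly:

	/* Default to 4K; for regular MRs ask the core for the largest
	 * i40iw-supported page size that fits the whole umem. */
	iwmr->page_size = I40IW_PAGE_SZ_4K;
	if (req.reg_type == IW_MEMREG_TYPE_MEM)
		iwmr->page_size = ib_umem_find_single_pg_size(region,
					I40IW_PAGE_SZ_4K | I40IW_PAGE_SZ_2M,
					virt);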

Comments

Jason Gunthorpe Feb. 20, 2019, 4:07 a.m. UTC | #1
On Tue, Feb 19, 2019 at 08:57:43AM -0600, Shiraz Saleem wrote:
> Call the core helpers to retrieve the HW aligned address to use
> for the MR, within a supported i40iw page size.
> 
> Remove code in i40iw to determine when MR is backed by 2M huge pages
> which involves checking the umem->hugetlb flag and VMA inspection.
> The core helpers will return the 2M aligned address if the
> MR is backed by 2M pages.
> 
> Fixes: f26c7c83395b ("i40iw: Add 2MB page support")
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/hw/i40iw/i40iw_user.h  |  5 ++++
>  drivers/infiniband/hw/i40iw/i40iw_verbs.c | 49 ++++++-------------------------
>  drivers/infiniband/hw/i40iw/i40iw_verbs.h |  3 +-
>  3 files changed, 15 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h
> index b125925..09fdcee 100644
> +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h
> @@ -80,6 +80,11 @@ enum i40iw_device_capabilities_const {
>  	I40IW_MAX_PDS = 			32768
>  };
>  
> +enum i40iw_supported_page_size {
> +	I40IW_PAGE_SZ_4K = 0x00001000,
> +	I40IW_PAGE_SZ_2M = 0x00200000
> +};

This seems a bit strange..

> -	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) {
> -		pg_addr = sg_page_iter_dma_address(&sg_iter);
> -		if (first_pg)
> -			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
> -		else if (!(pg_addr & ~iwmr->page_msk))
> -			*pbl = cpu_to_le64(pg_addr);
> -		else
> -			continue;
> -
> -		first_pg = false;
> +	for (ib_umem_start_phys_iter(region, &sg_phys_iter,
> -		iwmr->page_size);

Maybe this should be:

for_each_sg_dma_page_sz (region->sg_head.sgl, &sg_iter, region->nmap,
                         iwmr->page_size) 

?

Is there a reason to move away from the API we built here?

Jason
Shiraz Saleem Feb. 23, 2019, 7:26 p.m. UTC | #2
On Tue, Feb 19, 2019 at 09:07:29PM -0700, Jason Gunthorpe wrote:
> On Tue, Feb 19, 2019 at 08:57:43AM -0600, Shiraz Saleem wrote:
> > Call the core helpers to retrieve the HW aligned address to use
> > for the MR, within a supported i40iw page size.
> > 
> > Remove code in i40iw to determine when MR is backed by 2M huge pages
> > which involves checking the umem->hugetlb flag and VMA inspection.
> > The core helpers will return the 2M aligned address if the
> > MR is backed by 2M pages.
> > 
> 
> > -	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) {
> > -		pg_addr = sg_page_iter_dma_address(&sg_iter);
> > -		if (first_pg)
> > -			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
> > -		else if (!(pg_addr & ~iwmr->page_msk))
> > -			*pbl = cpu_to_le64(pg_addr);
> > -		else
> > -			continue;
> > -
> > -		first_pg = false;
> > +	for (ib_umem_start_phys_iter(region, &sg_phys_iter,
> > -		iwmr->page_size);
> 
> Maybe this should be:
> 
> for_each_sg_dma_page_sz (region->sg_head.sgl, &sg_iter, region->nmap,
>                          iwmr->page_size) 
> 
> ?
> 
> Is there a reason to move away from the API we built here?
> 
Yes. It's a different iterator type we need here: it walks the SG list
in contiguous aligned memory blocks as opposed to PAGE_SIZE increments.

Shiraz
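
A minimal sketch of the difference being described, condensed from the
hunk quoted above (the ib_umem_*_phys_iter() helpers are the ones
proposed in this series, not an existing core API):

	/* Old: visit every PAGE_SIZE DMA page and filter in the driver,
	 * keeping only addresses aligned to the MR page mask. */
	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) {
		pg_addr = sg_page_iter_dma_address(&sg_iter);
		/* ... skip pg_addr values not aligned to iwmr->page_msk ... */
	}

	/* New: the iterator hands back one address per aligned block of
	 * iwmr->page_size (e.g. one per 2M page), no driver-side masking. */
	for (ib_umem_start_phys_iter(region, &sg_phys_iter, iwmr->page_size);
	     ib_umem_next_phys_iter(region, &sg_phys_iter);) {
		*pbl = cpu_to_le64(sg_phys_iter.phyaddr);
		pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
	}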
Jason Gunthorpe Feb. 25, 2019, 6:38 p.m. UTC | #3
On Sat, Feb 23, 2019 at 01:26:41PM -0600, Shiraz Saleem wrote:
> On Tue, Feb 19, 2019 at 09:07:29PM -0700, Jason Gunthorpe wrote:
> > On Tue, Feb 19, 2019 at 08:57:43AM -0600, Shiraz Saleem wrote:
> > > Call the core helpers to retrieve the HW aligned address to use
> > > for the MR, within a supported i40iw page size.
> > > 
> > > Remove code in i40iw to determine when MR is backed by 2M huge pages
> > > which involves checking the umem->hugetlb flag and VMA inspection.
> > > The core helpers will return the 2M aligned address if the
> > > MR is backed by 2M pages.
> > > 
> > 
> > > -	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) {
> > > -		pg_addr = sg_page_iter_dma_address(&sg_iter);
> > > -		if (first_pg)
> > > -			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
> > > -		else if (!(pg_addr & ~iwmr->page_msk))
> > > -			*pbl = cpu_to_le64(pg_addr);
> > > -		else
> > > -			continue;
> > > -
> > > -		first_pg = false;
> > > +	for (ib_umem_start_phys_iter(region, &sg_phys_iter,
> > > -		iwmr->page_size);
> > 
> > Maybe this should be:
> > 
> > for_each_sg_dma_page_sz (region->sg_head.sgl, &sg_iter, region->nmap,
> >                          iwmr->page_size) 
> > 
> > ?
> > 
> > Is there a reason to move away from the API we built here?
> > 
> Yes. Its a different iterator type we need to use here and
> iterate the SG list in contiguous aligned memory blocks as
> opposed to PAGE_SIZE increments.

I mean, why not add an option to the core api to do something other
than PAGE_SIZE increments?

Jason
Shiraz Saleem Feb. 28, 2019, 9:42 p.m. UTC | #4
>Subject: Re: [PATCH rdma-next v1 4/6] RDMA/i40iw: Use umem API to retrieve
>aligned DMA address
>
>On Sat, Feb 23, 2019 at 01:26:41PM -0600, Shiraz Saleem wrote:
>> On Tue, Feb 19, 2019 at 09:07:29PM -0700, Jason Gunthorpe wrote:
>> > On Tue, Feb 19, 2019 at 08:57:43AM -0600, Shiraz Saleem wrote:
>> > > Call the core helpers to retrieve the HW aligned address to use
>> > > for the MR, within a supported i40iw page size.
>> > >
>> > > Remove code in i40iw to determine when MR is backed by 2M huge
>> > > pages which involves checking the umem->hugetlb flag and VMA
>inspection.
>> > > The core helpers will return the 2M aligned address if the MR is
>> > > backed by 2M pages.
>> > >
>> >
>> > > -	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap,
>0) {
>> > > -		pg_addr = sg_page_iter_dma_address(&sg_iter);
>> > > -		if (first_pg)
>> > > -			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
>> > > -		else if (!(pg_addr & ~iwmr->page_msk))
>> > > -			*pbl = cpu_to_le64(pg_addr);
>> > > -		else
>> > > -			continue;
>> > > -
>> > > -		first_pg = false;
>> > > +	for (ib_umem_start_phys_iter(region, &sg_phys_iter,
>> > > -		iwmr->page_size);
>> >
>> > Maybe this should be:
>> >
>> > for_each_sg_dma_page_sz (region->sg_head.sgl, &sg_iter, region->nmap,
>> >                          iwmr->page_size)
>> >
>> > ?
>> >
>> > Is there a reason to move away from the API we built here?
>> >
>> Yes. Its a different iterator type we need to use here and iterate the
>> SG list in contiguous aligned memory blocks as opposed to PAGE_SIZE
>> increments.
>
>I mean, why not add an option to the core api to do something other than
>PAGE_SIZE increments?
>

Not sure it's trivial, or that it's generic enough yet. The iterator
ib_umem_next_phys_iter() relies on the best supported HW size being
passed in to derive the pg_bit used when iterating the SG list.

And the best supported HW size is obtained via another umem API,
ib_umem_find_single_pg_size(), which is specific to RDMA MRs and walks
the SG list.

As we discussed here,
https://patchwork.kernel.org/patch/10499753/#22186847

we also ignore the alignment of the start of the first SGE and the end
of the last SGE, which I thought was specific to RDMA HW?

But it may be worth considering as future work to extend this into a
scatterlist API.

Shiraz
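
For context, the pg_bit in question is just the log2 of the chosen page
size; in the i40iw diff below it is derived as:

	pg_shift = ffs(iwmr->page_size) - 1;	/* 12 for 4K, 21 for 2M */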
Jason Gunthorpe March 5, 2019, 1:11 a.m. UTC | #5
On Thu, Feb 28, 2019 at 09:42:30PM +0000, Saleem, Shiraz wrote:
> >Subject: Re: [PATCH rdma-next v1 4/6] RDMA/i40iw: Use umem API to retrieve
> >aligned DMA address
> >
> >On Sat, Feb 23, 2019 at 01:26:41PM -0600, Shiraz Saleem wrote:
> >> On Tue, Feb 19, 2019 at 09:07:29PM -0700, Jason Gunthorpe wrote:
> >> > On Tue, Feb 19, 2019 at 08:57:43AM -0600, Shiraz Saleem wrote:
> >> > > Call the core helpers to retrieve the HW aligned address to use
> >> > > for the MR, within a supported i40iw page size.
> >> > >
> >> > > Remove code in i40iw to determine when MR is backed by 2M huge
> >> > > pages which involves checking the umem->hugetlb flag and VMA
> >inspection.
> >> > > The core helpers will return the 2M aligned address if the MR is
> >> > > backed by 2M pages.
> >> > >
> >> >
> >> > > -	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap,
> >0) {
> >> > > -		pg_addr = sg_page_iter_dma_address(&sg_iter);
> >> > > -		if (first_pg)
> >> > > -			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
> >> > > -		else if (!(pg_addr & ~iwmr->page_msk))
> >> > > -			*pbl = cpu_to_le64(pg_addr);
> >> > > -		else
> >> > > -			continue;
> >> > > -
> >> > > -		first_pg = false;
> >> > > +	for (ib_umem_start_phys_iter(region, &sg_phys_iter,
> >> > > -		iwmr->page_size);
> >> >
> >> > Maybe this should be:
> >> >
> >> > for_each_sg_dma_page_sz (region->sg_head.sgl, &sg_iter, region->nmap,
> >> >                          iwmr->page_size)
> >> >
> >> > ?
> >> >
> >> > Is there a reason to move away from the API we built here?
> >> >
> >> Yes. Its a different iterator type we need to use here and iterate the
> >> SG list in contiguous aligned memory blocks as opposed to PAGE_SIZE
> >> increments.
> >
> >I mean, why not add an option to the core api to do something other than
> >PAGE_SIZE increments?
> >
> 
> Not sure it's trivial and if it's that generic yet. The iterator ib_umem_next_phys_iter()
> is reliant on best supported HW size passed in to get the pg_bit to use when
> we iterate the SG list. 
> 
> And the best supported HW size is got via another umem  API
> ib_umem_find_single_pg_size() specific to RDMA MRs which runs the SG list.

Yes.. but once we have this block size computed with RDMA-specific
code, the iteration seems fairly generic.
 
> We also ignore alignment of start of first sgl and end of last sgl which I thought
> was specific to RDMA HW?

It is basically the same as what sg_page_iter_dma already does - that
iterator rounds everything down/up to a page boundary.

If someone doesn't want that rounding then they need to use a
different iterator..

> But maybe worth considering though for future work to extend as a scatterlist API. 

Can we at least follow the same basic design of having an
rdma_for_each_block() iterator helper loop, like for_each_sg_dma_page,
etc?

Jason
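
For illustration, the helper-loop shape being suggested here, by analogy
with for_each_sg_dma_page; the macro and accessor names below are
hypothetical at this point in the thread:

	struct ib_block_iter biter;	/* hypothetical iterator state */

	rdma_for_each_block (region->sg_head.sgl, &biter, region->nmap,
			     iwmr->page_size) {
		*pbl = cpu_to_le64(rdma_block_iter_dma_address(&biter));
		pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
	}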
Shiraz Saleem March 12, 2019, 8:23 p.m. UTC | #6
>Subject: Re: [PATCH rdma-next v1 4/6] RDMA/i40iw: Use umem API to retrieve
>aligned DMA address
>
>On Thu, Feb 28, 2019 at 09:42:30PM +0000, Saleem, Shiraz wrote:
>> >Subject: Re: [PATCH rdma-next v1 4/6] RDMA/i40iw: Use umem API to
>> >retrieve aligned DMA address
>> >
>> >On Sat, Feb 23, 2019 at 01:26:41PM -0600, Shiraz Saleem wrote:
>> >> On Tue, Feb 19, 2019 at 09:07:29PM -0700, Jason Gunthorpe wrote:
>> >> > On Tue, Feb 19, 2019 at 08:57:43AM -0600, Shiraz Saleem wrote:
>> >> > > Call the core helpers to retrieve the HW aligned address to use
>> >> > > for the MR, within a supported i40iw page size.
>> >> > >
>> >> > > Remove code in i40iw to determine when MR is backed by 2M huge
>> >> > > pages which involves checking the umem->hugetlb flag and VMA
>> >inspection.
>> >> > > The core helpers will return the 2M aligned address if the MR
>> >> > > is backed by 2M pages.
>> >> > >
>> >> >
>> >> > > -	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region-
>>nmap,
>> >0) {
>> >> > > -		pg_addr = sg_page_iter_dma_address(&sg_iter);
>> >> > > -		if (first_pg)
>> >> > > -			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
>> >> > > -		else if (!(pg_addr & ~iwmr->page_msk))
>> >> > > -			*pbl = cpu_to_le64(pg_addr);
>> >> > > -		else
>> >> > > -			continue;
>> >> > > -
>> >> > > -		first_pg = false;
>> >> > > +	for (ib_umem_start_phys_iter(region, &sg_phys_iter,
>> >> > > -		iwmr->page_size);
>> >> >
>> >> > Maybe this should be:
>> >> >
>> >> > for_each_sg_dma_page_sz (region->sg_head.sgl, &sg_iter, region-
>>nmap,
>> >> >                          iwmr->page_size)
>> >> >
>> >> > ?
>> >> >
>> >> > Is there a reason to move away from the API we built here?
>> >> >
>> >> Yes. Its a different iterator type we need to use here and iterate
>> >> the SG list in contiguous aligned memory blocks as opposed to
>> >> PAGE_SIZE increments.
>> >
>> >I mean, why not add an option to the core api to do something other
>> >than PAGE_SIZE increments?
>> >
>>
>> Not sure it's trivial and if it's that generic yet. The iterator
>> ib_umem_next_phys_iter() is reliant on best supported HW size passed
>> in to get the pg_bit to use when we iterate the SG list.
>>
>> And the best supported HW size is got via another umem  API
>> ib_umem_find_single_pg_size() specific to RDMA MRs which runs the SG list.
>
>Yes.. but once we have this block size computed with rdma specific code the
>iteration seems fairly generic..
>
>> We also ignore alignment of start of first sgl and end of last sgl
>> which I thought was specific to RDMA HW?
>
>It is basically the same as what sg_page_iter_dma already does - that iterator
>rounds everything down/up to a page boundary.
>
>If someone doesn't want that rounding then they need to use a different iterator..
>
>> But maybe worth considering though for future work to extend as a scatterlist
>API.
>
>Can we at least follow the same basic design of having a rdma_for_each_block ()
>iterator helper loop like for_each_sg_dma_page, etc?
>
Sure. I will explore it and see if we can generalize it. A look at
for_each_sg_dma_page suggests it is feasible. I'll need to work out the
details and do some further testing.

I am considering breaking up this series further. The page-combining
portion (i.e. patch #1) can be sent independently, and the new API
(with the revised design) to get aligned memory blocks can be a
follow-on.

Also, Selvin found a problem while testing the series. Some drivers
still use umem->nmap to size the buffers that store the page DMA
addresses. With page combining, this can cause out-of-bounds accesses
when the SGEs are unfolded using for_each_sg_dma_page(). We need to
size them based on umem->npages, and this needs to be a precursor to
the page-combining patch. I'll send that out too.

Shiraz
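
A minimal sketch of the sizing problem being reported, with illustrative
driver code (not any specific driver):

	/* umem->nmap counts SG entries; with page combining one entry can
	 * now cover many PAGE_SIZE pages, so this buffer is too small. */
	dma_addrs = kcalloc(umem->nmap, sizeof(*dma_addrs), GFP_KERNEL);

	i = 0;
	for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0)
		dma_addrs[i++] = sg_page_iter_dma_address(&sg_iter); /* can overrun */

	/* Sizing by umem->npages (one slot per PAGE_SIZE page) avoids the
	 * overrun; that is what the precursor patch would switch to. */
	dma_addrs = kcalloc(umem->npages, sizeof(*dma_addrs), GFP_KERNEL);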
Jason Gunthorpe March 13, 2019, 6:04 p.m. UTC | #7
On Tue, Mar 12, 2019 at 08:23:04PM +0000, Saleem, Shiraz wrote:

> Also Selvin found a problem while testing the series. Some drivers
> still use umem->nmap to size their buffers to store the page dma
> address. With page combining, this can cause out of bound accesses
> when the SGEs are unfolded using for_each_sg_dma_page We need to
> size them based off umem->npages and this needs to be a precursor to
> the page combining patch. I ll send that out too.

Yikes, yes please

Jason

Patch

diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h
index b125925..09fdcee 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_user.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_user.h
@@ -80,6 +80,11 @@ enum i40iw_device_capabilities_const {
 	I40IW_MAX_PDS = 			32768
 };
 
+enum i40iw_supported_page_size {
+	I40IW_PAGE_SZ_4K = 0x00001000,
+	I40IW_PAGE_SZ_2M = 0x00200000
+};
+
 #define i40iw_handle void *
 #define i40iw_adapter_handle i40iw_handle
 #define i40iw_qp_handle i40iw_handle
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index d5fb2b9..5c678d2 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -1362,53 +1362,22 @@ static void i40iw_copy_user_pgaddrs(struct i40iw_mr *iwmr,
 	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
 	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
 	struct i40iw_pble_info *pinfo;
-	struct sg_dma_page_iter sg_iter;
-	u64 pg_addr = 0;
+	struct sg_phys_iter sg_phys_iter;
 	u32 idx = 0;
-	bool first_pg = true;
 
 	pinfo = (level == I40IW_LEVEL_1) ? NULL : palloc->level2.leaf;
 
 	if (iwmr->type == IW_MEMREG_TYPE_QP)
 		iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl);
 
-	for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) {
-		pg_addr = sg_page_iter_dma_address(&sg_iter);
-		if (first_pg)
-			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
-		else if (!(pg_addr & ~iwmr->page_msk))
-			*pbl = cpu_to_le64(pg_addr);
-		else
-			continue;
-
-		first_pg = false;
+	for (ib_umem_start_phys_iter(region, &sg_phys_iter, iwmr->page_size);
+	     ib_umem_next_phys_iter(region, &sg_phys_iter);) {
+		*pbl = cpu_to_le64(sg_phys_iter.phyaddr);
 		pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
 	}
 }
 
 /**
- * i40iw_set_hugetlb_params - set MR pg size and mask to huge pg values.
- * @addr: virtual address
- * @iwmr: mr pointer for this memory registration
- */
-static void i40iw_set_hugetlb_values(u64 addr, struct i40iw_mr *iwmr)
-{
-	struct vm_area_struct *vma;
-	struct hstate *h;
-
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, addr);
-	if (vma && is_vm_hugetlb_page(vma)) {
-		h = hstate_vma(vma);
-		if (huge_page_size(h) == 0x200000) {
-			iwmr->page_size = huge_page_size(h);
-			iwmr->page_msk = huge_page_mask(h);
-		}
-	}
-	up_read(&current->mm->mmap_sem);
-}
-
-/**
  * i40iw_check_mem_contiguous - check if pbls stored in arr are contiguous
  * @arr: lvl1 pbl array
  * @npages: page count
@@ -1862,11 +1831,11 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
 	iwmr->ibmr.device = pd->device;
 	ucontext = to_ucontext(pd->uobject->context);
 
-	iwmr->page_size = PAGE_SIZE;
-	iwmr->page_msk = PAGE_MASK;
-
-	if (region->hugetlb && (req.reg_type == IW_MEMREG_TYPE_MEM))
-		i40iw_set_hugetlb_values(start, iwmr);
+	iwmr->page_size = I40IW_PAGE_SZ_4K;
+	if (req.reg_type == IW_MEMREG_TYPE_MEM)
+		iwmr->page_size = ib_umem_find_single_pg_size(region,
+					I40IW_PAGE_SZ_4K | I40IW_PAGE_SZ_2M,
+					virt);
 
 	region_length = region->length + (start & (iwmr->page_size - 1));
 	pg_shift = ffs(iwmr->page_size) - 1;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
index 76cf173..3a41375 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
@@ -94,8 +94,7 @@ struct i40iw_mr {
 	struct ib_umem *region;
 	u16 type;
 	u32 page_cnt;
-	u32 page_size;
-	u64 page_msk;
+	u64 page_size;
 	u32 npages;
 	u32 stag;
 	u64 length;