[RFC,1/4] RDMA/umem: Minimize SG table entries

Message ID 20181019233409.1104-2-shiraz.saleem@intel.com (mailing list archive)
State Superseded
Series Introduce APIs to get DMA addresses aligned to a HW supported page size

Commit Message

Shiraz Saleem Oct. 19, 2018, 11:34 p.m. UTC
Squash contiguous regions of PAGE_SIZE pages
into a single SG entry as opposed to one
SG entry per page. This reduces the SG table
size and is friendliest to the IOMMU.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 35 deletions(-)

Comments

Jason Gunthorpe Oct. 22, 2018, 9:06 p.m. UTC | #1
On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> Squash contiguous regions of PAGE_SIZE pages
> into a single SG entry as opposed to one
> SG entry per page. This reduces the SG table
> size and is friendliest to the IOMMU.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
>  1 file changed, 31 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index c6144df..486d6d7 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -39,6 +39,7 @@
>  #include <linux/export.h>
>  #include <linux/hugetlb.h>
>  #include <linux/slab.h>
> +#include <linux/pagemap.h>
>  #include <rdma/ib_umem_odp.h>
>  
>  #include "uverbs.h"
> @@ -46,18 +47,16 @@
>  
>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>  {
> -	struct scatterlist *sg;
> +	struct sg_page_iter sg_iter;
>  	struct page *page;
> -	int i;
>  
>  	if (umem->nmap > 0)
>  		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
> -				umem->npages,
> +				umem->sg_head.orig_nents,
>  				DMA_BIDIRECTIONAL);
>  
> -	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
> -
> -		page = sg_page(sg);
> +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_head.orig_nents, 0) {
> +		page = sg_page_iter_page(&sg_iter);
>  		if (!PageDirty(page) && umem->writable && dirty)
>  			set_page_dirty_lock(page);
>  		put_page(page);
> @@ -92,7 +91,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  	int ret;
>  	int i;
>  	unsigned long dma_attrs = 0;
> -	struct scatterlist *sg, *sg_list_start;
>  	unsigned int gup_flags = FOLL_WRITE;
>  
>  	if (dmasync)
> @@ -138,7 +136,13 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  	/* We assume the memory is from hugetlb until proved otherwise */
>  	umem->hugetlb   = 1;
>  
> -	page_list = (struct page **) __get_free_page(GFP_KERNEL);
> +	npages = ib_umem_num_pages(umem);
> +	if (npages == 0 || npages > UINT_MAX) {
> +		ret = -EINVAL;
> +		goto umem_kfree;
> +	}
> +
> +	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);

Hurm.. The reason it was __get_free_page and a loop was to avoid a
high order allocation triggered by userspace.

ie if I want to make an MR over a 2G region this requires a 4M
allocation of page pointers which is too high to do via
kmalloc_array().

So I think this can't work, the page list has to be restricted in size
and we need to iterate...
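
(For reference: a 2 GiB region is 2 GiB / 4 KiB = 524,288 pages, and at
8 bytes per page pointer that comes to 4 MiB, i.e. an order-10
allocation.)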

>  	while (npages) {
>  		down_read(&mm->mmap_sem);
>  		ret = get_user_pages_longterm(cur_base,
>  				     min_t(unsigned long, npages,
>  					   PAGE_SIZE / sizeof (struct page *)),

Why keep the min? This was just to match the size of the array..
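
(PAGE_SIZE / sizeof(struct page *) is 4096 / 8 = 512 on a 64-bit build,
i.e. each get_user_pages_longterm() call was capped at 512 pages to fit
the old one-page array; with a full-size page_list there is no array
bound left to match.)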

> -				     gup_flags, page_list, vma_list);
> +				     gup_flags, page_list + umem->npages, vma_list);
>  		if (ret < 0) {
>  			up_read(&mm->mmap_sem);
> -			goto umem_release;
> +			release_pages(page_list, umem->npages);
> +			goto vma;
>  		}
>  
>  		umem->npages += ret;
>  		cur_base += ret * PAGE_SIZE;
>  		npages   -= ret;
>  
> -		/* Continue to hold the mmap_sem as vma_list access
> -		 * needs to be protected.
> -		 */
> -		for_each_sg(sg_list_start, sg, ret, i) {
> +		for(i = 0; i < ret && umem->hugetlb; i++) {
>  			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
>  				umem->hugetlb = 0;
> -
> -			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
>  		}
>  		up_read(&mm->mmap_sem);
> +	}
>  
> -		/* preparing for next loop */
> -		sg_list_start = sg;
> +	ret = sg_alloc_table_from_pages(&umem->sg_head,
> +					page_list,
> +					umem->npages,
> +					0,
> +					umem->npages << PAGE_SHIFT,
> +					GFP_KERNEL);

Yep, this is nice.. But I wonder if this needs to copy what i915 is
doing here in i915_sg_segment_size() and
__i915_gem_userptr_alloc_pages() - ie restrict the size if the swiotlb
is enabled. Confusing as to why though..
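
For reference, the coalescing sg_alloc_table_from_pages() does is
roughly this; a simplified sketch of the logic in lib/scatterlist.c,
with the segment-size cap and error handling left out:

/* First pass: count runs of physically contiguous pages. */
unsigned int i, nents = 1;

for (i = 1; i < n_pages; i++)
	if (page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1)
		nents++;

/*
 * Second pass, after sg_alloc_table(&sgt, nents, gfp): walk the same
 * runs again and do sg_set_page(sg, first_page_of_run,
 * run_len << PAGE_SHIFT, 0) for each, so a fully contiguous pin
 * collapses into a single SG entry.
 */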


Anyhow, I think for this to be really optimal, some new APIs in
scatterlist.h will be needed.. Something like an
'sg_append_table_from_pages' helper.

Then the flow would be:

sg_alloc_table(&table, min(npages, SG_MAX_SINGLE_ALLOC))
while (npages) {
   get_user_pages(page_list, ...)
   sg_append_table_from_pages(&table, page_list, ...,
       min(npages, SG_MAX_SINGLE_ALLOC));
   npages -= page_list_len;
}

So we at most over allocate by 1 page, and if that became critical we
could copy the last chain in the sg_table.

If sg_append_table_from_pages() needs more room in the table then it
would convert the last entry to a chain and allocate another table.

This bounds the excess memory required and doesn't need a high order
allocation to invoke get_user_pages.
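
Spelled out a little more, a sketch of that flow inside ib_umem_get()
could look like this (sg_append_table_from_pages() is hypothetical; its
name and arguments are only a guess at what such a helper might take,
and the mmap_sem handling and cleanup paths are omitted):

npages = ib_umem_num_pages(umem);
ret = sg_alloc_table(&umem->sg_head,
		     min_t(unsigned long, npages, SG_MAX_SINGLE_ALLOC),
		     GFP_KERNEL);
if (ret)
	goto vma;

while (npages) {
	/* page_list stays a single page worth of pointers */
	pinned = get_user_pages_longterm(cur_base,
			min_t(unsigned long, npages,
			      PAGE_SIZE / sizeof(struct page *)),
			gup_flags, page_list, vma_list);
	if (pinned < 0) {
		ret = pinned;
		goto umem_release;
	}

	/*
	 * Hypothetical helper: coalesce the new pages and append them to
	 * the table, chaining on another chunk of entries if the current
	 * one fills up.
	 */
	ret = sg_append_table_from_pages(&umem->sg_head, page_list,
					 pinned, GFP_KERNEL);
	if (ret)
		goto umem_release;

	cur_base += pinned * PAGE_SIZE;
	npages -= pinned;
}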


Also, this series is out of order, right?

If the umem core starts producing large SGL entries won't all the
drivers break?

The drivers need to be able to break them down before this is
introduced.

Jason
Shiraz Saleem Oct. 23, 2018, 11:55 p.m. UTC | #2
On Mon, Oct 22, 2018 at 03:06:42PM -0600, Jason Gunthorpe wrote:
> On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> > Squash contiguous regions of PAGE_SIZE pages
> > into a single SG entry as opposed to one
> > SG entry per page. This reduces the SG table
> > size and is friendliest to the IOMMU.
> > 
> > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> >  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
> >  1 file changed, 31 insertions(+), 35 deletions(-)

[..] 

> > +	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);
> 
> Hurm.. The reason it was __get_free_page and a loop was to avoid a
> high order allocation triggered by userspace.
> 
> ie if I want to make an MR over a 2G region this requires a 4M
> allocation of page pointers which is too high to do via
> kmalloc_array().
> 
> So I think this can't work, the page list has to be restricted in size
> and we need to iterate...

Yes. I had this same concern and this is one of the reasons I posted this patch set as an RFC.
I was thinking of building a temporary linked list of page_list array objects (after pinning),
each of which could hold a limited set of pages.
We would then iterate this list to identify contiguous pages and squash them into SGL entries.
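
Roughly something like this; the struct and its name are made up here,
just to show the shape of the idea:

/* One pinned chunk; each node holds a bounded number of pages. */
struct umem_page_chunk {
	struct list_head list;
	unsigned int npages;
	struct page *pages[];	/* capped at e.g. 512 entries */
};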

[..]
> > +	ret = sg_alloc_table_from_pages(&umem->sg_head,
> > +					page_list,
> > +					umem->npages,
> > +					0,
> > +					umem->npages << PAGE_SHIFT,
> > +					GFP_KERNEL);
> 
> Yep, this is nice.. But I wonder if this needs to copy what i915 is
> doing here in i915_sg_segment_size() and
> __i915_gem_userptr_alloc_pages() - ie restrict the size if the swiotlb
> is enabled. Confusing as to why though..
> 
> 
> Anyhow, I think for this to be really optimal, some new APIs in
> scatterlist.h will be needed.. Something like an
> 'sg_append_table_from_pages' helper.
> 
> Then the flow would be:
> 
> sg_alloc_table(&table, min(npages, SG_MAX_SINGLE_ALLOC))
> while (npages) {
>    get_user_pages(page_list, ...)
>    sg_append_table_from_pages(&table, page_list, ...,
>        min(npages, SG_MAX_SINGLE_ALLOC));
>    npages -= page_list_len;
> }
> 
> So we at most over allocate by 1 page, and if that became critical we
> could copy the last chain in the sg_table.
> 
> If sg_append_table_from_pages() needs more room in the table then it
> would convert the last entry to a chain and allocate another table.
>

Since sg_alloc_table_from_pages() can't really be used here, as the page_list size has
to be restricted, I would prefer we defer solving the problem of over-allocating
the SG table to a follow-on series,
and do the combining of contiguous regions into SGL entries in this one.

> This bounds the excess memory required and doesn't need a high order
> allocation to invoke get_user_pages.
> 
> 
> Also, this series is out of order, right?
> 
> If the umem core starts producing large SGL entries won't all the
> drivers break?

Hmmm..why?

The code to break down each SGL entry into dma_addr + PAGE_SIZE*i (i = 0 to # of PAGE_SIZE pages in SGE)
already exists no? For instance,
https://elixir.bootlin.com/linux/v4.8/source/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c#L918
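
i.e. the usual shape is something like the below (paraphrasing the
ocrdma function linked above rather than quoting it):

struct scatterlist *sg;
u64 pg_addr;
int entry, i;

for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
	int pages = sg_dma_len(sg) >> PAGE_SHIFT;

	for (i = 0; i < pages; i++) {
		pg_addr = sg_dma_address(sg) + ((u64)i << PAGE_SHIFT);
		/* program pg_addr into the next HW PBL entry */
	}
}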

> The drivers need to be able to break them down before this is
> introduced.
> 
> Jason
Jason Gunthorpe Oct. 26, 2018, 2:27 a.m. UTC | #3
On Tue, Oct 23, 2018 at 06:55:45PM -0500, Shiraz Saleem wrote:
> On Mon, Oct 22, 2018 at 03:06:42PM -0600, Jason Gunthorpe wrote:
> > On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> > > Squash contiguous regions of PAGE_SIZE pages
> > > into a single SG entry as opposed to one
> > > SG entry per page. This reduces the SG table
> > > size and is friendliest to the IOMMU.
> > > 
> > > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > >  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
> > >  1 file changed, 31 insertions(+), 35 deletions(-)
> 
> [..] 
> 
> > > +	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);
> > 
> > Hurm.. The reason it was __get_free_page and a loop was to avoid a
> > high order allocation triggered by userspace.
> > 
> > ie if I want to make an MR over a 2G region this requires a 4M
> > allocation of page pointers which is too high to do via
> > kmalloc_array().
> > 
> > So I think this can't work, the page list has to be restricted in size
> > and we need to iterate...
> 
> Yes. I had this same concern and this is one of the reasons I posted this patch set as an RFC.
> I was thinking of building a temporary linked list of page_list array objects (after pinning),
> each of which could hold a limited set of pages.
> We would then iterate this list to identify contiguous pages and squash them into SGL entries.

There is no reason to hold all the pages in memory, this can be done
via iteration..

> > So we at most over allocate by 1 page, and if that became critical we
> > could copy the last chain in the sg_table.
> > 
> > If sg_append_table_from_pages() needs more room in the table then it
> > would convert the last entry to a chain and allocate another table.
> >
> 
> Since sg_alloc_table_from_pages() can't really be used here, as the page_list size has
> to be restricted, I would prefer we defer solving the problem of over-allocating
> the SG table to a follow-on series,
> and do the combining of contiguous regions into SGL entries in this one.

Sure, I don't mind the overallocation, it is already the way things
are today.

Maybe we trivially add an API to trim the sg_table?
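
Something along these lines, say (name and signature invented, nothing
like this exists in scatterlist.h yet):

/*
 * Free the unused tail entries of a table that was sized for the worst
 * case of one entry per page.
 */
int sg_trim_table(struct sg_table *sgt, unsigned int used_nents);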

> > If the umem core starts producing large SGL entries won't all the
> > drivers break?
> 
> Hmmm..why?

I'm concerned something iterates over the SGL without using the 'for
each page' variant of the iterator.. It is worth doing some survey at
least.
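
The kind of pattern that would silently break looks like this
(illustrative only; 'pbl' stands in for whatever per-page table a
driver builds):

struct scatterlist *sg;
int i;

/* Assumes every SG entry covers exactly one PAGE_SIZE page, so only
 * the first page of a coalesced entry lands in the table. */
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
	pbl[i] = sg_dma_address(sg);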

> The code to break down each SGL entry into dma_addr + PAGE_SIZE*i (i
> = 0 to # of PAGE_SIZE pages in SGE) already exists no? For instance,
> https://elixir.bootlin.com/linux/v4.8/source/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c#L918

Like this is a good example, what is the point of this code?

umem->page_shift is always PAGE_SHIFT for umems that are not ODP, so
why isn't this just written as a for_each_sg_page loop??
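
i.e. something like this (sketch only, untested):

struct sg_page_iter sg_iter;

for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
	u64 pg_addr = sg_page_iter_dma_address(&sg_iter);
	/* fill the next PBE with pg_addr */
}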

Jason
Shiraz Saleem Oct. 30, 2018, 11:24 p.m. UTC | #4
On Thu, Oct 25, 2018 at 08:27:17PM -0600, Jason Gunthorpe wrote:
> On Tue, Oct 23, 2018 at 06:55:45PM -0500, Shiraz Saleem wrote:
> > On Mon, Oct 22, 2018 at 03:06:42PM -0600, Jason Gunthorpe wrote:
> > > On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> > > > Squash contiguous regions of PAGE_SIZE pages
> > > > into a single SG entry as opposed to one
> > > > SG entry per page. This reduces the SG table
> > > > size and is friendliest to the IOMMU.
> > > > 
> > > > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > > > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > > >  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
> > > >  1 file changed, 31 insertions(+), 35 deletions(-)

[..]

> > Since sg_alloc_table_from_pages() can't really be used here, as the page_list size has
> > to be restricted, I would prefer we defer solving the problem of over-allocating
> > the SG table to a follow-on series,
> > and do the combining of contiguous regions into SGL entries in this one.
> 
> Sure, I don't mind the overallocation, it is already the way things
> are today.
> 
> Maybe we trivially add an API to trim the sg_table?

I will explore this and try adding to this series.
 
> > > If the umem core starts producing large SGL entries won't all the
> > > drivers break?
> > 
> > Hmmm..why?
> 
> I'm concerned something iterates over the SGL without using the 'for
> each page' variant of the iterator.. It is worth doing some survey at
> least.

OK. That sounds reasonable.

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c6144df..486d6d7 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,6 +39,7 @@ 
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
@@ -46,18 +47,16 @@ 
 
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
 		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+				umem->sg_head.orig_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_head.orig_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
@@ -92,7 +91,6 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	if (dmasync)
@@ -138,7 +136,13 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	/* We assume the memory is from hugetlb until proved otherwise */
 	umem->hugetlb   = 1;
 
-	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	npages = ib_umem_num_pages(umem);
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto umem_kfree;
+	}
+
+	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);
 	if (!page_list) {
 		ret = -ENOMEM;
 		goto umem_kfree;
@@ -152,12 +156,6 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	if (!vma_list)
 		umem->hugetlb = 0;
 
-	npages = ib_umem_num_pages(umem);
-	if (npages == 0 || npages > UINT_MAX) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
 	down_write(&mm->mmap_sem);
@@ -172,50 +170,48 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 	cur_base = addr & PAGE_MASK;
 
-	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
-	if (ret)
-		goto vma;
-
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
-
 	while (npages) {
 		down_read(&mm->mmap_sem);
 		ret = get_user_pages_longterm(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
-				     gup_flags, page_list, vma_list);
+				     gup_flags, page_list + umem->npages, vma_list);
 		if (ret < 0) {
 			up_read(&mm->mmap_sem);
-			goto umem_release;
+			release_pages(page_list, umem->npages);
+			goto vma;
 		}
 
 		umem->npages += ret;
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
-		/* Continue to hold the mmap_sem as vma_list access
-		 * needs to be protected.
-		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for(i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
 		up_read(&mm->mmap_sem);
+	}
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+	ret = sg_alloc_table_from_pages(&umem->sg_head,
+					page_list,
+					umem->npages,
+					0,
+					umem->npages << PAGE_SHIFT,
+					GFP_KERNEL);
+	if (ret) {
+		release_pages(page_list, umem->npages);
+		goto vma;
 	}
 
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
-				  umem->sg_head.sgl,
-				  umem->npages,
-				  DMA_BIDIRECTIONAL,
-				  dma_attrs);
+					 umem->sg_head.sgl,
+					 umem->sg_head.orig_nents,
+					 DMA_BIDIRECTIONAL,
+					 dma_attrs);
 
 	if (!umem->nmap) {
 		ret = -ENOMEM;
@@ -234,7 +230,7 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 out:
 	if (vma_list)
 		free_page((unsigned long) vma_list);
-	free_page((unsigned long) page_list);
+	kfree(page_list);
 umem_kfree:
 	if (ret) {
 		mmdrop(umem->owning_mm);