Message ID: 20181019233409.1104-2-shiraz.saleem@intel.com
State: Superseded
Series: Introduce APIs to get DMA addresses aligned to a HW supported page size
On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> Squash contiguous regions of PAGE_SIZE pages
> into a single SG entry as opposed to one
> SG entry per page. This reduces the SG table
> size and is friendliest to the IOMMU.
>
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
>  1 file changed, 31 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index c6144df..486d6d7 100644
> +++ b/drivers/infiniband/core/umem.c
> @@ -39,6 +39,7 @@
>  #include <linux/export.h>
>  #include <linux/hugetlb.h>
>  #include <linux/slab.h>
> +#include <linux/pagemap.h>
>  #include <rdma/ib_umem_odp.h>
>
>  #include "uverbs.h"
> @@ -46,18 +47,16 @@
>
>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>  {
> -	struct scatterlist *sg;
> +	struct sg_page_iter sg_iter;
>  	struct page *page;
> -	int i;
>
>  	if (umem->nmap > 0)
>  		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
> -				umem->npages,
> +				umem->sg_head.orig_nents,
>  				DMA_BIDIRECTIONAL);
>
> -	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
> -
> -		page = sg_page(sg);
> +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_head.orig_nents, 0) {
> +		page = sg_page_iter_page(&sg_iter);
>  		if (!PageDirty(page) && umem->writable && dirty)
>  			set_page_dirty_lock(page);
>  		put_page(page);
> @@ -92,7 +91,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  	int ret;
>  	int i;
>  	unsigned long dma_attrs = 0;
> -	struct scatterlist *sg, *sg_list_start;
>  	unsigned int gup_flags = FOLL_WRITE;
>
>  	if (dmasync)
> @@ -138,7 +136,13 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  	/* We assume the memory is from hugetlb until proved otherwise */
>  	umem->hugetlb = 1;
>
> -	page_list = (struct page **) __get_free_page(GFP_KERNEL);
> +	npages = ib_umem_num_pages(umem);
> +	if (npages == 0 || npages > UINT_MAX) {
> +		ret = -EINVAL;
> +		goto umem_kfree;
> +	}
> +
> +	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);

Hurm.. The reason it was __get_free_page and a loop was to avoid a
high order allocation triggered by userspace.

ie if I want to make a MR over a 2G region this requires a 4M
allocation of page pointers which is too high to do via
kmalloc_array().

So I think this can't work, the page list has to be restricted in size
and we need to iterate...

>  	while (npages) {
>  		down_read(&mm->mmap_sem);
>  		ret = get_user_pages_longterm(cur_base,
>  				     min_t(unsigned long, npages,
>  					   PAGE_SIZE / sizeof (struct page *)),

Why keep the min? This was just to match the size of the array..

> -				     gup_flags, page_list, vma_list);
> +				     gup_flags, page_list + umem->npages, vma_list);
>  		if (ret < 0) {
>  			up_read(&mm->mmap_sem);
> -			goto umem_release;
> +			release_pages(page_list, umem->npages);
> +			goto vma;
>  		}
>
>  		umem->npages += ret;
>  		cur_base += ret * PAGE_SIZE;
>  		npages   -= ret;
>
> -		/* Continue to hold the mmap_sem as vma_list access
> -		 * needs to be protected.
> -		 */
> -		for_each_sg(sg_list_start, sg, ret, i) {
> +		for(i = 0; i < ret && umem->hugetlb; i++) {
>  			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
>  				umem->hugetlb = 0;
> -
> -			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
>  		}
>  		up_read(&mm->mmap_sem);
> +	}
>
> -		/* preparing for next loop */
> -		sg_list_start = sg;
> +	ret = sg_alloc_table_from_pages(&umem->sg_head,
> +					page_list,
> +					umem->npages,
> +					0,
> +					umem->npages << PAGE_SHIFT,
> +					GFP_KERNEL);

Yep, this is nice.. But I wonder if this needs to copy what i915 is
doing here in i915_sg_segment_size() and
__i915_gem_userptr_alloc_pages() - ie restrict the size if the swiotlb
is enabled. Confusing as to why though..

Anyhow, I think for this to be really optimal, some new APIs in
scatterlist.h will be needed.. Something like
a 'sg_append_table_from_pages' helper.

Then the flow would be:

  sg_alloc_table(&table, min(npages, SG_MAX_SINGLE_ALLOC))
  while (npages) {
       get_user_pages(page_list, ...)
       sg_append_table_from_pages(&table, page_list, ...,
                                  min(npages, SG_MAX_SINGLE_ALLOC));
       npages -= page_list_len;
  }

So we at most over allocate by 1 page, and if that became critical we
could copy the last chain in the sg_table.

If sg_append_table_from_pages() needs more room in the table then it
would convert the last entry to a chain and allocate another table.

This bounds the excess memory required and doesn't need a high order
allocation to invoke get_user_pages.

Also, this series is out of order, right?

If the umem core starts producing large SGL entries won't all the
drivers break?

The drivers need to be able to break them down before this is
introduced.

Jason
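To make the flow proposed above concrete, here is a rough sketch of how the pinning loop could stay bounded while appending to the scatter table as it goes. This is only an illustration: sg_append_table_from_pages() is the helper being proposed and does not exist in scatterlist.h, pin_to_sg() and PINNED_CHUNK are made-up names, and locking and cleanup are trimmed down.

/*
 * Illustrative only: sg_append_table_from_pages() is the proposed helper
 * from the discussion above and is not a real API; sg_alloc_table(),
 * get_user_pages_longterm() and SG_MAX_SINGLE_ALLOC are real.  Error
 * unwinding is omitted and the caller is assumed to hold mm->mmap_sem
 * for read around the gup call.
 */
#define PINNED_CHUNK	(PAGE_SIZE / sizeof(struct page *))

static int pin_to_sg(struct sg_table *table, struct page **page_list,
		     unsigned long cur_base, unsigned long npages,
		     unsigned int gup_flags)
{
	long pinned;
	int ret;

	/* Start with a bounded table; it would grow by chaining when appended to. */
	ret = sg_alloc_table(table, min_t(unsigned long, npages,
					  SG_MAX_SINGLE_ALLOC), GFP_KERNEL);
	if (ret)
		return ret;

	while (npages) {
		/* Pin at most one page's worth of struct page pointers at a time. */
		pinned = get_user_pages_longterm(cur_base,
						 min_t(unsigned long, npages,
						       PINNED_CHUNK),
						 gup_flags, page_list, NULL);
		if (pinned < 0)
			return pinned;

		/*
		 * Proposed helper: squash physically contiguous pages into
		 * SG entries, chaining another table if it runs out of room.
		 */
		sg_append_table_from_pages(table, page_list, pinned);

		cur_base += pinned * PAGE_SIZE;
		npages -= pinned;
	}
	return 0;
}

The point of the sketch is that neither page_list nor the initial sg_table ever needs an allocation much larger than a page, which is the concern raised above about kmalloc_array().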
On Mon, Oct 22, 2018 at 03:06:42PM -0600, Jason Gunthorpe wrote:
> On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> > Squash contiguous regions of PAGE_SIZE pages
> > into a single SG entry as opposed to one
> > SG entry per page. This reduces the SG table
> > size and is friendliest to the IOMMU.
> >
> > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> >  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
> >  1 file changed, 31 insertions(+), 35 deletions(-)

[..]

> > +	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);
>
> Hurm.. The reason it was __get_free_page and a loop was to avoid a
> high order allocation triggered by userspace.
>
> ie if I want to make a MR over a 2G region this requires a 4M
> allocation of page pointers which is too high to do via
> kmalloc_array().
>
> So I think this can't work, the page list has to be restricted in size
> and we need to iterate...

Yes. I had this same concern and this is one of the reasons I posted
this patch set as an RFC. I was thinking of building a temporary
linked list of page_list array objects (after pinning), each of which
could hold a limited set of pages. We would then iterate this list to
identify contiguous pages and squash them into SGL entries.

[..]

> > +	ret = sg_alloc_table_from_pages(&umem->sg_head,
> > +					page_list,
> > +					umem->npages,
> > +					0,
> > +					umem->npages << PAGE_SHIFT,
> > +					GFP_KERNEL);
>
> Yep, this is nice.. But I wonder if this needs to copy what i915 is
> doing here in i915_sg_segment_size() and
> __i915_gem_userptr_alloc_pages() - ie restrict the size if the swiotlb
> is enabled. Confusing as to why though..
>
>
> Anyhow, I think for this to be really optimal, some new APIs in
> scatterlist.h will be needed.. Something like
> a 'sg_append_table_from_pages' helper.
>
> Then the flow would be:
>
>   sg_alloc_table(&table, min(npages, SG_MAX_SINGLE_ALLOC))
>   while (npages) {
>        get_user_pages(page_list, ...)
>        sg_append_table_from_pages(&table, page_list, ...,
>                                   min(npages, SG_MAX_SINGLE_ALLOC));
>        npages -= page_list_len;
>   }
>
> So we at most over allocate by 1 page, and if that became critical we
> could copy the last chain in the sg_table.
>
> If sg_append_table_from_pages() needs more room in the table then it
> would convert the last entry to a chain and allocate another table.

Since sg_alloc_table_from_pages() can't really be used here, as the
page_list size has to be restricted, I would prefer that we defer
solving the problem of over-allocating the SG table to a follow-on
series, and do the combining of contiguous regions into SGL entries
in this one.

> This bounds the excess memory required and doesn't need a high order
> allocation to invoke get_user_pages.
>
>
> Also, this series is out of order, right?
>
> If the umem core starts producing large SGL entries won't all the
> drivers break?

Hmmm.. why? The code to break down each SGL entry into dma_addr +
PAGE_SIZE*i (i = 0 to # of PAGE_SIZE pages in the SGE) already exists,
no? For instance,
https://elixir.bootlin.com/linux/v4.8/source/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c#L918

> The drivers need to be able to break them down before this is
> introduced.
>
> Jason
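For reference, the temporary chunked page list described in the reply above might look something like the sketch below. None of these names exist in the kernel; it is purely an illustration of the idea (which the next reply argues is unnecessary).

/*
 * Hypothetical sketch: pin pages into fixed-size chunks strung on a
 * list, so no single high-order allocation is needed, then walk the
 * chunks afterwards to find physically contiguous runs and emit one
 * SG entry per run.
 */
#define UMEM_CHUNK_PAGES	512

struct umem_page_chunk {
	struct list_head list;			/* linkage on the temporary list */
	unsigned int npages;			/* valid entries in pages[] */
	struct page *pages[UMEM_CHUNK_PAGES];	/* one chunk worth of pinned pages */
};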
On Tue, Oct 23, 2018 at 06:55:45PM -0500, Shiraz Saleem wrote:
> On Mon, Oct 22, 2018 at 03:06:42PM -0600, Jason Gunthorpe wrote:
> > On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> > > Squash contiguous regions of PAGE_SIZE pages
> > > into a single SG entry as opposed to one
> > > SG entry per page. This reduces the SG table
> > > size and is friendliest to the IOMMU.
> > >
> > > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > >  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
> > >  1 file changed, 31 insertions(+), 35 deletions(-)
>
> [..]
>
> > > +	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);
> >
> > Hurm.. The reason it was __get_free_page and a loop was to avoid a
> > high order allocation triggered by userspace.
> >
> > ie if I want to make a MR over a 2G region this requires a 4M
> > allocation of page pointers which is too high to do via
> > kmalloc_array().
> >
> > So I think this can't work, the page list has to be restricted in size
> > and we need to iterate...
>
> Yes. I had this same concern and this is one of the reasons I posted
> this patch set as an RFC. I was thinking of building a temporary
> linked list of page_list array objects (after pinning), each of which
> could hold a limited set of pages. We would then iterate this list to
> identify contiguous pages and squash them into SGL entries.

There is no reason to hold all the pages in memory, this can be done
via iteration..

> > So we at most over allocate by 1 page, and if that became critical we
> > could copy the last chain in the sg_table.
> >
> > If sg_append_table_from_pages() needs more room in the table then it
> > would convert the last entry to a chain and allocate another table.
>
> Since sg_alloc_table_from_pages() can't really be used here, as the
> page_list size has to be restricted, I would prefer that we defer
> solving the problem of over-allocating the SG table to a follow-on
> series, and do the combining of contiguous regions into SGL entries
> in this one.

Sure, I don't mind the overallocation, it is already the way things
are today.

Maybe we trivially add an API to trim the sg_table?

> > If the umem core starts producing large SGL entries won't all the
> > drivers break?
>
> Hmmm.. why?

I'm concerned something iterates over the SGL without using the 'for
each page' variant of the iterator.. It is worth doing some survey at
least.

> The code to break down each SGL entry into dma_addr + PAGE_SIZE*i
> (i = 0 to # of PAGE_SIZE pages in the SGE) already exists, no? For
> instance,
> https://elixir.bootlin.com/linux/v4.8/source/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c#L918

Like this is a good example, what is the point of this code?

umem->page_shift is always PAGE_SHIFT for umems that are not ODP, so
why isn't this just written as a for_each_sg_page loop?

Jason
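For comparison, a driver consuming a combined SGL the way Jason suggests could fill its hardware page table with the page iterator instead of per-entry arithmetic. This is a sketch, not the ocrdma code: build_pbl(), pbl and pbl_cnt are invented names, and it assumes the 4.19-era sg_page_iter_dma_address() that takes a struct sg_page_iter.

/*
 * Sketch: walk every PAGE_SIZE block of a DMA-mapped umem SGL with
 * for_each_sg_page() instead of open-coding
 * sg_dma_address(sg) + i * PAGE_SIZE per SG entry.  The iterator
 * yields one position per page, so it does not care whether an SG
 * entry covers a single page or a large combined region.
 */
static void build_pbl(struct ib_umem *umem, u64 *pbl, u32 pbl_cnt)
{
	struct sg_page_iter sg_iter;
	u32 i = 0;

	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
		if (i == pbl_cnt)
			break;
		pbl[i++] = sg_page_iter_dma_address(&sg_iter);
	}
}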
On Thu, Oct 25, 2018 at 08:27:17PM -0600, Jason Gunthorpe wrote:
> On Tue, Oct 23, 2018 at 06:55:45PM -0500, Shiraz Saleem wrote:
> > On Mon, Oct 22, 2018 at 03:06:42PM -0600, Jason Gunthorpe wrote:
> > > On Fri, Oct 19, 2018 at 06:34:06PM -0500, Shiraz Saleem wrote:
> > > > Squash contiguous regions of PAGE_SIZE pages
> > > > into a single SG entry as opposed to one
> > > > SG entry per page. This reduces the SG table
> > > > size and is friendliest to the IOMMU.
> > > >
> > > > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > > > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > > >  drivers/infiniband/core/umem.c | 66 ++++++++++++++++++++----------------------
> > > >  1 file changed, 31 insertions(+), 35 deletions(-)

[..]

> > Since sg_alloc_table_from_pages() can't really be used here, as the
> > page_list size has to be restricted, I would prefer that we defer
> > solving the problem of over-allocating the SG table to a follow-on
> > series, and do the combining of contiguous regions into SGL entries
> > in this one.
>
> Sure, I don't mind the overallocation, it is already the way things
> are today.
>
> Maybe we trivially add an API to trim the sg_table?

I will explore this and try adding it to this series.

> > > If the umem core starts producing large SGL entries won't all the
> > > drivers break?
> >
> > Hmmm.. why?
>
> I'm concerned something iterates over the SGL without using the 'for
> each page' variant of the iterator.. It is worth doing some survey at
> least.

OK. That sounds reasonable.
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c6144df..486d6d7 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,6 +39,7 @@
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
@@ -46,18 +47,16 @@
 
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
 		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+				umem->sg_head.orig_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_head.orig_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
@@ -92,7 +91,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	if (dmasync)
@@ -138,7 +136,13 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	/* We assume the memory is from hugetlb until proved otherwise */
 	umem->hugetlb = 1;
 
-	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	npages = ib_umem_num_pages(umem);
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto umem_kfree;
+	}
+
+	page_list = kmalloc_array(npages, sizeof(*page_list), GFP_KERNEL);
 	if (!page_list) {
 		ret = -ENOMEM;
 		goto umem_kfree;
@@ -152,12 +156,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	if (!vma_list)
 		umem->hugetlb = 0;
 
-	npages = ib_umem_num_pages(umem);
-	if (npages == 0 || npages > UINT_MAX) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
 	down_write(&mm->mmap_sem);
@@ -172,50 +170,48 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 	cur_base = addr & PAGE_MASK;
 
-	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
-	if (ret)
-		goto vma;
-
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
-
 	while (npages) {
 		down_read(&mm->mmap_sem);
 		ret = get_user_pages_longterm(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
-				     gup_flags, page_list, vma_list);
+				     gup_flags, page_list + umem->npages, vma_list);
 		if (ret < 0) {
 			up_read(&mm->mmap_sem);
-			goto umem_release;
+			release_pages(page_list, umem->npages);
+			goto vma;
 		}
 
 		umem->npages += ret;
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
-		/* Continue to hold the mmap_sem as vma_list access
-		 * needs to be protected.
-		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for(i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
 		up_read(&mm->mmap_sem);
+	}
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+	ret = sg_alloc_table_from_pages(&umem->sg_head,
+					page_list,
+					umem->npages,
+					0,
+					umem->npages << PAGE_SHIFT,
+					GFP_KERNEL);
+	if (ret) {
+		release_pages(page_list, umem->npages);
+		goto vma;
 	}
 
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
-				  umem->sg_head.sgl,
-				  umem->npages,
-				  DMA_BIDIRECTIONAL,
-				  dma_attrs);
+					 umem->sg_head.sgl,
+					 umem->sg_head.orig_nents,
+					 DMA_BIDIRECTIONAL,
+					 dma_attrs);
 
 	if (!umem->nmap) {
 		ret = -ENOMEM;
@@ -234,7 +230,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 out:
 	if (vma_list)
 		free_page((unsigned long) vma_list);
-	free_page((unsigned long) page_list);
+	kfree(page_list);
 umem_kfree:
 	if (ret) {
 		mmdrop(umem->owning_mm);