Message ID | 20190328165402.19164-1-shiraz.saleem@intel.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Jason Gunthorpe |
Series | [v3,rdma-next] RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs |
On Thu, Mar 28, 2019 at 11:54:02AM -0500, Shiraz Saleem wrote:
> +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> + *
> + * sg: current scatterlist entry
> + * page_list: array of npage struct page pointers
> + * npages: number of pages in page_list
> + * nents: [out] number of entries in the scatterlist
> + *
> + * Return new end of scatterlist
> + */
> +static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
> +						struct page **page_list,
> +						unsigned long npages,
> +						int *nents)
> +{
> +	unsigned long first_pfn;
> +	unsigned long i = 0;
> +	bool update_cur_sg = false;
> +	bool first = !sg_page(sg);
> +
> +	/* Check if new page_list is contiguous with end of previous page_list.
> +	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
> +	 */
> +	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
> +		       page_to_pfn(page_list[0])))
> +		update_cur_sg = true;
> +
> +	while (i != npages) {
> +		unsigned long len;
> +		struct page *first_page = page_list[i];
> +
> +		first_pfn = page_to_pfn(first_page);
> +
> +		/* Compute the number of contiguous pages we have starting
> +		 * at i
> +		 */
> +		for (len = 0; i != npages &&
> +			      first_pfn + len == page_to_pfn(page_list[i]);
> +		     len++)
> +			i++;
> +
> +		/* Squash N contiguous pages from page_list into current sge */
> +		if (update_cur_sg) {
> +			sg_set_page(sg, sg_page(sg),
> +				    sg->length + (len << PAGE_SHIFT), 0);

There is another problem here.. The sg->length is an 'unsigned int' so
this add can technically overflow. Need to also check 'if length + len > UINT_MAX'
before combining.

There is also dma_get_max_seg_size(), maybe that is the right limit?
Maybe the IB core should just call dma_set_max_seg_size(2G) for all
drivers?

I'm looking at the iommu code and thinking if you pass a sg->length >
max_seg_size it probably does something bad ..

Jason
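For illustration, a minimal sketch of the kind of guard being discussed here. The helper name ib_umem_can_extend_sg() is hypothetical and not part of the posted patch; it simply refuses to merge a run of pages into the current SGE when the combined length would exceed what sg->length (an unsigned int) or the device's DMA segment limit can hold:

```c
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>

/* Hypothetical helper (not in the patch): return true only if folding 'len'
 * more pages into 'sg' keeps the segment within both the unsigned int range
 * of sg->length and the device's maximum DMA segment size.
 */
static bool ib_umem_can_extend_sg(struct device *dma_device,
				  struct scatterlist *sg, unsigned long len)
{
	unsigned long new_len = sg->length + (len << PAGE_SHIFT);

	return new_len <= UINT_MAX &&
	       new_len <= dma_get_max_seg_size(dma_device);
}
```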
On 3/28/19 9:54 AM, Shiraz Saleem wrote:
> Combine contiguous regions of PAGE_SIZE pages
> into single scatter list entries while adding
> to the scatter table. This minimizes the number
> of the entries in the scatter list and reduces
> the DMA mapping overhead, particularly with the
> IOMMU.
>
> Also, purge npages in struct ib_umem as we now
> DMA map the umem SGL with sg_nents, and update
> remaining non ODP drivers that use umem->npages.
> Move npages tracking to ib_umem_odp as ODP drivers
> still need it.
>
> This patch should be applied post
> https://patchwork.kernel.org/cover/10857607/

The vmw_pvrdma changes look fine.

Acked-by: Adit Ranadive <aditr@vmware.com>

>
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> ---
> This patch is decoupled from series https://patchwork.kernel.org/patch/10819993/
> as it can be standalone.
> Changes since v1:
> * made ib_umem_add_sg_table() static
> * simplified bool variable 'first' initialization
>
> v2->v3:
> * Added a comment on calculation for checking page_list
>   contiguity in ib_umem_add_sg_table()
> * Moved iterator to body of for loop in ib_umem_add_sg_table()
> * Removed explicit update to sg->length prior to calling sg_set_page()
> * Removed white space clean-up in ib_umem struct
> * Purge npages from ib_umem struct and updated remaining non ODP
>   drivers to use ib_umem_num_pages()
> * Add npages tracking in ib_umem_odp as ODP drivers still need it.
>
>  drivers/infiniband/core/umem.c               | 96 ++++++++++++++++++++++------
>  drivers/infiniband/core/umem_odp.c           |  4 +-
>  drivers/infiniband/hw/mlx5/odp.c             |  2 +-
>  drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 11 ++--
>  include/rdma/ib_umem.h                       |  2 +-
>  include/rdma/ib_umem_odp.h                   |  1 +
>  6 files changed, 87 insertions(+), 29 deletions(-)
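The coalescing idea in the commit message is easy to see in isolation. Below is a stand-alone, user-space illustration (not kernel code, and not part of the patch) that walks an array of page-frame numbers and emits one extent per physically contiguous run, the same way ib_umem_add_sg_table() folds contiguous pages into a single SGE:

```c
#include <stdio.h>

int main(void)
{
	/* Example pfns: three contiguous runs -> three extents/SGEs */
	unsigned long pfns[] = { 100, 101, 102, 200, 201, 300 };
	unsigned long n = sizeof(pfns) / sizeof(pfns[0]);
	unsigned long i = 0;

	while (i < n) {
		unsigned long start = pfns[i];
		unsigned long len = 0;

		/* Count how many pfns are contiguous starting at i */
		while (i < n && pfns[i] == start + len) {
			len++;
			i++;
		}
		printf("extent: pfn %lu, %lu pages\n", start, len);
	}
	return 0;
}
```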
>Subject: Re: [PATCH v3 rdma-next] RDMA/umem: Combine contiguous PAGE_SIZE
>regions in SGEs
>
>On Thu, Mar 28, 2019 at 11:54:02AM -0500, Shiraz Saleem wrote:
>> +		/* Squash N contiguous pages from page_list into current sge */
>> +		if (update_cur_sg) {
>> +			sg_set_page(sg, sg_page(sg),
>> +				    sg->length + (len << PAGE_SHIFT), 0);
>
>There is another problem here.. The sg->length is an 'unsigned int' so this add can
>technically overflow. Need to also check 'if length + len > UINT_MAX' before
>combining.

Thanks Jason! Nice catch. As a first step, we can check against
SCATTERLIST_MAX_SEGMENT to avoid the overflow case.

>
>There is also dma_get_max_seg_size(), maybe that is the right limit?
>Maybe the IB core should just call dma_set_max_seg_size(2G) for all drivers?
>
>I'm looking at the iommu code and thinking if you pass a sg->length >
>max_seg_size it probably does something bad ..

Probably a DMA mapping error but not sure. I am trying to gather some data
internally about how the Intel IOMMU would behave in this case.

If we have to resort to using dma_get_max_seg_size() as the limit, setting
dma_set_max_seg_size to 2G via the IB core sounds reasonable as long as
there is no HW limitation for any of the devices.

Shiraz
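A sketch of what that first step might look like inside the combine branch of ib_umem_add_sg_table(); the exact placement of the bound check is an assumption, not code from a posted revision:

```c
/* Inside the while() loop of ib_umem_add_sg_table() (sketch, not a posted
 * revision): only fold the new run into the current SGE while the merged
 * length stays at or below SCATTERLIST_MAX_SEGMENT (UINT_MAX rounded down
 * to a page boundary); otherwise fall through and start a new SGE.
 */
if (update_cur_sg &&
    sg->length + (len << PAGE_SHIFT) <= SCATTERLIST_MAX_SEGMENT) {
	sg_set_page(sg, sg_page(sg),
		    sg->length + (len << PAGE_SHIFT), 0);
	update_cur_sg = false;
	continue;
}
```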
On Mon, Apr 01, 2019 at 06:25:04PM +0000, Saleem, Shiraz wrote:
> >Subject: Re: [PATCH v3 rdma-next] RDMA/umem: Combine contiguous PAGE_SIZE
> >regions in SGEs
> >
> >On Thu, Mar 28, 2019 at 11:54:02AM -0500, Shiraz Saleem wrote:
> >> +		/* Squash N contiguous pages from page_list into current sge */
> >> +		if (update_cur_sg) {
> >> +			sg_set_page(sg, sg_page(sg),
> >> +				    sg->length + (len << PAGE_SHIFT), 0);
> >
> >There is another problem here.. The sg->length is an 'unsigned int' so this add can
> >technically overflow. Need to also check 'if length + len > UINT_MAX' before
> >combining.
>
> Thanks Jason! Nice catch. As a first step, we can check against
> SCATTERLIST_MAX_SEGMENT to avoid the overflow case.
>
> >There is also dma_get_max_seg_size(), maybe that is the right limit?
> >Maybe the IB core should just call dma_set_max_seg_size(2G) for all drivers?
> >
> >I'm looking at the iommu code and thinking if you pass a sg->length >
> >max_seg_size it probably does something bad ..
>
> Probably a DMA mapping error but not sure. I am trying to gather some
> data internally about how the Intel IOMMU would behave in this case.

I was looking at the ARM iommu stuff and it forcibly segmented the SGL
into dma_get_max_seg_size() chunks, and didn't seem to check bounds..

Thus I guess it accesses past the end of the sgl if the input sgl's are
larger than dma_get_max_seg_size() as it breaks them up. Didn't check
*super* carefully though.

> If we have to resort to using dma_get_max_seg_size() as the limit,
> setting dma_set_max_seg_size to 2G via the IB core sounds reasonable
> as long as there is no HW limitation for any of the devices.

Nobody is setting anything.. I'm not sure what the default is for a
modern PCI device.. 64k? Higher? I assume our default is already higher
or the storage stuff isn't working quite right!

This whole mechanism seems designed to support devices that have a
wired 16 bit length for their DMA lists, and only one kind of DMA
list..

For something like IB I think we should set the default to something
big, and if the drivers can't handle it then they have to fragment
when they build their DMA lists.

Jason
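If the core were to go the dma_set_max_seg_size() route, the per-device setup is a one-liner; the SZ_2G value and the idea of doing it centrally at device registration time are assumptions drawn from this discussion, not something the thread has settled:

```c
#include <linux/dma-mapping.h>
#include <linux/sizes.h>

/* Hypothetical core-side default: advertise a 2G maximum DMA segment size so
 * the DMA layer never has to split (or mishandle) a combined umem SGE.  A
 * driver with a genuine hardware limit would still override this with its
 * own dma_set_max_seg_size() call.
 */
static void ib_set_default_max_seg_size(struct device *dma_device)
{
	dma_set_max_seg_size(dma_device, SZ_2G);
}
```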
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index fe55515..17ebad9 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,25 +39,22 @@
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
@@ -66,6 +63,66 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+						struct page **page_list,
+						unsigned long npages,
+						int *nents)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	bool update_cur_sg = false;
+	bool first = !sg_page(sg);
+
+	/* Check if new page_list is contiguous with end of previous page_list.
+	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+	 */
+	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+		       page_to_pfn(page_list[0])))
+		update_cur_sg = true;
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+			      first_pfn + len == page_to_pfn(page_list[i]);
+		     len++)
+			i++;
+
+		/* Squash N contiguous pages from page_list into current sge */
+		if (update_cur_sg) {
+			sg_set_page(sg, sg_page(sg),
+				    sg->length + (len << PAGE_SHIFT), 0);
+			update_cur_sg = false;
+			continue;
+		}
+
+		/* Squash N contiguous pages into next sge or first sge */
+		if (!first)
+			sg = sg_next(sg);
+
+		(*nents)++;
+		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+		first = false;
+	}
+
+	return sg;
+}
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
@@ -93,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	if (!udata)
@@ -185,7 +242,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
+	sg = umem->sg_head.sgl;
 
 	while (npages) {
 		down_read(&mm->mmap_sem);
@@ -198,28 +255,27 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 			goto umem_release;
 		}
 
-		umem->npages += ret;
 		cur_base += ret * PAGE_SIZE;
 		npages -= ret;
 
+		sg = ib_umem_add_sg_table(sg, page_list, ret, &umem->sg_nents);
+
 		/* Continue to hold the mmap_sem as vma_list access
 		 * needs to be protected.
 		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for (i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
-		up_read(&mm->mmap_sem);
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+		up_read(&mm->mmap_sem);
 	}
 
+	sg_mark_end(sg);
+
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 					 umem->sg_head.sgl,
-					 umem->npages,
+					 umem->sg_nents,
 					 DMA_BIDIRECTIONAL,
 					 dma_attrs);
 
@@ -315,8 +371,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		return -EINVAL;
 	}
 
-	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
-				 offset + ib_umem_offset(umem));
+	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, ib_umem_num_pages(umem),
+				 dst, length, offset + ib_umem_offset(umem));
 
 	if (ret < 0)
 		return ret;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e6ec79a..e2d87c4 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -527,7 +527,7 @@ static int ib_umem_odp_map_dma_single_page(
 		}
 		umem_odp->dma_list[page_index] = dma_addr | access_mask;
 		umem_odp->page_list[page_index] = page;
-		umem->npages++;
+		umem_odp->npages++;
 		stored_page = 1;
 	} else if (umem_odp->page_list[page_index] == page) {
 		umem_odp->dma_list[page_index] |= access_mask;
@@ -759,7 +759,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 			put_page(page);
 			umem_odp->page_list[idx] = NULL;
 			umem_odp->dma_list[idx] = 0;
-			umem->npages--;
+			umem_odp->npages--;
 		}
 	}
 	mutex_unlock(&umem_odp->umem_mutex);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index c20bfc4..0c0dfa4 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -288,7 +288,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem->npages && mr->parent &&
+	if (unlikely(!umem_odp->npages && mr->parent &&
 		     !umem_odp->dying)) {
 		WRITE_ONCE(umem_odp->dying, 1);
 		atomic_inc(&mr->parent->num_leaf_free);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index a85884e..83167fa 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -119,7 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_mr *cmd = &req.create_mr;
 	struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
-	int ret;
+	int ret, npages;
 
 	if (length == 0 || length > dev->dsr->caps.max_mr_size) {
 		dev_warn(&dev->pdev->dev, "invalid mem region length\n");
@@ -133,9 +133,10 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		return ERR_CAST(umem);
 	}
 
-	if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
+	npages = ib_umem_num_pages(umem);
+	if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
 		dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
-			 umem->npages);
+			 npages);
 		ret = -EINVAL;
 		goto err_umem;
 	}
@@ -150,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	mr->mmr.size = length;
 	mr->umem = umem;
 
-	ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false);
+	ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false);
 	if (ret) {
 		dev_warn(&dev->pdev->dev,
 			 "could not allocate page directory\n");
@@ -167,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	cmd->length = length;
 	cmd->pd_handle = to_vpd(pd)->pd_handle;
 	cmd->access_flags = access_flags;
-	cmd->nchunks = umem->npages;
+	cmd->nchunks = npages;
 	cmd->pdir_dma = mr->pdir.dir_dma;
 
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 73af05d..ccc44c0 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -53,7 +53,7 @@ struct ib_umem {
 	struct work_struct	work;
 	struct sg_table sg_head;
 	int             nmap;
-	int             npages;
+	int             sg_nents;
 };
 
 /* Returns the offset of the umem start relative to the first page. */
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index dadc96d..eeec4e5 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -69,6 +69,7 @@ struct ib_umem_odp {
 
 	int notifiers_seq;
 	int notifiers_count;
+	int npages;
 
 	/* Tree tracking */
 	struct umem_odp_node interval_tree;
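One practical consequence of this patch for driver authors, shown as a hedged sketch (example_program_mr_pages() is hypothetical, not from the patch): after combining, an SGE is no longer guaranteed to cover exactly PAGE_SIZE, so consumers of the mapped list should use the per-entry DMA length rather than assuming one page per entry.

```c
#include <linux/scatterlist.h>
#include <rdma/ib_umem.h>

/* Hypothetical consumer: walk the DMA-mapped, combined scatterlist and hand
 * each (address, length) pair to the hardware.  Lengths are multiples of
 * PAGE_SIZE but a single entry can span many pages.
 */
static void example_program_mr_pages(struct ib_umem *umem)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		dma_addr_t addr = sg_dma_address(sg);
		unsigned int len = sg_dma_len(sg);

		/* program addr/len into the device's page tables here */
		(void)addr;
		(void)len;
	}
}
```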