Message ID | CAD=hENeqTTmpS5V+G646V0QvJFLVSd3Sq11ffQFcDXU-OSsQEg@mail.gmail.com (mailing list archive)
---|---
State | Rejected
Delegated to: | Jason Gunthorpe
Series | [1/1] RDMA/umem: add back hugepage sg list
Please use git send-email to send patches. Also don't write 1/1 for single patch. Thanks On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote: > From: Zhu Yanjun <zyjzyj2000@gmail.com> > > After the commit ("RDMA/umem: Move to allocate SG table from pages"), > the sg list from ib_ume_get is like the following: > " > sg_dma_address(sg):0x4b3c1ce000 > sg_dma_address(sg):0x4c3c1cd000 > sg_dma_address(sg):0x4d3c1cc000 > sg_dma_address(sg):0x4e3c1cb000 > " > > But sometimes, we need sg list like the following: > " > sg_dma_address(sg):0x203b400000 > sg_dma_address(sg):0x213b200000 > sg_dma_address(sg):0x223b000000 > sg_dma_address(sg):0x233ae00000 > sg_dma_address(sg):0x243ac00000 > " > The function ib_umem_add_sg_table can provide the sg list like the > second. And this function is removed in the commit ("RDMA/umem: Move > to allocate SG table from pages"). Now I add it back. > > The new function is ib_umem_get to ib_umem_hugepage_get that calls > ib_umem_add_sg_table. > > This function ib_umem_huagepage_get can get 4K, 2M sg list dma address. > > Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages") > Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com> > --- > drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++ > include/rdma/ib_umem.h | 3 + > 2 files changed, 200 insertions(+) > > diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c > index 2dde99a9ba07..af8733788b1e 100644 > --- a/drivers/infiniband/core/umem.c > +++ b/drivers/infiniband/core/umem.c > @@ -62,6 +62,203 @@ static void __ib_umem_release(struct ib_device > *dev, struct ib_umem *umem, int d > sg_free_table(&umem->sg_head); > } > > +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table > + * > + * sg: current scatterlist entry > + * page_list: array of npage struct page pointers > + * npages: number of pages in page_list > + * max_seg_sz: maximum segment size in bytes > + * nents: [out] number of entries in the scatterlist > + * > + * Return new end of scatterlist > + */ > +static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, > + struct page **page_list, > + unsigned long npages, > + unsigned int max_seg_sz, > + int *nents) > +{ > + unsigned long first_pfn; > + unsigned long i = 0; > + bool update_cur_sg = false; > + bool first = !sg_page(sg); > + > + /* Check if new page_list is contiguous with end of previous page_list. > + * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. 
> + */ > + if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == > + page_to_pfn(page_list[0]))) > + update_cur_sg = true; > + > + while (i != npages) { > + unsigned long len; > + struct page *first_page = page_list[i]; > + > + first_pfn = page_to_pfn(first_page); > + > + /* Compute the number of contiguous pages we have starting > + * at i > + */ > + for (len = 0; i != npages && > + first_pfn + len == page_to_pfn(page_list[i]) && > + len < (max_seg_sz >> PAGE_SHIFT); > + len++) > + i++; > + > + /* Squash N contiguous pages from page_list into current sge */ > + if (update_cur_sg) { > + if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) { > + sg_set_page(sg, sg_page(sg), > + sg->length + (len << PAGE_SHIFT), > + 0); > + update_cur_sg = false; > + continue; > + } > + update_cur_sg = false; > + } > + > + /* Squash N contiguous pages into next sge or first sge */ > + if (!first) > + sg = sg_next(sg); > + > + (*nents)++; > + sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); > + first = false; > + } > + > + return sg; > +} > + > +/** > + * ib_umem_hugepage_get - Pin and DMA map userspace memory. > + * > + * @device: IB device to connect UMEM > + * @addr: userspace virtual address to start at > + * @size: length of region to pin > + * @access: IB_ACCESS_xxx flags for memory being pinned > + */ > +struct ib_umem *ib_umem_hugepage_get(struct ib_device *device, > + unsigned long addr, > + size_t size, int access) > +{ > + struct ib_umem *umem; > + struct page **page_list; > + unsigned long lock_limit; > + unsigned long new_pinned; > + unsigned long cur_base; > + unsigned long dma_attr = 0; > + struct mm_struct *mm; > + unsigned long npages; > + int ret; > + struct scatterlist *sg; > + unsigned int gup_flags = FOLL_WRITE; > + > + /* > + * If the combination of the addr and size requested for this memory > + * region causes an integer overflow, return error. 
> + */ > + if (((addr + size) < addr) || > + PAGE_ALIGN(addr + size) < (addr + size)) > + return ERR_PTR(-EINVAL); > + > + if (!can_do_mlock()) > + return ERR_PTR(-EPERM); > + > + if (access & IB_ACCESS_ON_DEMAND) > + return ERR_PTR(-EOPNOTSUPP); > + > + umem = kzalloc(sizeof(*umem), GFP_KERNEL); > + if (!umem) > + return ERR_PTR(-ENOMEM); > + umem->ibdev = device; > + umem->length = size; > + umem->address = addr; > + umem->writable = ib_access_writable(access); > + umem->owning_mm = mm = current->mm; > + mmgrab(mm); > + > + page_list = (struct page **) __get_free_page(GFP_KERNEL); > + if (!page_list) { > + ret = -ENOMEM; > + goto umem_kfree; > + } > + > + npages = ib_umem_num_pages(umem); > + if (npages == 0 || npages > UINT_MAX) { > + ret = -EINVAL; > + goto out; > + } > + > + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + > + new_pinned = atomic64_add_return(npages, &mm->pinned_vm); > + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { > + atomic64_sub(npages, &mm->pinned_vm); > + ret = -ENOMEM; > + goto out; > + } > + > + cur_base = addr & PAGE_MASK; > + > + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); > + if (ret) > + goto vma; > + > + if (!umem->writable) > + gup_flags |= FOLL_FORCE; > + > + sg = umem->sg_head.sgl; > + > + while (npages) { > + cond_resched(); > + ret = pin_user_pages_fast(cur_base, > + min_t(unsigned long, npages, > + PAGE_SIZE / > + sizeof(struct page *)), > + gup_flags | FOLL_LONGTERM, page_list); > + if (ret < 0) > + goto umem_release; > + > + cur_base += ret * PAGE_SIZE; > + npages -= ret; > + > + sg = ib_umem_add_sg_table(sg, page_list, ret, > + dma_get_max_seg_size(device->dma_device), > + &umem->sg_nents); > + } > + > + sg_mark_end(sg); > + > + if (access & IB_ACCESS_RELAXED_ORDERING) > + dma_attr |= DMA_ATTR_WEAK_ORDERING; > + > + umem->nmap = > + ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents, > + DMA_BIDIRECTIONAL, dma_attr); > + > + if (!umem->nmap) { > + ret = -ENOMEM; > + goto umem_release; > + } > + > + ret = 0; > + goto out; > + > +umem_release: > + __ib_umem_release(device, umem, 0); > +vma: > + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); > +out: > + free_page((unsigned long) page_list); > +umem_kfree: > + if (ret) { > + mmdrop(umem->owning_mm); > + kfree(umem); > + } > + return ret ? ERR_PTR(ret) : umem; > +} > +EXPORT_SYMBOL(ib_umem_hugepage_get); > + > /** > * ib_umem_find_best_pgsz - Find best HW page size to use for this MR > * > diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h > index 676c57f5ca80..fc6350ff21e6 100644 > --- a/include/rdma/ib_umem.h > +++ b/include/rdma/ib_umem.h > @@ -96,6 +96,9 @@ static inline void > __rdma_umem_block_iter_start(struct ib_block_iter *biter, > __rdma_block_iter_next(biter);) > > #ifdef CONFIG_INFINIBAND_USER_MEM > +struct ib_umem *ib_umem_hugepage_get(struct ib_device *device, > + unsigned long addr, > + size_t size, int access); > > struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, > size_t size, int access); > -- > 2.25.1
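For readers skimming the patch above: the heart of ib_umem_add_sg_table is a run-length merge of physically contiguous pages, capped at max_seg_sz bytes per scatterlist entry. The following stand-alone user-space sketch models only that merge step; the pfn array, the seg struct and the sample values are invented for illustration and are not part of the posted patch.

"
/* Stand-alone model of the contiguous-page merge done by
 * ib_umem_add_sg_table.  The pfn values, the seg struct and the
 * max_seg_sz choice are illustrative only.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct seg {
	unsigned long first_pfn;
	unsigned long len;		/* bytes */
};

/* Coalesce runs of contiguous pfns into segments of at most max_seg_sz. */
static int merge_pfns(const unsigned long *pfn, unsigned long npages,
		      unsigned long max_seg_sz, struct seg *out)
{
	unsigned long i = 0;
	int nents = 0;

	while (i < npages) {
		unsigned long start = i, len;

		for (len = 0; i < npages &&
		     pfn[start] + len == pfn[i] &&
		     len < (max_seg_sz >> PAGE_SHIFT); len++)
			i++;

		out[nents].first_pfn = pfn[start];
		out[nents].len = len << PAGE_SHIFT;
		nents++;
	}
	return nents;
}

int main(void)
{
	/* Two non-adjacent runs of 512 contiguous pages (2 MiB each),
	 * with pfns matching the addresses quoted in the commit message.
	 */
	unsigned long pfn[1024];
	struct seg seg[1024];
	int i, n;

	for (i = 0; i < 512; i++)
		pfn[i] = 0x203b400 + i;
	for (i = 0; i < 512; i++)
		pfn[512 + i] = 0x213b200 + i;

	n = merge_pfns(pfn, 1024, 1UL << 31, seg);	/* large cap */
	for (i = 0; i < n; i++)
		printf("seg %d: addr 0x%lx len %lu KiB\n", i,
		       seg[i].first_pfn << PAGE_SHIFT, seg[i].len >> 10);
	return 0;
}
"

With a large cap the two runs come out as one 2 MiB segment each, like the second listing in the commit message. Pages that are not physically contiguous (as in the first listing, whose addresses are far apart) can never be merged, regardless of the cap.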
On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote: > From: Zhu Yanjun <zyjzyj2000@gmail.com> > > After the commit ("RDMA/umem: Move to allocate SG table from pages"), > the sg list from ib_ume_get is like the following: > " > sg_dma_address(sg):0x4b3c1ce000 > sg_dma_address(sg):0x4c3c1cd000 > sg_dma_address(sg):0x4d3c1cc000 > sg_dma_address(sg):0x4e3c1cb000 > " > > But sometimes, we need sg list like the following: > " > sg_dma_address(sg):0x203b400000 > sg_dma_address(sg):0x213b200000 > sg_dma_address(sg):0x223b000000 > sg_dma_address(sg):0x233ae00000 > sg_dma_address(sg):0x243ac00000 > " > The function ib_umem_add_sg_table can provide the sg list like the > second. And this function is removed in the commit ("RDMA/umem: Move > to allocate SG table from pages"). Now I add it back. > > The new function is ib_umem_get to ib_umem_hugepage_get that calls > ib_umem_add_sg_table. > > This function ib_umem_huagepage_get can get 4K, 2M sg list dma address. > > Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages") > Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com> > --- > drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++ > include/rdma/ib_umem.h | 3 + > 2 files changed, 200 insertions(+) You didn't use newly introduced function and didn't really explain why it is needed. Thanks
On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote:
> From: Zhu Yanjun <zyjzyj2000@gmail.com>
>
> After the commit ("RDMA/umem: Move to allocate SG table from pages"),
> the sg list from ib_ume_get is like the following:

If you are not getting huge pages then you need to fix
__sg_alloc_table_from_pages()

But as far as I know it is not buggy like you are describing.

> The new function is ib_umem_get to ib_umem_hugepage_get that calls
> ib_umem_add_sg_table.

Nope

Jason
On Mon, Mar 8, 2021 at 1:22 AM Leon Romanovsky <leon@kernel.org> wrote: > > On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote: > > From: Zhu Yanjun <zyjzyj2000@gmail.com> > > > > After the commit ("RDMA/umem: Move to allocate SG table from pages"), > > the sg list from ib_ume_get is like the following: > > " > > sg_dma_address(sg):0x4b3c1ce000 > > sg_dma_address(sg):0x4c3c1cd000 > > sg_dma_address(sg):0x4d3c1cc000 > > sg_dma_address(sg):0x4e3c1cb000 > > " > > > > But sometimes, we need sg list like the following: > > " > > sg_dma_address(sg):0x203b400000 > > sg_dma_address(sg):0x213b200000 > > sg_dma_address(sg):0x223b000000 > > sg_dma_address(sg):0x233ae00000 > > sg_dma_address(sg):0x243ac00000 > > " > > The function ib_umem_add_sg_table can provide the sg list like the > > second. And this function is removed in the commit ("RDMA/umem: Move > > to allocate SG table from pages"). Now I add it back. > > > > The new function is ib_umem_get to ib_umem_hugepage_get that calls > > ib_umem_add_sg_table. > > > > This function ib_umem_huagepage_get can get 4K, 2M sg list dma address. > > > > Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages") > > Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com> > > --- > > drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++ > > include/rdma/ib_umem.h | 3 + > > 2 files changed, 200 insertions(+) > > You didn't use newly introduced function and didn't really explain why > it is needed. OK. I will explain later. Zhu Yanjun > > Thanks
On Mon, Mar 8, 2021 at 8:44 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote: > > From: Zhu Yanjun <zyjzyj2000@gmail.com> > > > > After the commit ("RDMA/umem: Move to allocate SG table from pages"), > > the sg list from ib_ume_get is like the following: > > If you are not getting huge pages then you need to fix > __sg_alloc_table_from_pages() > > But as far as I know it is not buggy like you are describing. In the same host and the same test, both __sg_alloc_table_from_pages and ib_umem_add_sg_table are called at the same time, but the returned sg lists are different. It is weird. Zhu Yanjun > > > The new function is ib_umem_get to ib_umem_hugepage_get that calls > > ib_umem_add_sg_table. > > Nope > > Jason
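The listings being compared here are easy to reproduce with a few lines of instrumentation. A minimal sketch, assuming a struct ib_umem *umem that has already been pinned and DMA-mapped; this helper is illustrative and is not part of any posted patch.

"
#include <linux/printk.h>
#include <linux/scatterlist.h>
#include <rdma/ib_umem.h>

/* Print every DMA segment of a mapped umem (illustration only). */
static void dump_umem_sgl(struct ib_umem *umem)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		pr_info("sg_dma_address(sg):0x%llx len:0x%x\n",
			(unsigned long long)sg_dma_address(sg),
			sg_dma_len(sg));
}
"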
On Mon, Mar 8, 2021 at 1:22 AM Leon Romanovsky <leon@kernel.org> wrote: > > On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote: > > From: Zhu Yanjun <zyjzyj2000@gmail.com> > > > > After the commit ("RDMA/umem: Move to allocate SG table from pages"), > > the sg list from ib_ume_get is like the following: > > " > > sg_dma_address(sg):0x4b3c1ce000 > > sg_dma_address(sg):0x4c3c1cd000 > > sg_dma_address(sg):0x4d3c1cc000 > > sg_dma_address(sg):0x4e3c1cb000 > > " > > > > But sometimes, we need sg list like the following: > > " > > sg_dma_address(sg):0x203b400000 > > sg_dma_address(sg):0x213b200000 > > sg_dma_address(sg):0x223b000000 > > sg_dma_address(sg):0x233ae00000 > > sg_dma_address(sg):0x243ac00000 > > " > > The function ib_umem_add_sg_table can provide the sg list like the > > second. And this function is removed in the commit ("RDMA/umem: Move > > to allocate SG table from pages"). Now I add it back. > > > > The new function is ib_umem_get to ib_umem_hugepage_get that calls > > ib_umem_add_sg_table. > > > > This function ib_umem_huagepage_get can get 4K, 2M sg list dma address. > > > > Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages") > > Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com> > > --- > > drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++ > > include/rdma/ib_umem.h | 3 + > > 2 files changed, 200 insertions(+) > > You didn't use newly introduced function and didn't really explain why Thanks Leon and Jason. Now I set ib_dma_max_seg_size to SZ_2M. Then this problem is fixed. This problem is caused by the capacity parameter of our device. Originally the ib_dma_max_seg_size is set to UINT_MAX. Then the sg dma address is different from the ones before the commit ("RDMA/umem: Move to allocate SG table from pages"). And I delved into the source code of __sg_alloc_table_from_pages. I found that this function is related with ib_dma_max_seg_size. So when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma address is 2M now. The function ib_umem_add_sg_table is not related with ib_dma_max_seg_size. So the value of ib_dma_max_seg_size does not affect sg dma address. Anyway, this problem is not caused by the function __sg_alloc_table_from_pages. Please ignore this commit. Zhu Yanjun > it is needed. > > Thanks
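For context on the "set ib_dma_max_seg_size to SZ_2M" workaround: ib_dma_max_seg_size() just reads the DMA segment limit of the underlying struct device, so the cap is normally applied by the provider driver at probe time. A hedged sketch; the pci_dev pointer is a placeholder.

"
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <linux/sizes.h>

/* Cap DMA segments at 2 MiB so that __sg_alloc_table_from_pages() (and
 * hence ib_umem_get()) never builds a scatterlist entry larger than the
 * device can express.  "pdev" is a placeholder device pointer.
 */
static int example_limit_dma_seg(struct pci_dev *pdev)
{
	return dma_set_max_seg_size(&pdev->dev, SZ_2M);
}
"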
On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:

> And I delved into the source code of __sg_alloc_table_from_pages. I
> found that this function is related with ib_dma_max_seg_size. So
> when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> address is 2M now.

That seems like a bug, you should fix it

Jason
On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote: > > > And I delved into the source code of __sg_alloc_table_from_pages. I > > found that this function is related with ib_dma_max_seg_size. So > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma > > address is 2M now. > > That seems like a bug, you should fix it Hi, Jason && Leon I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table. In __sg_alloc_table_from_pages: " 449 if (prv) { 450 unsigned long paddr = (page_to_pfn(sg_page(prv)) * PAGE_SIZE + 451 prv->offset + prv->length) / 452 PAGE_SIZE; 453 454 if (WARN_ON(offset)) 455 return ERR_PTR(-EINVAL); 456 457 /* Merge contiguous pages into the last SG */ 458 prv_len = prv->length; 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { 460 if (prv->length + PAGE_SIZE > max_segment) 461 break; 462 prv->length += PAGE_SIZE; 463 paddr++; 464 pages++; 465 n_pages--; 466 } 467 if (!n_pages) 468 goto out; 469 } " if prv->length + PAGE_SIZE > max_segment, then set another sg. In the commit "RDMA/umem: Move to allocate SG table from pages", max_segment is dma_get_max_seg_size. Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is usually less than max_segment since length is unsigned int. So the following has very few chance to execute. " 485 for (i = 0; i < chunks; i++) { ... 490 for (j = cur_page + 1; j < n_pages; j++) { 491 seg_len += PAGE_SIZE; 492 if (seg_len >= max_segment || 493 page_to_pfn(pages[j]) != 494 page_to_pfn(pages[j - 1]) + 1) 495 break; 496 } 497 498 /* Pass how many chunks might be left */ 499 s = get_next_sg(sgt, s, chunks - i + left_pages, gfp_mask); ... 510 sg_set_page(s, pages[cur_page], 511 min_t(unsigned long, size, chunk_size), offset); 512 added_nents++; ... 516 } " In the function ib_umem_add_sg_table, max_segment is used like the below: " /* Squash N contiguous pages from page_list into current sge */ if (update_cur_sg) { if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) { sg_set_page(sg, sg_page(sg), sg->length + (len << PAGE_SHIFT), 0); update_cur_sg = false; continue; } update_cur_sg = false; } /* Squash N contiguous pages into next sge or first sge */ if (!first) sg = sg_next(sg); (*nents)++; sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); " if (max_seg_sz - sg->length) >= (len << PAGE_SHIFT), the function sg_next and sg_set_page are called several times. The different usage of max_seg_sz/max_segment is the root cause that sg_dma_addresses are different from the function __sg_alloc_table_from_pages and ib_umem_add_sg_table. The following commit tries to fix this problem, please comment on it. From 65472ef21146b0fb72d5eb3e2fe1277380d29446 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun <zyjzyj2000@gmail.com> Date: Thu, 11 Mar 2021 12:35:40 -0500 Subject: [PATCH 1/1] RDMA/umem: fix different sg_dma_address problem After the commit 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages"), the returned sg list has the different dma addresses compared with the removed the function ib_umem_add_sg_table. The root cause is that max_seg_sz/max_segment has different usage in the function ib_umem_add_sg_table and __sg_alloc_table_from_pages. 
Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages") Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com> --- drivers/infiniband/core/umem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 2dde99a9ba07..71188edbb45f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -232,7 +232,9 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, npages -= ret; sg = __sg_alloc_table_from_pages(&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT, - ib_dma_max_seg_size(device), sg, npages, + min_t(unsigned int, + ib_dma_max_seg_size(device), ret * PAGE_SIZE), + sg, npages, GFP_KERNEL); umem->sg_nents = umem->sg_head.nents; if (IS_ERR(sg)) { -- 2.27.0 > > Jason
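Independent of whether the patch above is the right fix, the effect of the max_segment cap on a fully contiguous region is easy to see in isolation. A user-space sketch with invented numbers; it illustrates the cap only and does not validate the proposed min_t() clamp.

"
#include <stdio.h>

/* Split one physically contiguous region into segments no longer than
 * max_segment and report how many scatterlist entries would result.
 */
static void split(unsigned long bytes, unsigned long max_segment)
{
	int nents = 0;

	while (bytes) {
		unsigned long chunk = bytes < max_segment ? bytes : max_segment;

		bytes -= chunk;
		nents++;
	}
	printf("cap %10lu -> %d segment(s)\n", max_segment, nents);
}

int main(void)
{
	split(4UL << 20, (unsigned long)~0u);	/* UINT_MAX cap: 1 x 4 MiB */
	split(4UL << 20, 2UL << 20);		/* SZ_2M cap:    2 x 2 MiB */
	return 0;
}
"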
On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote: > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote: > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I > > > found that this function is related with ib_dma_max_seg_size. So > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma > > > address is 2M now. > > > > That seems like a bug, you should fix it > > Hi, Jason && Leon > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table. > In __sg_alloc_table_from_pages: > > " > 449 if (prv) { > 450 unsigned long paddr = (page_to_pfn(sg_page(prv)) > * PAGE_SIZE + > 451 prv->offset + prv->length) / > 452 PAGE_SIZE; > 453 > 454 if (WARN_ON(offset)) > 455 return ERR_PTR(-EINVAL); > 456 > 457 /* Merge contiguous pages into the last SG */ > 458 prv_len = prv->length; > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > 460 if (prv->length + PAGE_SIZE > max_segment) > 461 break; > 462 prv->length += PAGE_SIZE; > 463 paddr++; > 464 pages++; > 465 n_pages--; > 466 } > 467 if (!n_pages) > 468 goto out; > 469 } > > " > if prv->length + PAGE_SIZE > max_segment, then set another sg. > In the commit "RDMA/umem: Move to allocate SG table from pages", > max_segment is dma_get_max_seg_size. > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is > usually less than max_segment > since length is unsigned int. I don't understand what you are trying to say 460 if (prv->length + PAGE_SIZE > max_segment) max_segment should be a very big number and "prv->length + PAGE_SIZE" should always be < max_segment so it should always be increasing the size of prv->length and 'rpv' here is the sgl. The other loops are the same. Jason
On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote: > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote: > > > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I > > > > found that this function is related with ib_dma_max_seg_size. So > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma > > > > address is 2M now. > > > > > > That seems like a bug, you should fix it > > > > Hi, Jason && Leon > > > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table. > > In __sg_alloc_table_from_pages: > > > > " > > 449 if (prv) { > > 450 unsigned long paddr = (page_to_pfn(sg_page(prv)) > > * PAGE_SIZE + > > 451 prv->offset + prv->length) / > > 452 PAGE_SIZE; > > 453 > > 454 if (WARN_ON(offset)) > > 455 return ERR_PTR(-EINVAL); > > 456 > > 457 /* Merge contiguous pages into the last SG */ > > 458 prv_len = prv->length; > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > 460 if (prv->length + PAGE_SIZE > max_segment) > > 461 break; > > 462 prv->length += PAGE_SIZE; > > 463 paddr++; > > 464 pages++; > > 465 n_pages--; > > 466 } > > 467 if (!n_pages) > > 468 goto out; > > 469 } > > > > " > > if prv->length + PAGE_SIZE > max_segment, then set another sg. > > In the commit "RDMA/umem: Move to allocate SG table from pages", > > max_segment is dma_get_max_seg_size. > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is > > usually less than max_segment > > since length is unsigned int. > > I don't understand what you are trying to say > > 460 if (prv->length + PAGE_SIZE > max_segment) > > max_segment should be a very big number and "prv->length + PAGE_SIZE" should > always be < max_segment so it should always be increasing the size of > prv->length and 'rpv' here is the sgl. Sure. When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE. It is big. That is, segment size is UINT_MAX - PAGE_SIZE. But from the function ib_umem_add_sg_table, the prv->length is SZ_2M. That is, segment size if SZ_2M. It is the difference between the 2 functions. Zhu Yanjun > > The other loops are the same. > > Jason
On Fri, Mar 12, 2021 at 4:04 PM Zhu Yanjun <zyjzyj2000@gmail.com> wrote: > > On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote: > > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote: > > > > > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I > > > > > found that this function is related with ib_dma_max_seg_size. So > > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is > > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma > > > > > address is 2M now. > > > > > > > > That seems like a bug, you should fix it > > > > > > Hi, Jason && Leon > > > > > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table. > > > In __sg_alloc_table_from_pages: > > > > > > " > > > 449 if (prv) { > > > 450 unsigned long paddr = (page_to_pfn(sg_page(prv)) > > > * PAGE_SIZE + > > > 451 prv->offset + prv->length) / > > > 452 PAGE_SIZE; > > > 453 > > > 454 if (WARN_ON(offset)) > > > 455 return ERR_PTR(-EINVAL); > > > 456 > > > 457 /* Merge contiguous pages into the last SG */ > > > 458 prv_len = prv->length; > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > 461 break; > > > 462 prv->length += PAGE_SIZE; > > > 463 paddr++; > > > 464 pages++; > > > 465 n_pages--; > > > 466 } > > > 467 if (!n_pages) > > > 468 goto out; > > > 469 } > > > > > > " > > > if prv->length + PAGE_SIZE > max_segment, then set another sg. > > > In the commit "RDMA/umem: Move to allocate SG table from pages", > > > max_segment is dma_get_max_seg_size. > > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is > > > usually less than max_segment > > > since length is unsigned int. > > > > I don't understand what you are trying to say > > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > > max_segment should be a very big number and "prv->length + PAGE_SIZE" should > > always be < max_segment so it should always be increasing the size of > > prv->length and 'rpv' here is the sgl. > > Sure. > > When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE. > It is big. That is, segment size is UINT_MAX - PAGE_SIZE. > > But from the function ib_umem_add_sg_table, the prv->length is SZ_2M. > That is, segment size if SZ_2M. > > It is the difference between the 2 functions. BTW, my commit is to fix this difference. Zhu Yanjun > > Zhu Yanjun > > > > > > The other loops are the same. > > > > Jason
On Fri, Mar 12, 2021 at 04:04:35PM +0800, Zhu Yanjun wrote: > On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote: > > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote: > > > > > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I > > > > > found that this function is related with ib_dma_max_seg_size. So > > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is > > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma > > > > > address is 2M now. > > > > > > > > That seems like a bug, you should fix it > > > > > > Hi, Jason && Leon > > > > > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table. > > > In __sg_alloc_table_from_pages: > > > > > > " > > > 449 if (prv) { > > > 450 unsigned long paddr = (page_to_pfn(sg_page(prv)) > > > * PAGE_SIZE + > > > 451 prv->offset + prv->length) / > > > 452 PAGE_SIZE; > > > 453 > > > 454 if (WARN_ON(offset)) > > > 455 return ERR_PTR(-EINVAL); > > > 456 > > > 457 /* Merge contiguous pages into the last SG */ > > > 458 prv_len = prv->length; > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > 461 break; > > > 462 prv->length += PAGE_SIZE; > > > 463 paddr++; > > > 464 pages++; > > > 465 n_pages--; > > > 466 } > > > 467 if (!n_pages) > > > 468 goto out; > > > 469 } > > > > > > " > > > if prv->length + PAGE_SIZE > max_segment, then set another sg. > > > In the commit "RDMA/umem: Move to allocate SG table from pages", > > > max_segment is dma_get_max_seg_size. > > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is > > > usually less than max_segment > > > since length is unsigned int. > > > > I don't understand what you are trying to say > > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > > max_segment should be a very big number and "prv->length + PAGE_SIZE" should > > always be < max_segment so it should always be increasing the size of > > prv->length and 'rpv' here is the sgl. > > Sure. > > When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE. > It is big. That is, segment size is UINT_MAX - PAGE_SIZE. > > But from the function ib_umem_add_sg_table, the prv->length is SZ_2M. > That is, segment size if SZ_2M. > > It is the difference between the 2 functions. I still have no idea what you are trying to say. Why would prv->length be 'UINT - PAGE_SIZE'? That sounds like max_segment Jason
On Fri, Mar 12, 2021 at 9:02 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Fri, Mar 12, 2021 at 04:04:35PM +0800, Zhu Yanjun wrote: > > On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote: > > > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote: > > > > > > > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I > > > > > > found that this function is related with ib_dma_max_seg_size. So > > > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is > > > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma > > > > > > address is 2M now. > > > > > > > > > > That seems like a bug, you should fix it > > > > > > > > Hi, Jason && Leon > > > > > > > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table. > > > > In __sg_alloc_table_from_pages: > > > > > > > > " > > > > 449 if (prv) { > > > > 450 unsigned long paddr = (page_to_pfn(sg_page(prv)) > > > > * PAGE_SIZE + > > > > 451 prv->offset + prv->length) / > > > > 452 PAGE_SIZE; > > > > 453 > > > > 454 if (WARN_ON(offset)) > > > > 455 return ERR_PTR(-EINVAL); > > > > 456 > > > > 457 /* Merge contiguous pages into the last SG */ > > > > 458 prv_len = prv->length; > > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > > 461 break; > > > > 462 prv->length += PAGE_SIZE; > > > > 463 paddr++; > > > > 464 pages++; > > > > 465 n_pages--; > > > > 466 } > > > > 467 if (!n_pages) > > > > 468 goto out; > > > > 469 } > > > > > > > > " > > > > if prv->length + PAGE_SIZE > max_segment, then set another sg. > > > > In the commit "RDMA/umem: Move to allocate SG table from pages", > > > > max_segment is dma_get_max_seg_size. > > > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is > > > > usually less than max_segment > > > > since length is unsigned int. > > > > > > I don't understand what you are trying to say > > > > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > > > > max_segment should be a very big number and "prv->length + PAGE_SIZE" should > > > always be < max_segment so it should always be increasing the size of > > > prv->length and 'rpv' here is the sgl. > > > > Sure. > > > > When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE. > > It is big. That is, segment size is UINT_MAX - PAGE_SIZE. > > > > But from the function ib_umem_add_sg_table, the prv->length is SZ_2M. > > That is, segment size if SZ_2M. > > > > It is the difference between the 2 functions. > > I still have no idea what you are trying to say. Why would prv->length > be 'UINT - PAGE_SIZE'? That sounds like max_segment Sure. whatever, this prv->length from __sg_alloc_table_from_pages is greater than sg->length from the function ib_umem_add_sg_table. __sg_alloc_table_from_pages: " 457 /* Merge contiguous pages into the last SG */ 458 prv_len = prv->length; 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { 460 if (prv->length + PAGE_SIZE > max_segment) <-------------- prv->length > max_segment - PAGE_SIZE 461 break; 462 prv->length += PAGE_SIZE; 463 paddr++; 464 pages++; 465 n_pages--; 466 } 467 if (!n_pages) 468 goto out; " Zhu Yanjun > > Jason
On Fri, Mar 12, 2021 at 9:42 PM Zhu Yanjun <zyjzyj2000@gmail.com> wrote: > > On Fri, Mar 12, 2021 at 9:02 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Fri, Mar 12, 2021 at 04:04:35PM +0800, Zhu Yanjun wrote: > > > On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote: > > > > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote: > > > > > > > > > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I > > > > > > > found that this function is related with ib_dma_max_seg_size. So > > > > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is > > > > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma > > > > > > > address is 2M now. > > > > > > > > > > > > That seems like a bug, you should fix it > > > > > > > > > > Hi, Jason && Leon > > > > > > > > > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table. > > > > > In __sg_alloc_table_from_pages: > > > > > > > > > > " > > > > > 449 if (prv) { > > > > > 450 unsigned long paddr = (page_to_pfn(sg_page(prv)) > > > > > * PAGE_SIZE + > > > > > 451 prv->offset + prv->length) / > > > > > 452 PAGE_SIZE; > > > > > 453 > > > > > 454 if (WARN_ON(offset)) > > > > > 455 return ERR_PTR(-EINVAL); > > > > > 456 > > > > > 457 /* Merge contiguous pages into the last SG */ > > > > > 458 prv_len = prv->length; > > > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > > > 461 break; > > > > > 462 prv->length += PAGE_SIZE; > > > > > 463 paddr++; > > > > > 464 pages++; > > > > > 465 n_pages--; > > > > > 466 } > > > > > 467 if (!n_pages) > > > > > 468 goto out; > > > > > 469 } > > > > > > > > > > " > > > > > if prv->length + PAGE_SIZE > max_segment, then set another sg. > > > > > In the commit "RDMA/umem: Move to allocate SG table from pages", > > > > > max_segment is dma_get_max_seg_size. > > > > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is > > > > > usually less than max_segment > > > > > since length is unsigned int. > > > > > > > > I don't understand what you are trying to say > > > > > > > > 460 if (prv->length + PAGE_SIZE > max_segment) > > > > > > > > max_segment should be a very big number and "prv->length + PAGE_SIZE" should > > > > always be < max_segment so it should always be increasing the size of > > > > prv->length and 'rpv' here is the sgl. > > > > > > Sure. > > > > > > When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE. > > > It is big. That is, segment size is UINT_MAX - PAGE_SIZE. > > > > > > But from the function ib_umem_add_sg_table, the prv->length is SZ_2M. > > > That is, segment size if SZ_2M. > > > > > > It is the difference between the 2 functions. > > > > I still have no idea what you are trying to say. Why would prv->length > > be 'UINT - PAGE_SIZE'? That sounds like max_segment > > Sure. whatever, this prv->length from __sg_alloc_table_from_pages > is greater than sg->length from the function ib_umem_add_sg_table. In short, the sg list from __sg_alloc_table_from_pages is different from the sg list from ib_umem_add_sg_table. This difference will make difference on the later behavior. 
Zhu Yanjun > > __sg_alloc_table_from_pages: > " > 457 /* Merge contiguous pages into the last SG */ > 458 prv_len = prv->length; > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > 460 if (prv->length + PAGE_SIZE > > max_segment) <-------------- prv->length > max_segment - PAGE_SIZE > 461 break; > 462 prv->length += PAGE_SIZE; > 463 paddr++; > 464 pages++; > 465 n_pages--; > 466 } > 467 if (!n_pages) > 468 goto out; > " > > Zhu Yanjun > > > > Jason
On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:

> In short, the sg list from __sg_alloc_table_from_pages is different
> from the sg list from ib_umem_add_sg_table.

I don't care about different. Tell me what is wrong with what we have
today.

I thought your first message said the sgl's were too small, but now
you seem to say they are too big?

I still have no idea what this problem is

Jason
On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > In short, the sg list from __sg_alloc_table_from_pages is different > > from the sg list from ib_umem_add_sg_table. > > I don't care about different. Tell me what is wrong with what we have > today. > > I thought your first message said the sgl's were too small, but now > you seem to say they are too big? Sure. The sg list from __sg_alloc_table_from_pages, length of sg is too big. And the dma address is like the followings: " sg_dma_address(sg):0x4b3c1ce000 sg_dma_address(sg):0x4c3c1cd000 sg_dma_address(sg):0x4d3c1cc000 sg_dma_address(sg):0x4e3c1cb000 " The sg list from ib_umem_add_sg_table, length of sg is 2M. And the dma address is like the followings: " sg_dma_address(sg):0x203b400000 sg_dma_address(sg):0x213b200000 sg_dma_address(sg):0x223b000000 sg_dma_address(sg):0x233ae00000 sg_dma_address(sg):0x243ac00000 " The above 2 sg lists will be handled in this function ib_umem_find_best_pgsz. The returned values are different. Zhu Yanjun > > I still have no idea what this problem is > > Jason
On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote: > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > > In short, the sg list from __sg_alloc_table_from_pages is different > > > from the sg list from ib_umem_add_sg_table. > > > > I don't care about different. Tell me what is wrong with what we have > > today. > > > > I thought your first message said the sgl's were too small, but now > > you seem to say they are too big? > > Sure. > > The sg list from __sg_alloc_table_from_pages, length of sg is too big. > And the dma address is like the followings: > > " > sg_dma_address(sg):0x4b3c1ce000 > sg_dma_address(sg):0x4c3c1cd000 > sg_dma_address(sg):0x4d3c1cc000 > sg_dma_address(sg):0x4e3c1cb000 > " Ok, so how does too big a dma segment side cause __sg_alloc_table_from_pages() to return sg elements that are too small? I assume there is some kind of maths overflow here? Jason
On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote: > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > > > In short, the sg list from __sg_alloc_table_from_pages is different > > > > from the sg list from ib_umem_add_sg_table. > > > > > > I don't care about different. Tell me what is wrong with what we have > > > today. > > > > > > I thought your first message said the sgl's were too small, but now > > > you seem to say they are too big? > > > > Sure. > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big. > > And the dma address is like the followings: > > > > " > > sg_dma_address(sg):0x4b3c1ce000 > > sg_dma_address(sg):0x4c3c1cd000 > > sg_dma_address(sg):0x4d3c1cc000 > > sg_dma_address(sg):0x4e3c1cb000 > > " > > Ok, so how does too big a dma segment side cause > __sg_alloc_table_from_pages() to return sg elements that are too > small? > > I assume there is some kind of maths overflow here? Please check this function __sg_alloc_table_from_pages " ... 457 /* Merge contiguous pages into the last SG */ 458 prv_len = prv->length; 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { 460 if (prv->length + PAGE_SIZE > max_segment) <--max_segment is too big. So n_pages will be 0. Then the function will goto out to exit. 461 break; 462 prv->length += PAGE_SIZE; 463 paddr++; 464 pages++; 465 n_pages--; 466 } 467 if (!n_pages) 468 goto out; ... " > > Jason
On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote: > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote: > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > > > > In short, the sg list from __sg_alloc_table_from_pages is different > > > > > from the sg list from ib_umem_add_sg_table. > > > > > > > > I don't care about different. Tell me what is wrong with what we have > > > > today. > > > > > > > > I thought your first message said the sgl's were too small, but now > > > > you seem to say they are too big? > > > > > > Sure. > > > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big. > > > And the dma address is like the followings: > > > > > > " > > > sg_dma_address(sg):0x4b3c1ce000 > > > sg_dma_address(sg):0x4c3c1cd000 > > > sg_dma_address(sg):0x4d3c1cc000 > > > sg_dma_address(sg):0x4e3c1cb000 > > > " > > > > Ok, so how does too big a dma segment side cause > > __sg_alloc_table_from_pages() to return sg elements that are too > > small? > > > > I assume there is some kind of maths overflow here? > Please check this function __sg_alloc_table_from_pages > " > ... > 457 /* Merge contiguous pages into the last SG */ > 458 prv_len = prv->length; > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > 460 if (prv->length + PAGE_SIZE > > max_segment) <--max_segment is too big. So n_pages will be 0. Then > the function will goto out to exit. You already said this. You are reporting 4k pages, if max_segment is larger than 4k there is no such thing as "too big" I assume it is "too small" because of some maths overflow. You should add some prints and find out what is going on. Jason
On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote: > > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote: > > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > > > > > In short, the sg list from __sg_alloc_table_from_pages is different > > > > > > from the sg list from ib_umem_add_sg_table. > > > > > > > > > > I don't care about different. Tell me what is wrong with what we have > > > > > today. > > > > > > > > > > I thought your first message said the sgl's were too small, but now > > > > > you seem to say they are too big? > > > > > > > > Sure. > > > > > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big. > > > > And the dma address is like the followings: > > > > > > > > " > > > > sg_dma_address(sg):0x4b3c1ce000 > > > > sg_dma_address(sg):0x4c3c1cd000 > > > > sg_dma_address(sg):0x4d3c1cc000 > > > > sg_dma_address(sg):0x4e3c1cb000 > > > > " > > > > > > Ok, so how does too big a dma segment side cause > > > __sg_alloc_table_from_pages() to return sg elements that are too > > > small? > > > > > > I assume there is some kind of maths overflow here? > > Please check this function __sg_alloc_table_from_pages > > " > > ... > > 457 /* Merge contiguous pages into the last SG */ > > 458 prv_len = prv->length; > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > 460 if (prv->length + PAGE_SIZE > > > max_segment) <--max_segment is too big. So n_pages will be 0. Then > > the function will goto out to exit. > > You already said this. > > You are reporting 4k pages, if max_segment is larger than 4k there is > no such thing as "too big" > > I assume it is "too small" because of some maths overflow. 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { 460 if (prv->length + PAGE_SIZE > max_segment) <--it max_segment is big, n_pages is zero. 461 break; 462 prv->length += PAGE_SIZE; 463 paddr++; 464 pages++; 465 n_pages--; 466 } 467 if (!n_pages) <---here, this function will goto out. 468 goto out; ... 509 chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset; 510 sg_set_page(s, pages[cur_page], 511 min_t(unsigned long, size, chunk_size), offset); <----this function will not have many chance to be called if max_segment is big. 512 added_nents++; 513 size -= chunk_size; If the max_segment is not big enough, for example it is SZ-2M, sg_set_page will be called every SZ_2M. To now, I do not find any math overflow. Zhu Yanjun > > You should add some prints and find out what is going on. > > Jason
On Sat, Mar 20, 2021 at 11:38 AM Zhu Yanjun <zyjzyj2000@gmail.com> wrote: > > On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote: > > > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote: > > > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > > > > > > In short, the sg list from __sg_alloc_table_from_pages is different > > > > > > > from the sg list from ib_umem_add_sg_table. > > > > > > > > > > > > I don't care about different. Tell me what is wrong with what we have > > > > > > today. > > > > > > > > > > > > I thought your first message said the sgl's were too small, but now > > > > > > you seem to say they are too big? > > > > > > > > > > Sure. > > > > > > > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big. > > > > > And the dma address is like the followings: > > > > > > > > > > " > > > > > sg_dma_address(sg):0x4b3c1ce000 > > > > > sg_dma_address(sg):0x4c3c1cd000 > > > > > sg_dma_address(sg):0x4d3c1cc000 > > > > > sg_dma_address(sg):0x4e3c1cb000 > > > > > " > > > > > > > > Ok, so how does too big a dma segment side cause > > > > __sg_alloc_table_from_pages() to return sg elements that are too > > > > small? > > > > > > > > I assume there is some kind of maths overflow here? > > > Please check this function __sg_alloc_table_from_pages > > > " > > > ... > > > 457 /* Merge contiguous pages into the last SG */ > > > 458 prv_len = prv->length; > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > 460 if (prv->length + PAGE_SIZE > > > > max_segment) <--max_segment is too big. So n_pages will be 0. Then > > > the function will goto out to exit. > > > > You already said this. > > > > You are reporting 4k pages, if max_segment is larger than 4k there is > > no such thing as "too big" > > > > I assume it is "too small" because of some maths overflow. > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > 460 if (prv->length + PAGE_SIZE > > max_segment) <--it max_segment is big, n_pages is zero. > 461 break; > 462 prv->length += PAGE_SIZE; > 463 paddr++; > 464 pages++; > 465 n_pages--; > 466 } > 467 if (!n_pages) <---here, this function will goto out. > 468 goto out; > ... > 509 chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset; > 510 sg_set_page(s, pages[cur_page], > 511 min_t(unsigned long, size, > chunk_size), offset); <----this function will not have many chance to > be called if max_segment is big. > 512 added_nents++; > 513 size -= chunk_size; > > If the max_segment is not big enough, for example it is SZ-2M, > sg_set_page will be called every SZ_2M. > To now, I do not find any math overflow. 147 struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, 148 size_t size, int access) 149 { ... 244 umem->nmap = 245 ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents, 246 DMA_BIDIRECTIONAL, dma_attr); ... And after the function ib_dma_map_sg_attrs, dma address is set. To now, I can not find maths overflow. Zhu Yanjun > > Zhu Yanjun > > > > You should add some prints and find out what is going on. > > > > Jason
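One number worth pinning down from the ib_umem_get() excerpt above is the batch size of the pinning loop: each pin_user_pages_fast() call is limited to one page worth of struct page pointers. A small sketch of the arithmetic, assuming a 4 KiB PAGE_SIZE and 8-byte pointers.

"
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;	/* PAGE_SIZE on x86-64   */
	unsigned long ptr_size = 8;	/* sizeof(struct page *) */
	unsigned long batch = page_size / ptr_size;

	printf("pages per pin_user_pages_fast() call: %lu (%lu KiB)\n",
	       batch, batch * page_size / 1024);
	/* 512 pages, i.e. 2 MiB, per batch.  Contiguous pages are still
	 * merged into the previous sg entry across batches, so batching
	 * by itself does not cap the segment length.
	 */
	return 0;
}
"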
On Sat, Mar 20, 2021 at 11:38:26AM +0800, Zhu Yanjun wrote: > On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote: > > > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote: > > > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > > > > > > In short, the sg list from __sg_alloc_table_from_pages is different > > > > > > > from the sg list from ib_umem_add_sg_table. > > > > > > > > > > > > I don't care about different. Tell me what is wrong with what we have > > > > > > today. > > > > > > > > > > > > I thought your first message said the sgl's were too small, but now > > > > > > you seem to say they are too big? > > > > > > > > > > Sure. > > > > > > > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big. > > > > > And the dma address is like the followings: > > > > > > > > > > " > > > > > sg_dma_address(sg):0x4b3c1ce000 > > > > > sg_dma_address(sg):0x4c3c1cd000 > > > > > sg_dma_address(sg):0x4d3c1cc000 > > > > > sg_dma_address(sg):0x4e3c1cb000 > > > > > " > > > > > > > > Ok, so how does too big a dma segment side cause > > > > __sg_alloc_table_from_pages() to return sg elements that are too > > > > small? > > > > > > > > I assume there is some kind of maths overflow here? > > > Please check this function __sg_alloc_table_from_pages > > > " > > > ... > > > 457 /* Merge contiguous pages into the last SG */ > > > 458 prv_len = prv->length; > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > 460 if (prv->length + PAGE_SIZE > > > > max_segment) <--max_segment is too big. So n_pages will be 0. Then > > > the function will goto out to exit. > > > > You already said this. > > > > You are reporting 4k pages, if max_segment is larger than 4k there is > > no such thing as "too big" > > > > I assume it is "too small" because of some maths overflow. > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > 460 if (prv->length + PAGE_SIZE > > max_segment) <--it max_segment is big, n_pages is zero. What does n_pages have to do with max_segment? Please try to explain clearly. Jason
On Sun, Mar 21, 2021 at 4:38 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Sat, Mar 20, 2021 at 11:38:26AM +0800, Zhu Yanjun wrote: > > On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote: > > > > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote: > > > > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > > > > > > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote: > > > > > > > > In short, the sg list from __sg_alloc_table_from_pages is different > > > > > > > > from the sg list from ib_umem_add_sg_table. > > > > > > > > > > > > > > I don't care about different. Tell me what is wrong with what we have > > > > > > > today. > > > > > > > > > > > > > > I thought your first message said the sgl's were too small, but now > > > > > > > you seem to say they are too big? > > > > > > > > > > > > Sure. > > > > > > > > > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big. > > > > > > And the dma address is like the followings: > > > > > > > > > > > > " > > > > > > sg_dma_address(sg):0x4b3c1ce000 > > > > > > sg_dma_address(sg):0x4c3c1cd000 > > > > > > sg_dma_address(sg):0x4d3c1cc000 > > > > > > sg_dma_address(sg):0x4e3c1cb000 > > > > > > " > > > > > > > > > > Ok, so how does too big a dma segment side cause > > > > > __sg_alloc_table_from_pages() to return sg elements that are too > > > > > small? > > > > > > > > > > I assume there is some kind of maths overflow here? > > > > Please check this function __sg_alloc_table_from_pages > > > > " > > > > ... > > > > 457 /* Merge contiguous pages into the last SG */ > > > > 458 prv_len = prv->length; > > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > > 460 if (prv->length + PAGE_SIZE > > > > > max_segment) <--max_segment is too big. So n_pages will be 0. Then > > > > the function will goto out to exit. > > > > > > You already said this. > > > > > > You are reporting 4k pages, if max_segment is larger than 4k there is > > > no such thing as "too big" > > > > > > I assume it is "too small" because of some maths overflow. > > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > 460 if (prv->length + PAGE_SIZE > > > max_segment) <--it max_segment is big, n_pages is zero. > > What does n_pages have to do with max_segment? With the following snippet " struct ib_umem *region; region = ib_umem_get(pd->device, start, len, access); page_size = ib_umem_find_best_pgsz(region, SZ_4K | SZ_2M | SZ_1G, virt); " Before the commit 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages"), the variable page_size is SZ_2M. After the commit 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages"), the variable page_size is SZ_4K. IMHO, you can reproduce this problem in your local host. Zhu Yanjun > > Please try to explain clearly. > > Jason
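For completeness, the consumer pattern in which the page_size difference shows up looks roughly like the following. ib_umem_find_best_pgsz() and rdma_umem_for_each_dma_block() are the in-tree APIs; the function name and the supported page-size mask here are placeholders for illustration.

"
#include <linux/printk.h>
#include <linux/sizes.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

/* Pick the largest HW page size that covers every sg entry of the umem,
 * then walk the region in blocks of that size (illustration only).
 */
static int example_build_page_list(struct ib_umem *umem, u64 iova)
{
	struct ib_block_iter biter;
	unsigned long pg_sz;

	pg_sz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G, iova);
	if (!pg_sz)
		return -EINVAL;

	rdma_umem_for_each_dma_block(umem, &biter, pg_sz)
		pr_info("hw block: dma 0x%llx size %lu\n",
			(unsigned long long)rdma_block_iter_dma_address(&biter),
			pg_sz);
	return 0;
}
"

If an sg entry is only 4 KiB long, or is not 2 MiB aligned relative to the iova, ib_umem_find_best_pgsz() falls back to SZ_4K, which appears to be the difference being reported in this thread.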
On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote: > > > > You are reporting 4k pages, if max_segment is larger than 4k there is > > > > no such thing as "too big" > > > > > > > > I assume it is "too small" because of some maths overflow. > > > > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > 460 if (prv->length + PAGE_SIZE > > > > max_segment) <--it max_segment is big, n_pages is zero. > > > > What does n_pages have to do with max_segment? > > With the following snippet > " > struct ib_umem *region; > region = ib_umem_get(pd->device, start, len, access); > > page_size = ib_umem_find_best_pgsz(region, > SZ_4K | SZ_2M | SZ_1G, > virt); > " Can you please stop posting random bits of code that do nothing to answer the question? > IMHO, you can reproduce this problem in your local host. Uh, no, you need to communicate effectively or stop posting please. Jason
On Sun, Mar 21, 2021 at 8:07 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote: > > > > > You are reporting 4k pages, if max_segment is larger than 4k there is > > > > > no such thing as "too big" > > > > > > > > > > I assume it is "too small" because of some maths overflow. > > > > > > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > > 460 if (prv->length + PAGE_SIZE > > > > > max_segment) <--it max_segment is big, n_pages is zero. > > > > > > What does n_pages have to do with max_segment? > > > > With the following snippet > > " > > struct ib_umem *region; > > region = ib_umem_get(pd->device, start, len, access); > > > > page_size = ib_umem_find_best_pgsz(region, > > SZ_4K | SZ_2M | SZ_1G, > > virt); > > " > > Can you please stop posting random bits of code that do nothing to > answer the question? > > > IMHO, you can reproduce this problem in your local host. > > Uh, no, you need to communicate effectively or stop posting please. can you read the following link again https://patchwork.kernel.org/project/linux-rdma/patch/CAD=hENeqTTmpS5V+G646V0QvJFLVSd3Sq11ffQFcDXU-OSsQEg@mail.gmail.com/#24031643 In this link, I explained it in details. Since the max_segment is very big, so normally the prv->length + PAGE_SIZE is less than max_segment, it will loop over and over again, until n_pages is zero, then the function __sg_alloc_table_from_pages will exit. So the function __sg_alloc_table_from_pages has few chances to call the function sg_set_page. This behavior is not like the function ib_umem_add_sg_table. In ib_umem_add_sg_table, the "max_seg_sz - sg->length" has more chances to be greater than (len << PAGE_SHIFT). So the function sg_set_page is called more times than __sg_alloc_table_from_pages. The above causes the different sg lists from the 2 functions __sg_alloc_table_from_pages and ib_umem_add_sg_table. The most important thing is "if (prv->length + PAGE_SIZE > max_segment)" in __sg_alloc_table_from_pages and "if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT))" in the function ib_umem_add_sg_table. When max_segment is UINT_MAX, the different test in the 2 functions causes the different sg lists. Zhu Yanjun > > Jason
On Sun, Mar 21, 2021 at 08:54:31PM +0800, Zhu Yanjun wrote: > On Sun, Mar 21, 2021 at 8:07 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote: > > > > > > You are reporting 4k pages, if max_segment is larger than 4k there is > > > > > > no such thing as "too big" > > > > > > > > > > > > I assume it is "too small" because of some maths overflow. > > > > > > > > > > 459 while (n_pages && page_to_pfn(pages[0]) == paddr) { > > > > > 460 if (prv->length + PAGE_SIZE > > > > > > max_segment) <--it max_segment is big, n_pages is zero. > > > > > > > > What does n_pages have to do with max_segment? > > > > > > With the following snippet > > > " > > > struct ib_umem *region; > > > region = ib_umem_get(pd->device, start, len, access); > > > > > > page_size = ib_umem_find_best_pgsz(region, > > > SZ_4K | SZ_2M | SZ_1G, > > > virt); > > > " > > > > Can you please stop posting random bits of code that do nothing to > > answer the question? > > > > > IMHO, you can reproduce this problem in your local host. > > > > Uh, no, you need to communicate effectively or stop posting please. > > can you read the following link again > > https://patchwork.kernel.org/project/linux-rdma/patch/CAD=hENeqTTmpS5V+G646V0QvJFLVSd3Sq11ffQFcDXU-OSsQEg@mail.gmail.com/#24031643 > > In this link, I explained it in details. No, this is all nonsense > Since the max_segment is very big, so normally the prv->length + > PAGE_SIZE is less than max_segment, it will loop over and over again, > until n_pages is zero, And each iteration increases prv->length += PAGE_SIZE; n_pages--; Where prv is the last sgl entry, so decreasing n_pages cannot also yield a 4k sgl entry like you are reporting. You get 4k sgl entries if max_segment is *too small* not too big! Jason
On Sun, Mar 21, 2021 at 9:03 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sun, Mar 21, 2021 at 08:54:31PM +0800, Zhu Yanjun wrote:
> > On Sun, Mar 21, 2021 at 8:07 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote:
> > > > > > > You are reporting 4k pages, if max_segment is larger than 4k there is
> > > > > > > no such thing as "too big"
> > > > > > >
> > > > > > > I assume it is "too small" because of some maths overflow.
> > > > > >
> > > > > > 459         while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > > > > > 460                 if (prv->length + PAGE_SIZE > max_segment)  <--it max_segment is big, n_pages is zero.
> > > > >
> > > > > What does n_pages have to do with max_segment?
> > > >
> > > > With the following snippet
> > > > "
> > > > struct ib_umem *region;
> > > > region = ib_umem_get(pd->device, start, len, access);
> > > >
> > > > page_size = ib_umem_find_best_pgsz(region,
> > > >                                    SZ_4K | SZ_2M | SZ_1G,
> > > >                                    virt);
> > > > "
> > >
> > > Can you please stop posting random bits of code that do nothing to
> > > answer the question?
> > >
> > > > IMHO, you can reproduce this problem in your local host.
> > >
> > > Uh, no, you need to communicate effectively or stop posting please.
> >
> > can you read the following link again
> >
> > https://patchwork.kernel.org/project/linux-rdma/patch/CAD=hENeqTTmpS5V+G646V0QvJFLVSd3Sq11ffQFcDXU-OSsQEg@mail.gmail.com/#24031643
> >
> > In this link, I explained it in details.
>
> No, this is all nonsense
>
> > Since the max_segment is very big, so normally the prv->length +
> > PAGE_SIZE is less than max_segment, it will loop over and over again,
> > until n_pages is zero,
>
> And each iteration increases
>
>    prv->length += PAGE_SIZE;
>    n_pages--;
>
> Where prv is the last sgl entry, so decreasing n_pages cannot also
> yield a 4k sgl entry like you are reporting.

No. You suppose n_pages is very big, then prv->length will increase to
reach max_segment.
If n_pages is very small, for example, we suppose n_pages is 1, then
prv->length will not reach max_segment.

In fact, in my test case, n_pages is very small.

Zhu Yanjun

>
> You get 4k sgl entries if max_segment is *too small* not too big!
>
> Jason
On Sun, Mar 21, 2021 at 10:38:45PM +0800, Zhu Yanjun wrote:

> No. You suppose n_pages is very big, then prv->length will increase to
> reach max_segment.
> If n_pages is very small, for example, we suppose n_pages is 1, then
> prv->length will not reach max_segment.
>
> In fact, in my test case, n_pages is very small.

n_pages is an *input* it is the number of pages in the VA range, if it
is very small it is not a problem with this algorithm

Jason
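In other words, n_pages is fixed by the size of the registered VA range
before any segment merging happens, roughly what ib_umem_num_pages()
computes. A kernel-style sketch (the helper name is invented for
illustration, PAGE_SHIFT as usual, size assumed non-zero):

    /* Number of pages spanned by the user VA range [addr, addr + size). */
    static unsigned long va_range_pages(unsigned long addr, size_t size)
    {
            unsigned long first = addr >> PAGE_SHIFT;               /* first page frame */
            unsigned long last = (addr + size - 1) >> PAGE_SHIFT;   /* last page frame */

            return last - first + 1;
    }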
Got it. Thanks a lot for your help.

Zhu Yanjun

On Sun, Mar 21, 2021 at 11:52 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sun, Mar 21, 2021 at 10:38:45PM +0800, Zhu Yanjun wrote:
>
> > No. You suppose n_pages is very big, then prv->length will increase to
> > reach max_segment.
> > If n_pages is very small, for example, we suppose n_pages is 1, then
> > prv->length will not reach max_segment.
> >
> > In fact, in my test case, n_pages is very small.
>
> n_pages is an *input* it is the number of pages in the VA range, if it
> is very small it is not a problem with this algorithm
>
> Jason
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 2dde99a9ba07..af8733788b1e 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -62,6 +62,203 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	sg_free_table(&umem->sg_head);
 }

+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+						struct page **page_list,
+						unsigned long npages,
+						unsigned int max_seg_sz,
+						int *nents)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	bool update_cur_sg = false;
+	bool first = !sg_page(sg);
+
+	/* Check if new page_list is contiguous with end of previous page_list.
+	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+	 */
+	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+		       page_to_pfn(page_list[0])))
+		update_cur_sg = true;
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+			      first_pfn + len == page_to_pfn(page_list[i]) &&
+			      len < (max_seg_sz >> PAGE_SHIFT);
+		     len++)
+			i++;
+
+		/* Squash N contiguous pages from page_list into current sge */
+		if (update_cur_sg) {
+			if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
+				sg_set_page(sg, sg_page(sg),
+					    sg->length + (len << PAGE_SHIFT),
+					    0);
+				update_cur_sg = false;
+				continue;
+			}
+			update_cur_sg = false;
+		}
+
+		/* Squash N contiguous pages into next sge or first sge */
+		if (!first)
+			sg = sg_next(sg);
+
+		(*nents)++;
+		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+		first = false;
+	}
+
+	return sg;
+}
+
+/**
+ * ib_umem_hugepage_get - Pin and DMA map userspace memory.
+ *
+ * @device: IB device to connect UMEM
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ */
+struct ib_umem *ib_umem_hugepage_get(struct ib_device *device,
+				     unsigned long addr,
+				     size_t size, int access)
+{
+	struct ib_umem *umem;
+	struct page **page_list;
+	unsigned long lock_limit;
+	unsigned long new_pinned;
+	unsigned long cur_base;
+	unsigned long dma_attr = 0;
+	struct mm_struct *mm;
+	unsigned long npages;
+	int ret;
+	struct scatterlist *sg;
+	unsigned int gup_flags = FOLL_WRITE;
+
+	/*
+	 * If the combination of the addr and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((addr + size) < addr) ||
+	    PAGE_ALIGN(addr + size) < (addr + size))
+		return ERR_PTR(-EINVAL);
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	if (access & IB_ACCESS_ON_DEMAND)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+	umem->ibdev = device;
+	umem->length = size;
+	umem->address = addr;
+	umem->writable = ib_access_writable(access);
+	umem->owning_mm = mm = current->mm;
+	mmgrab(mm);
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		ret = -ENOMEM;
+		goto umem_kfree;
+	}
+
+	npages = ib_umem_num_pages(umem);
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
+	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
+		atomic64_sub(npages, &mm->pinned_vm);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cur_base = addr & PAGE_MASK;
+
+	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+	if (ret)
+		goto vma;
+
+	if (!umem->writable)
+		gup_flags |= FOLL_FORCE;
+
+	sg = umem->sg_head.sgl;
+
+	while (npages) {
+		cond_resched();
+		ret = pin_user_pages_fast(cur_base,
+					  min_t(unsigned long, npages,
+						PAGE_SIZE /
+						sizeof(struct page *)),
+					  gup_flags | FOLL_LONGTERM, page_list);
+		if (ret < 0)
+			goto umem_release;
+
+		cur_base += ret * PAGE_SIZE;
+		npages -= ret;
+
+		sg = ib_umem_add_sg_table(sg, page_list, ret,
+					  dma_get_max_seg_size(device->dma_device),
+					  &umem->sg_nents);
+	}
+
+	sg_mark_end(sg);
+
+	if (access & IB_ACCESS_RELAXED_ORDERING)
+		dma_attr |= DMA_ATTR_WEAK_ORDERING;
+
+	umem->nmap =
+		ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
+				    DMA_BIDIRECTIONAL, dma_attr);
+
+	if (!umem->nmap) {
+		ret = -ENOMEM;
+		goto umem_release;
+	}
+
+	ret = 0;
+	goto out;
+
+umem_release:
+	__ib_umem_release(device, umem, 0);
+vma:
+	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
+out:
+	free_page((unsigned long) page_list);
+umem_kfree:
+	if (ret) {
+		mmdrop(umem->owning_mm);
+		kfree(umem);
+	}
+	return ret ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_hugepage_get);
+
 /**
  * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
  *
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 676c57f5ca80..fc6350ff21e6 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -96,6 +96,9 @@ static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter,
 		__rdma_block_iter_next(biter);)

 #ifdef CONFIG_INFINIBAND_USER_MEM
+struct ib_umem *ib_umem_hugepage_get(struct ib_device *device,
+				     unsigned long addr,
+				     size_t size, int access);
 struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
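For comparison, the pattern the core code already provides for getting
huge-page-sized DMA addresses out of a umem built by ib_umem_get() is
ib_umem_find_best_pgsz() plus the DMA block iterator. A hedged sketch,
assuming a driver context where pd, start, length, access and virt are
in scope, with error handling trimmed:

    struct ib_umem *umem;
    struct ib_block_iter biter;
    unsigned long pg_sz;

    umem = ib_umem_get(pd->device, start, length, access);
    if (IS_ERR(umem))
            return PTR_ERR(umem);

    /* Largest page size the HW supports that also fits this MR. */
    pg_sz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G, virt);
    if (!pg_sz) {
            ib_umem_release(umem);
            return -EINVAL;
    }

    /* Each iteration yields the DMA address of one pg_sz-sized block. */
    rdma_umem_for_each_dma_block(umem, &biter, pg_sz) {
            u64 dma = rdma_block_iter_dma_address(&biter);

            /* program "dma" into the device's page table here */
    }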