Message ID | 20190211152508.25040-2-shiraz.saleem@intel.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | Adapt drivers to handle page combining on umem SGEs | expand |
On 11-Feb-19 17:24, Shiraz Saleem wrote: > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > index c8502c2..c8478c1 100644 > --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > struct scatterlist *sghead, u32 pages, u32 pg_size) > { > - struct scatterlist *sg; > + struct sg_dma_page_iter sg_iter; > bool is_umem = false; > int i; > > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > } else { > i = 0; > is_umem = true; > - for_each_sg(sghead, sg, pages, i) { > - pbl->pg_map_arr[i] = sg_dma_address(sg); > - pbl->pg_arr[i] = sg_virt(sg); > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > + pbl->pg_arr[i] = NULL;> if (!pbl->pg_arr[i]) Surely something went wrong here? > goto fail; > > + i++; > pbl->pg_count++; > } > } >
On Wed, Feb 13, 2019 at 01:12:37PM +0200, Gal Pressman wrote: > On 11-Feb-19 17:24, Shiraz Saleem wrote: > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > index c8502c2..c8478c1 100644 > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > struct scatterlist *sghead, u32 pages, u32 pg_size) > > { > > - struct scatterlist *sg; > > + struct sg_dma_page_iter sg_iter; > > bool is_umem = false; > > int i; > > > > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > } else { > > i = 0; > > is_umem = true; > > - for_each_sg(sghead, sg, pages, i) { > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > - pbl->pg_arr[i] = sg_virt(sg); > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > + pbl->pg_arr[i] = NULL; > if (!pbl->pg_arr[i]) > > Surely something went wrong here? That if does look wrong. I recall Shiraz removed the pg_arr as it wasn't being used, so maybe the if should go to? Devesh? Someone send a patch... Jason
On Wed, Feb 13, 2019 at 4:42 PM Gal Pressman <galpress@amazon.com> wrote: > > On 11-Feb-19 17:24, Shiraz Saleem wrote: > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > index c8502c2..c8478c1 100644 > > --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > struct scatterlist *sghead, u32 pages, u32 pg_size) > > { > > - struct scatterlist *sg; > > + struct sg_dma_page_iter sg_iter; > > bool is_umem = false; > > int i; > > > > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > } else { > > i = 0; > > is_umem = true; > > - for_each_sg(sghead, sg, pages, i) { > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > - pbl->pg_arr[i] = sg_virt(sg); > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > + pbl->pg_arr[i] = NULL;> if (!pbl->pg_arr[i]) > > Surely something went wrong here? Thanks for the review. Yes, its breaks the functionality. pg_arr needs to hold the virtual address, which would be used by the caller of __alloc_pbl. Also, the current code always goes to the fail statement because of the !pbl->pg_arr[i] check . I feel the following change is required. pbl->pg_arr[i] = page_address(sg_page_iter_page(&sg_iter.base)); > > > goto fail; > > > > + i++; > > pbl->pg_count++; > > } > > } > > >
On Wed, Feb 13, 2019 at 9:28 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Wed, Feb 13, 2019 at 01:12:37PM +0200, Gal Pressman wrote: > > On 11-Feb-19 17:24, Shiraz Saleem wrote: > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > > index c8502c2..c8478c1 100644 > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > struct scatterlist *sghead, u32 pages, u32 pg_size) > > > { > > > - struct scatterlist *sg; > > > + struct sg_dma_page_iter sg_iter; > > > bool is_umem = false; > > > int i; > > > > > > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > } else { > > > i = 0; > > > is_umem = true; > > > - for_each_sg(sghead, sg, pages, i) { > > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > > - pbl->pg_arr[i] = sg_virt(sg); > > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > > + pbl->pg_arr[i] = NULL; > > if (!pbl->pg_arr[i]) > > > > Surely something went wrong here? > > That if does look wrong. > > I recall Shiraz removed the pg_arr as it wasn't being used, so maybe > the if should go to? > > Devesh? > > Someone send a patch... We will sent the patch. > > Jason
On Wed, Feb 13, 2019 at 09:31:39PM +0530, Selvin Xavier wrote: > On Wed, Feb 13, 2019 at 4:42 PM Gal Pressman <galpress@amazon.com> wrote: > > > > On 11-Feb-19 17:24, Shiraz Saleem wrote: > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > > index c8502c2..c8478c1 100644 > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > struct scatterlist *sghead, u32 pages, u32 pg_size) > > > { > > > - struct scatterlist *sg; > > > + struct sg_dma_page_iter sg_iter; > > > bool is_umem = false; > > > int i; > > > > > > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > } else { > > > i = 0; > > > is_umem = true; > > > - for_each_sg(sghead, sg, pages, i) { > > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > > - pbl->pg_arr[i] = sg_virt(sg); > > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > > + pbl->pg_arr[i] = NULL;> if (!pbl->pg_arr[i]) > > > > Surely something went wrong here? > Thanks for the review. Yes, its breaks the functionality. pg_arr needs > to hold the virtual address, which would be used by the caller of > __alloc_pbl. Also, the current code always goes to the fail statement > because of the !pbl->pg_arr[i] check . > > I feel the following change is required. > pbl->pg_arr[i] = page_address(sg_page_iter_page(&sg_iter.base)); Drivers cannot iterate over DMA and struct pages like this, it is not allowed by the DMA API. Jason
>Subject: Re: [rdma-next 01/12] RDMA/bnxt_re: Use for_each_sg_dma_page >iterator on umem SGL > >On Wed, Feb 13, 2019 at 01:12:37PM +0200, Gal Pressman wrote: >> On 11-Feb-19 17:24, Shiraz Saleem wrote: >> > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c >> > b/drivers/infiniband/hw/bnxt_re/qplib_res.c >> > index c8502c2..c8478c1 100644 >> > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c >> > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, >> > struct bnxt_qplib_pbl *pbl, static int __alloc_pbl(struct pci_dev *pdev, struct >bnxt_qplib_pbl *pbl, >> > struct scatterlist *sghead, u32 pages, u32 pg_size) { >> > - struct scatterlist *sg; >> > + struct sg_dma_page_iter sg_iter; >> > bool is_umem = false; >> > int i; >> > >> > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct >bnxt_qplib_pbl *pbl, >> > } else { >> > i = 0; >> > is_umem = true; >> > - for_each_sg(sghead, sg, pages, i) { >> > - pbl->pg_map_arr[i] = sg_dma_address(sg); >> > - pbl->pg_arr[i] = sg_virt(sg); >> > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { >> > + pbl->pg_map_arr[i] = >sg_page_iter_dma_address(&sg_iter); >> > + pbl->pg_arr[i] = NULL; >> if (!pbl->pg_arr[i]) >> >> Surely something went wrong here? > >That if does look wrong. > >I recall Shiraz removed the pg_arr as it wasn't being used, so maybe the if should >go to? Thanks Gal for catching this. Yes the reason pg_arr is set to NULL here is because it appears used only in !umem case. But then the following if check is a bug. Duh! bnxt_re folks - is this acceptable? --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, struct scatterlist *sghead, u32 pages, u32 pg_size) { - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; bool is_umem = false; int i; @@ -116,12 +116,9 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, } else { i = 0; is_umem = true; - for_each_sg(sghead, sg, pages, i) { - pbl->pg_map_arr[i] = sg_dma_address(sg); - pbl->pg_arr[i] = sg_virt(sg); - if (!pbl->pg_arr[i]) - goto fail; - + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); + i++; pbl->pg_count++; } } Shiraz
On Wed, Feb 13, 2019 at 10:36 PM Saleem, Shiraz <shiraz.saleem@intel.com> wrote: > > >Subject: Re: [rdma-next 01/12] RDMA/bnxt_re: Use for_each_sg_dma_page > >iterator on umem SGL > > > >On Wed, Feb 13, 2019 at 01:12:37PM +0200, Gal Pressman wrote: > >> On 11-Feb-19 17:24, Shiraz Saleem wrote: > >> > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c > >> > b/drivers/infiniband/hw/bnxt_re/qplib_res.c > >> > index c8502c2..c8478c1 100644 > >> > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > >> > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, > >> > struct bnxt_qplib_pbl *pbl, static int __alloc_pbl(struct pci_dev *pdev, struct > >bnxt_qplib_pbl *pbl, > >> > struct scatterlist *sghead, u32 pages, u32 pg_size) { > >> > - struct scatterlist *sg; > >> > + struct sg_dma_page_iter sg_iter; > >> > bool is_umem = false; > >> > int i; > >> > > >> > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct > >bnxt_qplib_pbl *pbl, > >> > } else { > >> > i = 0; > >> > is_umem = true; > >> > - for_each_sg(sghead, sg, pages, i) { > >> > - pbl->pg_map_arr[i] = sg_dma_address(sg); > >> > - pbl->pg_arr[i] = sg_virt(sg); > >> > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > >> > + pbl->pg_map_arr[i] = > >sg_page_iter_dma_address(&sg_iter); > >> > + pbl->pg_arr[i] = NULL; > >> if (!pbl->pg_arr[i]) > >> > >> Surely something went wrong here? > > > >That if does look wrong. > > > >I recall Shiraz removed the pg_arr as it wasn't being used, so maybe the if should > >go to? > > Thanks Gal for catching this. Yes the reason pg_arr is set to NULL here is because it > appears used only in !umem case. But then the following if check is a bug. Duh! > > bnxt_re folks - is this acceptable? > > --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > struct scatterlist *sghead, u32 pages, u32 pg_size) > { > - struct scatterlist *sg; > + struct sg_dma_page_iter sg_iter; > bool is_umem = false; > int i; > > @@ -116,12 +116,9 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > } else { > i = 0; > is_umem = true; > - for_each_sg(sghead, sg, pages, i) { > - pbl->pg_map_arr[i] = sg_dma_address(sg); > - pbl->pg_arr[i] = sg_virt(sg); > - if (!pbl->pg_arr[i]) > - goto fail; > - > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > + i++; > pbl->pg_count++; The thing is we need the virtual address for future reference in this path. If we do not have virt-addr populated it breaks! > } > } > > Shiraz >
On Wed, Feb 13, 2019 at 9:42 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Wed, Feb 13, 2019 at 09:31:39PM +0530, Selvin Xavier wrote: > > On Wed, Feb 13, 2019 at 4:42 PM Gal Pressman <galpress@amazon.com> wrote: > > > > > > On 11-Feb-19 17:24, Shiraz Saleem wrote: > > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > > > index c8502c2..c8478c1 100644 > > > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > > > > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > > struct scatterlist *sghead, u32 pages, u32 pg_size) > > > > { > > > > - struct scatterlist *sg; > > > > + struct sg_dma_page_iter sg_iter; > > > > bool is_umem = false; > > > > int i; > > > > > > > > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > > } else { > > > > i = 0; > > > > is_umem = true; > > > > - for_each_sg(sghead, sg, pages, i) { > > > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > > > - pbl->pg_arr[i] = sg_virt(sg); > > > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > > > + pbl->pg_arr[i] = NULL;> if (!pbl->pg_arr[i]) > > > > > > Surely something went wrong here? > > Thanks for the review. Yes, its breaks the functionality. pg_arr needs > > to hold the virtual address, which would be used by the caller of > > __alloc_pbl. Also, the current code always goes to the fail statement > > because of the !pbl->pg_arr[i] check . > > > > I feel the following change is required. > > pbl->pg_arr[i] = page_address(sg_page_iter_page(&sg_iter.base)); > > Drivers cannot iterate over DMA and struct pages like this, it is not > allowed by the DMA API. And what Shiraz proposing is the way to go? > > Jason
On Mon, Feb 11, 2019 at 8:56 PM Shiraz Saleem <shiraz.saleem@intel.com> wrote: > > From: "Shiraz, Saleem" <shiraz.saleem@intel.com> > > Use the for_each_sg_dma_page iterator variant to walk the umem > DMA-mapped SGL and get the page DMA address. This avoids the extra > loop to iterate pages in the SGE when for_each_sg iterator is used. > > Additionally, purge umem->page_shift usage in the driver > as its only relevant for ODP MRs. Use system page size and > shift instead. > > Cc: Selvin Xavier <selvin.xavier@broadcom.com> > Cc: Devesh Sharma <devesh.sharma@broadcom.com> > Signed-off-by: Shiraz, Saleem <shiraz.saleem@intel.com> > --- > drivers/infiniband/hw/bnxt_re/ib_verbs.c | 23 +++++++++-------------- > drivers/infiniband/hw/bnxt_re/qplib_res.c | 9 +++++---- > 2 files changed, 14 insertions(+), 18 deletions(-) > > diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > index 1d7469e..ed7cb0b 100644 > --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > @@ -3566,19 +3566,14 @@ static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig, > u64 *pbl_tbl = pbl_tbl_orig; > u64 paddr; > u64 page_mask = (1ULL << page_shift) - 1; > - int i, pages; > - struct scatterlist *sg; > - int entry; > - > - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { > - pages = sg_dma_len(sg) >> PAGE_SHIFT; > - for (i = 0; i < pages; i++) { > - paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); > - if (pbl_tbl == pbl_tbl_orig) > - *pbl_tbl++ = paddr & ~page_mask; > - else if ((paddr & page_mask) == 0) > - *pbl_tbl++ = paddr; > - } > + struct sg_dma_page_iter sg_iter; > + > + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { > + paddr = sg_page_iter_dma_address(&sg_iter); > + if (pbl_tbl == pbl_tbl_orig) > + *pbl_tbl++ = paddr & ~page_mask; > + else if ((paddr & page_mask) == 0) > + *pbl_tbl++ = paddr; > } > return pbl_tbl - pbl_tbl_orig; > } > @@ -3641,7 +3636,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, > goto free_umem; > } > > - page_shift = umem->page_shift; > + page_shift = PAGE_SHIFT; I fear this will break Hugepage support also! > > if (!bnxt_re_page_size_ok(page_shift)) { > dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > index c8502c2..c8478c1 100644 > --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > struct scatterlist *sghead, u32 pages, u32 pg_size) > { > - struct scatterlist *sg; > + struct sg_dma_page_iter sg_iter; > bool is_umem = false; > int i; > > @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > } else { > i = 0; > is_umem = true; > - for_each_sg(sghead, sg, pages, i) { > - pbl->pg_map_arr[i] = sg_dma_address(sg); > - pbl->pg_arr[i] = sg_virt(sg); > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > + pbl->pg_arr[i] = NULL; > if (!pbl->pg_arr[i]) > goto fail; > > + i++; > pbl->pg_count++; > } > } > -- > 1.8.3.1 >
>Subject: Re: [rdma-next 01/12] RDMA/bnxt_re: Use for_each_sg_dma_page >iterator on umem SGL > >On Mon, Feb 11, 2019 at 8:56 PM Shiraz Saleem <shiraz.saleem@intel.com> >wrote: >> >> From: "Shiraz, Saleem" <shiraz.saleem@intel.com> >> >> Use the for_each_sg_dma_page iterator variant to walk the umem >> DMA-mapped SGL and get the page DMA address. This avoids the extra >> loop to iterate pages in the SGE when for_each_sg iterator is used. >> >> Additionally, purge umem->page_shift usage in the driver as its only >> relevant for ODP MRs. Use system page size and shift instead. >> >> Cc: Selvin Xavier <selvin.xavier@broadcom.com> >> Cc: Devesh Sharma <devesh.sharma@broadcom.com> >> Signed-off-by: Shiraz, Saleem <shiraz.saleem@intel.com> >> --- >> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 23 >> +++++++++-------------- drivers/infiniband/hw/bnxt_re/qplib_res.c | >> 9 +++++---- >> 2 files changed, 14 insertions(+), 18 deletions(-) >> >> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c >> b/drivers/infiniband/hw/bnxt_re/ib_verbs.c >> index 1d7469e..ed7cb0b 100644 >> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c >> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c >> @@ -3566,19 +3566,14 @@ static int fill_umem_pbl_tbl(struct ib_umem >*umem, u64 *pbl_tbl_orig, >> u64 *pbl_tbl = pbl_tbl_orig; >> u64 paddr; >> u64 page_mask = (1ULL << page_shift) - 1; >> - int i, pages; >> - struct scatterlist *sg; >> - int entry; >> - >> - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { >> - pages = sg_dma_len(sg) >> PAGE_SHIFT; >> - for (i = 0; i < pages; i++) { >> - paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); >> - if (pbl_tbl == pbl_tbl_orig) >> - *pbl_tbl++ = paddr & ~page_mask; >> - else if ((paddr & page_mask) == 0) >> - *pbl_tbl++ = paddr; >> - } >> + struct sg_dma_page_iter sg_iter; >> + >> + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) >{ >> + paddr = sg_page_iter_dma_address(&sg_iter); >> + if (pbl_tbl == pbl_tbl_orig) >> + *pbl_tbl++ = paddr & ~page_mask; >> + else if ((paddr & page_mask) == 0) >> + *pbl_tbl++ = paddr; >> } >> return pbl_tbl - pbl_tbl_orig; } @@ -3641,7 +3636,7 @@ struct >> ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, >> goto free_umem; >> } >> >> - page_shift = umem->page_shift; >> + page_shift = PAGE_SHIFT; >I fear this will break Hugepage support also! >> How? umem->page_shift is always the system page shift for non-ODP MRs. This driver is keying off the umem->hugetlb flag for determining if MR is backed by huge pages right? FYI - I will be submitting a follow on series to remove that flag from core and instead bnxt_re and i40iw will use the core helpers to get the huge page aligned address. It would be good to test drive that series. Shiraz
On Wed, Feb 13, 2019 at 10:44:47PM +0530, Devesh Sharma wrote: > > @@ -116,12 +116,9 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > } else { > > i = 0; > > is_umem = true; > > - for_each_sg(sghead, sg, pages, i) { > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > - pbl->pg_arr[i] = sg_virt(sg); > > - if (!pbl->pg_arr[i]) > > - goto fail; > > - > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > + i++; > > pbl->pg_count++; > The thing is we need the virtual address for future reference in this > path. If we do not have virt-addr populated it breaks! Drivers should not use both DMA and struct page for umems. One other other. Where is the pg_arr read in the umem case, and what is it for? Jason
On Wed, Feb 13, 2019 at 10:57 PM Saleem, Shiraz <shiraz.saleem@intel.com> wrote: > > >Subject: Re: [rdma-next 01/12] RDMA/bnxt_re: Use for_each_sg_dma_page > >iterator on umem SGL > > > >On Mon, Feb 11, 2019 at 8:56 PM Shiraz Saleem <shiraz.saleem@intel.com> > >wrote: > >> > >> From: "Shiraz, Saleem" <shiraz.saleem@intel.com> > >> > >> Use the for_each_sg_dma_page iterator variant to walk the umem > >> DMA-mapped SGL and get the page DMA address. This avoids the extra > >> loop to iterate pages in the SGE when for_each_sg iterator is used. > >> > >> Additionally, purge umem->page_shift usage in the driver as its only > >> relevant for ODP MRs. Use system page size and shift instead. > >> > >> Cc: Selvin Xavier <selvin.xavier@broadcom.com> > >> Cc: Devesh Sharma <devesh.sharma@broadcom.com> > >> Signed-off-by: Shiraz, Saleem <shiraz.saleem@intel.com> > >> --- > >> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 23 > >> +++++++++-------------- drivers/infiniband/hw/bnxt_re/qplib_res.c | > >> 9 +++++---- > >> 2 files changed, 14 insertions(+), 18 deletions(-) > >> > >> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > >> b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > >> index 1d7469e..ed7cb0b 100644 > >> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > >> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > >> @@ -3566,19 +3566,14 @@ static int fill_umem_pbl_tbl(struct ib_umem > >*umem, u64 *pbl_tbl_orig, > >> u64 *pbl_tbl = pbl_tbl_orig; > >> u64 paddr; > >> u64 page_mask = (1ULL << page_shift) - 1; > >> - int i, pages; > >> - struct scatterlist *sg; > >> - int entry; > >> - > >> - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { > >> - pages = sg_dma_len(sg) >> PAGE_SHIFT; > >> - for (i = 0; i < pages; i++) { > >> - paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); > >> - if (pbl_tbl == pbl_tbl_orig) > >> - *pbl_tbl++ = paddr & ~page_mask; > >> - else if ((paddr & page_mask) == 0) > >> - *pbl_tbl++ = paddr; > >> - } > >> + struct sg_dma_page_iter sg_iter; > >> + > >> + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) > >{ > >> + paddr = sg_page_iter_dma_address(&sg_iter); > >> + if (pbl_tbl == pbl_tbl_orig) > >> + *pbl_tbl++ = paddr & ~page_mask; > >> + else if ((paddr & page_mask) == 0) > >> + *pbl_tbl++ = paddr; > >> } > >> return pbl_tbl - pbl_tbl_orig; } @@ -3641,7 +3636,7 @@ struct > >> ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, > >> goto free_umem; > >> } > >> > >> - page_shift = umem->page_shift; > >> + page_shift = PAGE_SHIFT; > >I fear this will break Hugepage support also! > >> > > How? umem->page_shift is always the system page shift for non-ODP MRs. > This driver is keying off the umem->hugetlb flag for determining if MR is backed > by huge pages right? Yeah, Looks like not breaking, because driver updates page_shift after this initialization. Still I would say a test run is required. > > FYI - I will be submitting a follow on series to remove that flag > from core and instead bnxt_re and i40iw will use the core helpers to get the > huge page aligned address. It would be good to test drive that series. Okay, We will do some basic UT for that series, just cc us in the patch series -Regards Devesh > > Shiraz >
On Wed, Feb 13, 2019 at 10:58 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Wed, Feb 13, 2019 at 10:44:47PM +0530, Devesh Sharma wrote: > > > @@ -116,12 +116,9 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > } else { > > > i = 0; > > > is_umem = true; > > > - for_each_sg(sghead, sg, pages, i) { > > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > > - pbl->pg_arr[i] = sg_virt(sg); > > > - if (!pbl->pg_arr[i]) > > > - goto fail; > > > - > > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > > + i++; > > > pbl->pg_count++; > > The thing is we need the virtual address for future reference in this > > path. If we do not have virt-addr populated it breaks! > > Drivers should not use both DMA and struct page for umems. One other > other. > > Where is the pg_arr read in the umem case, and what is it for? Right there at the caller of __alloc_pbl(). > > Jason
On Wed, Feb 13, 2019 at 11:58:24PM +0530, Devesh Sharma wrote: > On Wed, Feb 13, 2019 at 10:58 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > On Wed, Feb 13, 2019 at 10:44:47PM +0530, Devesh Sharma wrote: > > > > @@ -116,12 +116,9 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > > } else { > > > > i = 0; > > > > is_umem = true; > > > > - for_each_sg(sghead, sg, pages, i) { > > > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > > > - pbl->pg_arr[i] = sg_virt(sg); > > > > - if (!pbl->pg_arr[i]) > > > > - goto fail; > > > > - > > > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > > > + i++; > > > > pbl->pg_count++; > > > The thing is we need the virtual address for future reference in this > > > path. If we do not have virt-addr populated it breaks! > > > > Drivers should not use both DMA and struct page for umems. One other > > other. > > > > Where is the pg_arr read in the umem case, and what is it for? > Right there at the caller of __alloc_pbl(). What I see is a confusing mess where sometimes the pg_arr points to dma_alloc_coherent value that needs to be freed, and sometimes points to a struct page pointing into user memory. All the cases that touch pg_arr seem to want to touch the dma_alloc_coherent memory, not the user pages, so setting it to NULL for the usage pages seems like the right thing to do? Jason
On Thu, Feb 14, 2019 at 12:23 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Wed, Feb 13, 2019 at 11:58:24PM +0530, Devesh Sharma wrote: > > On Wed, Feb 13, 2019 at 10:58 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > > On Wed, Feb 13, 2019 at 10:44:47PM +0530, Devesh Sharma wrote: > > > > > @@ -116,12 +116,9 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > > > > > } else { > > > > > i = 0; > > > > > is_umem = true; > > > > > - for_each_sg(sghead, sg, pages, i) { > > > > > - pbl->pg_map_arr[i] = sg_dma_address(sg); > > > > > - pbl->pg_arr[i] = sg_virt(sg); > > > > > - if (!pbl->pg_arr[i]) > > > > > - goto fail; > > > > > - > > > > > + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { > > > > > + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > > > > > + i++; > > > > > pbl->pg_count++; > > > > The thing is we need the virtual address for future reference in this > > > > path. If we do not have virt-addr populated it breaks! > > > > > > Drivers should not use both DMA and struct page for umems. One other > > > other. > > > > > > Where is the pg_arr read in the umem case, and what is it for? > > Right there at the caller of __alloc_pbl(). > > What I see is a confusing mess where sometimes the pg_arr points to > dma_alloc_coherent value that needs to be freed, and sometimes points > to a struct page pointing into user memory. rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_2], sghead, pages, pg_size); if (rc) goto fail; /* Fill in lvl1 PBL */ dst_virt_ptr = (dma_addr_t **)hwq->pbl[PBL_LVL_1].pg_arr; src_phys_ptr = hwq->pbl[PBL_LVL_2].pg_map_arr; for (i = 0; i < hwq->pbl[PBL_LVL_2].pg_count; i++) { dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = src_phys_ptr[i] | PTU_PTE_VALID; } Here the driver code is trying to fill the PTEs to the page tables data-structure needed by Broadcom devices. I agree that that code deserves careful cleanup. We can supply patch series for that. There are couple of other places too where the driver tries to access pg_arr even though the object belong to user space. One such example is __clean_cq() during destroy QP. > > All the cases that touch pg_arr seem to want to touch the > dma_alloc_coherent memory, not the user pages, so setting it to NULL > for the usage pages seems like the right thing to do? I agree that in user-space object case setting pg_arr to NULL should be okay. However the driver is broken in current condition. It immediately requires additional patch to restrict unconditional access of pg_arr data structure when the object belong to user-space. The fix would be to put check if any object belongs to user space then skip calling the functions which require pg_arr. i.e. __clean_cq() etc. > > Jason
On Thu, Feb 14, 2019 at 09:16:50PM +0530, Devesh Sharma wrote: > > All the cases that touch pg_arr seem to want to touch the > > dma_alloc_coherent memory, not the user pages, so setting it to NULL > > for the usage pages seems like the right thing to do? > > I agree that in user-space object case setting pg_arr to NULL should be okay. > However the driver is broken in current condition. It immediately > requires additional patch to restrict unconditional access > of pg_arr data structure when the object belong to user-space. The fix > would be to put check if any object belongs to user space then > skip calling the functions which require pg_arr. i.e. __clean_cq() etc. This is completly wrong, you can't just call sg_virt on user memory then use the pointer and hope for the best. That breaks so many valid flows in the kernel. Look you guys need to review patches on your driver, Shiraz had posted this stuff multiple times over months. Send a patch to fix this mess and don't mis-use the SG & DMA API. This should probably fix things in the short term. Please test it: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index d08b9d9948fd3c..42838c4dab4c92 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -118,9 +118,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, is_umem = true; for_each_sg_dma_page (sghead, &sg_iter, pages, 0) { pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); - pbl->pg_arr[i] = NULL; - if (!pbl->pg_arr[i]) - goto fail; + + /* + * FIXME: The driver should not be trying to get + * kernel access to user pages in this way. + */ + pbl->pg_arr[i] = + page_address(sg_page_iter_page(&sg_iter.base)); i++; pbl->pg_count++;
On Thu, Feb 14, 2019 at 9:49 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Thu, Feb 14, 2019 at 09:16:50PM +0530, Devesh Sharma wrote: > > > > All the cases that touch pg_arr seem to want to touch the > > > dma_alloc_coherent memory, not the user pages, so setting it to NULL > > > for the usage pages seems like the right thing to do? > > > > I agree that in user-space object case setting pg_arr to NULL should be okay. > > However the driver is broken in current condition. It immediately > > requires additional patch to restrict unconditional access > > of pg_arr data structure when the object belong to user-space. The fix > > would be to put check if any object belongs to user space then > > skip calling the functions which require pg_arr. i.e. __clean_cq() etc. > > This is completly wrong, you can't just call sg_virt on user memory > then use the pointer and hope for the best. That breaks so many valid > flows in the kernel. > > Look you guys need to review patches on your driver, Shiraz had posted > this stuff multiple times over months. > > Send a patch to fix this mess and don't mis-use the SG & DMA API. Yes, that will be done, we will supply an immediate fix so that the broken driver comes to working condition without crashing and then put in clean-up stuff. > > This should probably fix things in the short term. Please test it: > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c > index d08b9d9948fd3c..42838c4dab4c92 100644 > --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c > +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c > @@ -118,9 +118,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, > is_umem = true; > for_each_sg_dma_page (sghead, &sg_iter, pages, 0) { > pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); > - pbl->pg_arr[i] = NULL; > - if (!pbl->pg_arr[i]) > - goto fail; > + > + /* > + * FIXME: The driver should not be trying to get > + * kernel access to user pages in this way. > + */ > + pbl->pg_arr[i] = > + page_address(sg_page_iter_page(&sg_iter.base)); > > i++; > pbl->pg_count++;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 1d7469e..ed7cb0b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3566,19 +3566,14 @@ static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig, u64 *pbl_tbl = pbl_tbl_orig; u64 paddr; u64 page_mask = (1ULL << page_shift) - 1; - int i, pages; - struct scatterlist *sg; - int entry; - - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - pages = sg_dma_len(sg) >> PAGE_SHIFT; - for (i = 0; i < pages; i++) { - paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); - if (pbl_tbl == pbl_tbl_orig) - *pbl_tbl++ = paddr & ~page_mask; - else if ((paddr & page_mask) == 0) - *pbl_tbl++ = paddr; - } + struct sg_dma_page_iter sg_iter; + + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + paddr = sg_page_iter_dma_address(&sg_iter); + if (pbl_tbl == pbl_tbl_orig) + *pbl_tbl++ = paddr & ~page_mask; + else if ((paddr & page_mask) == 0) + *pbl_tbl++ = paddr; } return pbl_tbl - pbl_tbl_orig; } @@ -3641,7 +3636,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, goto free_umem; } - page_shift = umem->page_shift; + page_shift = PAGE_SHIFT; if (!bnxt_re_page_size_ok(page_shift)) { dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index c8502c2..c8478c1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, struct scatterlist *sghead, u32 pages, u32 pg_size) { - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; bool is_umem = false; int i; @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, } else { i = 0; is_umem = true; - for_each_sg(sghead, sg, pages, i) { - pbl->pg_map_arr[i] = sg_dma_address(sg); - pbl->pg_arr[i] = sg_virt(sg); + for_each_sg_dma_page(sghead, &sg_iter, pages, 0) { + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); + pbl->pg_arr[i] = NULL; if (!pbl->pg_arr[i]) goto fail; + i++; pbl->pg_count++; } }