Message ID | 1707318566-3141-1-git-send-email-kotaranov@linux.microsoft.com (mailing list archive) |
State | Superseded |
Series | [rdma-next,v1,1/1] RDMA/mana_ib: Fix bug in creation of dma regions |
On Wed, Feb 07, 2024 at 07:09:26AM -0800, Konstantin Taranov wrote:
> From: Konstantin Taranov <kotaranov@microsoft.com>
>
> Dma registration was ignoring virtual addresses by setting it to 0.
> As a result, mana_ib could only register page-aligned memory.
> As well as, it could fail to produce dma regions with zero offset
> for WQs and CQs (e.g., page size is 8192 but address is only 4096
> bytes aligned), which is required by hardware.
>
> This patch takes into account the virtual address, allowing to create
> a dma region with any offset. For queues (e.g., WQs, CQs) that require
> dma regions with zero offset we add a flag to ensure zero offset.
>
> Signed-off-by: Konstantin Taranov <kotaranov@microsoft.com>
> ---
>  drivers/infiniband/hw/mana/cq.c      |  3 ++-
>  drivers/infiniband/hw/mana/main.c    | 16 +++++++++++++---
>  drivers/infiniband/hw/mana/mana_ib.h |  2 +-
>  drivers/infiniband/hw/mana/mr.c      |  2 +-
>  drivers/infiniband/hw/mana/qp.c      |  4 ++--
>  drivers/infiniband/hw/mana/wq.c      |  3 ++-
>  6 files changed, 21 insertions(+), 9 deletions(-)

You are definitely advised to look at the
Documentation/process/submitting-patches.rst guide.

1. The first revision doesn't need to be v1.
2. One logical fix/change == one patch.
3. Fixes should have a Fixes: tag in the commit message.

And I'm confident that the force_zero_offset change is not correct.

Thanks
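For illustration, a minimal userspace sketch of the arithmetic behind the bug
described in the commit message (the address and sizes are hypothetical; this
is not driver code): when the virtual address is ignored, the chosen page
size can exceed the buffer's alignment, and the computed in-page offset comes
out non-zero, which the hardware rejects for WQs and CQs.

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical user buffer: only 4 KiB aligned. */
		unsigned long address = 0x7f0000001000UL;
		/* With virt forced to 0, the sizing helper may pick 8 KiB
		 * pages even though the buffer is not 8 KiB aligned. */
		unsigned long page_sz = 8192;

		/* Same arithmetic as the old offset_in_page computation. */
		unsigned long offset = address & (page_sz - 1);

		printf("offset_in_page = %lu\n", offset); /* 4096, not 0 */
		return 0;
	}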
> From: Leon Romanovsky <leon@kernel.org>
>
> You are definitely advised to look at the
> Documentation/process/submitting-patches.rst guide.
>
> 1. The first revision doesn't need to be v1.

Thanks. I did not know that.

> 2. One logical fix/change == one patch.

It is one fix. If I only replace 0 with virt, the code will stop working, as
the offset will quite often not be zero. That is why I need to make the
offset 0 for queues.

> 3. Fixes should have a Fixes: tag in the commit message.

As existing applications were made to work around this limitation, I wanted
this patch to arrive in rdma-next. Or do you say that I cannot opt for
rdma-next and must make it a "fix"?

> And I'm confident that the force_zero_offset change is not correct.

It was tested with many page sizes and offsets. Could you elaborate on why
it is not correct?

Thanks!
On Thu, Feb 08, 2024 at 08:49:43AM +0000, Konstantin Taranov wrote:
> > 3. Fixes should have a Fixes: tag in the commit message.
>
> As existing applications were made to work around this limitation, I
> wanted this patch to arrive in rdma-next. Or do you say that I cannot opt
> for rdma-next and must make it a "fix"?

Once you write the word "fix" in the patch, the expectation is to have a
Fixes line. There is nothing wrong with applying a patch with such a tag to
rdma-next, and we are doing it all the time.

Our policy is fluid here and can be summarized as follows:

1. Try to satisfy the submitter's request to put a patch in a specific
   target (rdma-rc/rdma-next).
2. Very lax with taking patches to rdma-rc before -rc4.
3. In general, strict after -rc4: only patches for panics, build breakage
   and UAPI-visible bugs.
4. More pedantic review of -rc material.

So if you write rdma-next in the title and add a Fixes line which points to
the "old" code, we will apply your patch to rdma-next.

> > And I'm confident that the force_zero_offset change is not correct.
>
> It was tested with many page sizes and offsets. Could you elaborate on
> why it is not correct?

I prefer that Jason elaborate more on this; he will do it better than me.
> 	/* Hardware requires dma region to align to chosen page size */
> -	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
> +	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt);
> 	if (!page_sz) {
> 		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
> 		return -ENOMEM;
> 	}

How about doing:

	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM,
					 force_zero_offset ? 0 : virt);

Will this work? This can get rid of the following while loop.

> +
> +	if (force_zero_offset) {
> +		while (ib_umem_dma_offset(umem, page_sz) && page_sz > PAGE_SIZE)
> +			page_sz /= 2;
> +		if (ib_umem_dma_offset(umem, page_sz) != 0) {
> +			ibdev_dbg(&dev->ib_dev, "failed to find page size to force zero offset.\n");
> +			return -ENOMEM;
> +		}
> +	}
> +
> From: Long Li <longli@microsoft.com>
> Sent: Thursday, 8 February 2024 19:43
> Subject: RE: [PATCH rdma-next v1 1/1] RDMA/mana_ib: Fix bug in creation
> of dma regions
>
> How about doing:
>
> 	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM,
> 					 force_zero_offset ? 0 : virt);
>
> Will this work? This can get rid of the following while loop.

I do not think so. I mentioned once that the existing code was failing for
me with 4K-aligned addresses and 8K pages. In that case we miscalculate the
number of pages: we think that it is one 8K page, but it is in fact two.
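To illustrate the miscount with made-up numbers (a standalone sketch; the
macros mirror the kernel's ALIGN helpers, and the addresses are
hypothetical): an 8 KiB buffer that is only 4 KiB aligned straddles an 8 KiB
boundary, so it really occupies two 8 KiB DMA blocks, while sizing it as if
it started on a page boundary suggests just one.

	#include <stdio.h>

	#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
	#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))

	/* Same arithmetic as ib_umem_num_dma_blocks() (sketch only). */
	static unsigned long num_blocks(unsigned long addr, unsigned long len,
					unsigned long pgsz)
	{
		return (ALIGN_UP(addr + len, pgsz) - ALIGN_DOWN(addr, pgsz)) / pgsz;
	}

	int main(void)
	{
		unsigned long len = 8192;

		/* Pretending the buffer starts on a page boundary: 1 block. */
		printf("%lu\n", num_blocks(0, len, 8192));
		/* Actual placement, only 4 KiB aligned: 2 blocks. */
		printf("%lu\n", num_blocks(4096, len, 8192));
		return 0;
	}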
On Thu, Feb 08, 2024 at 06:53:05PM +0000, Konstantin Taranov wrote:
> > How about doing:
> >
> > 	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM,
> > 					 force_zero_offset ? 0 : virt);
> >
> > Will this work? This can get rid of the following while loop.
>
> I do not think so. I mentioned once that the existing code was failing
> for me with 4K-aligned addresses and 8K pages. In that case we
> miscalculate the number of pages: we think that it is one 8K page, but it
> is in fact two.

That is a confusing statement.. What is "we" here?

ib_umem_dma_offset() is not always guaranteed to be zero with a 0 iova.
With higher-order pages the offset can be within the page; it generates

	offset = IOVA % pgsz

There are a couple of places that do want the offset to be fixed to zero
and have the loop; at this point it would be good to consolidate them into
some common ib_umem_find_best_pgsz_zero_offset() or something.

> > +	if (force_zero_offset) {
> > +		while (ib_umem_dma_offset(umem, page_sz) && page_sz > PAGE_SIZE)
> > +			page_sz /= 2;
> > +		if (ib_umem_dma_offset(umem, page_sz) != 0) {
> > +			ibdev_dbg(&dev->ib_dev, "failed to find page size to force zero offset.\n");
> > +			return -ENOMEM;
> > +		}
> > +	}
> > +

Yes, this doesn't look quite right..

It should flow from the HW capability; the helper you call should be
tightly linked to what the HW can do.

ib_umem_find_best_pgsz() is used for MRs that have the usual

	offset = IOVA % pgsz

We've always created other helpers for other restrictions.

So you should move your "force_zero_offset" into another helper and
describe exactly how the HW works to support the calculation.

It is odd to have the offset loop and be using ib_umem_find_best_pgsz()
with some iova; usually you'd use ib_umem_find_best_pgoff() in those cases;
see the other callers.

Jason
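For reference, a hedged sketch of the consolidated helper Jason is
suggesting; the name ib_umem_find_best_pgsz_zero_offset() is his placeholder
and the body is an assumption, not merged code. It leans on the documented
semantics of ib_umem_find_best_pgoff(): a pgoff bitmask of 0 tells the core
that the hardware cannot absorb any offset bits, so any page size it returns
starts the umem exactly on a page boundary.

	/* Sketch only: find the largest page size from pgsz_bitmap at
	 * which the umem starts exactly on a page boundary (zero offset).
	 * Returns 0 on failure, as ib_umem_find_best_pgsz() does. */
	static inline unsigned long
	ib_umem_find_best_pgsz_zero_offset(struct ib_umem *umem,
					   unsigned long pgsz_bitmap)
	{
		return ib_umem_find_best_pgoff(umem, pgsz_bitmap, 0);
	}

Drivers that currently size with an iova and then halve in a loop could, in
principle, call one helper like this instead.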
> From: Jason Gunthorpe <jgg@ziepe.ca>
>
> That is a confusing statement.. What is "we" here?

Sorry, by "we" I meant the helper: it thinks that it is one 8K page for an
8K buffer, but the offset will not be zero when the buffer is only 4K
aligned. So the actual number of pages is 2, but the helper thinks that it
is one because of the wrong virt.

> ib_umem_dma_offset() is not always guaranteed to be zero with a 0 iova.
> With higher-order pages the offset can be within the page; it generates
>
> 	offset = IOVA % pgsz
>
> There are a couple of places that do want the offset to be fixed to zero
> and have the loop; at this point it would be good to consolidate them
> into some common ib_umem_find_best_pgsz_zero_offset() or something.
>
> Yes, this doesn't look quite right..
>
> It should flow from the HW capability; the helper you call should be
> tightly linked to what the HW can do.
>
> So you should move your "force_zero_offset" into another helper and
> describe exactly how the HW works to support the calculation.
>
> It is odd to have the offset loop and be using ib_umem_find_best_pgsz()
> with some iova; usually you'd use ib_umem_find_best_pgoff() in those
> cases; see the other callers.

Hi Jason,
Thanks for the comments.

To be honest, I do not understand how I could employ
ib_umem_find_best_pgoff() for my purpose. I also do not see any mistake in
the patch, and I think you do not either. I can explain the logic behind
the code; could you say what I need to change to get this patch accepted?

Our HW cannot create a work queue or a completion queue from a dma region
that has a non-zero offset. As a result, applications must allocate at
least 4K-aligned memory for such queues. As a queue can be long, the helper
functions may suggest a large page size (let's call it X) but compromise
the zero offset. When we see that the offset is not zero, we halve X until
the first page size that gives a zero offset; this page size will be at
least 4K. If we cannot find such a page size, it means that the user did
not provide a 4K-aligned buffer. A trace of this halving is sketched below.

I can make a special helper, but I do not think that it will be useful to
anyone. Plus, there is no better approach than halving the page size, so
the helper will end up with that loop under the hood. As I see, mlnx also
uses a loop that halves page_sz, though for a different purpose, so I do
not see why our code cannot do the same without a special helper.
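A standalone trace of the halving described above (the address is
hypothetical, and PAGE_SIZE is spelled out as a constant because this sketch
runs in userspace): for a buffer that is 8 KiB aligned but not 16 KiB
aligned, starting from X = 32 KiB the loop stops at 8 KiB, the first size
with a zero offset.

	#include <stdio.h>

	int main(void)
	{
		const unsigned long page_size_min = 4096;	/* PAGE_SIZE */
		unsigned long dma_addr = 0x102000UL;	/* 8 KiB aligned only */
		unsigned long page_sz = 32768;		/* "X" from the text */

		/* Halve X until the in-page offset becomes zero. */
		while ((dma_addr & (page_sz - 1)) && page_sz > page_size_min)
			page_sz /= 2;

		printf("page_sz = %lu\n", page_sz);	/* 8192 */
		return 0;
	}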
> > To be honest, I do not understand how I could employ
> > ib_umem_find_best_pgoff() for my purpose. I also do not see any mistake
> > in the patch, and I think you do not either.

It does exactly the same thing; it is just intended to be used by things
that are not doing the IOVA calculation. It is a matter of documentation.

> > I can make a special helper, but I do not think that it will be useful
> > to anyone. Plus, there is no better approach than halving the page
> > size, so the helper will end up with that loop under the hood. As I
> > see, mlnx also uses a loop that halves page_sz, though for a different
> > purpose, so I do not see why our code cannot do the same without a
> > special helper.

Are you sure you don't need the length check too? You have a granular size
but not a granular offset?

In that case, yes, a helper does not seem necessary.

However, you should still be calling ib_umem_find_best_pgoff() for the
initial sizing as a matter of clarity, since this is not an MR and does not
use IOVA addressing.

Jason
> From: Jason Gunthorpe <jgg@ziepe.ca>
>
> Are you sure you don't need the length check too? You have a granular
> size but not a granular offset?

Yes, we do not have constraints on the size.

> In that case, yes, a helper does not seem necessary.
>
> However, you should still be calling ib_umem_find_best_pgoff() for the
> initial sizing as a matter of clarity, since this is not an MR and does
> not use IOVA addressing.

Thanks for the clarification! I agree that the use of
ib_umem_find_best_pgoff() will make the code more understandable, even
though it will do the same computation. I have already prepared the patch
and will send it next week after running tests.
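A hedged sketch of what the reworked sizing inside
mana_ib_gd_create_dma_region() might look like after this exchange; the v2
patch is not part of this thread, so the exact shape below is an assumption
built from the two helpers discussed above.

	/* Sketch: queues need a zero offset, MRs use IOVA-based sizing. */
	if (force_zero_offset)
		page_sz = ib_umem_find_best_pgoff(umem, PAGE_SZ_BM, 0);
	else
		page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt);
	if (!page_sz) {
		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
		return -ENOMEM;
	}

With a pgoff bitmask of 0, any page size returned already yields
ib_umem_dma_offset(umem, page_sz) == 0, so the explicit halving loop from v1
becomes unnecessary.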
diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index 83d20c3f0..e35de6b92 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -48,7 +48,8 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		return err;
 	}
 
-	err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region);
+	err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region,
+					   ucmd.buf_addr, true);
 	if (err) {
 		ibdev_dbg(ibdev,
 			  "Failed to create dma region for create cq, %d\n",
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index 29dd2438d..13a4d5ab4 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -302,7 +302,7 @@ mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc,
 }
 
 int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
-				 mana_handle_t *gdma_region)
+				 mana_handle_t *gdma_region, u64 virt, bool force_zero_offset)
 {
 	struct gdma_dma_region_add_pages_req *add_req = NULL;
 	size_t num_pages_processed = 0, num_pages_to_handle;
@@ -324,11 +324,21 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
 	hwc = gc->hwc.driver_data;
 
 	/* Hardware requires dma region to align to chosen page size */
-	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
+	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt);
 	if (!page_sz) {
 		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
 		return -ENOMEM;
 	}
+
+	if (force_zero_offset) {
+		while (ib_umem_dma_offset(umem, page_sz) && page_sz > PAGE_SIZE)
+			page_sz /= 2;
+		if (ib_umem_dma_offset(umem, page_sz) != 0) {
+			ibdev_dbg(&dev->ib_dev, "failed to find page size to force zero offset.\n");
+			return -ENOMEM;
+		}
+	}
+
 	num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
 
 	max_pgs_create_cmd =
@@ -348,7 +358,7 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
 		  sizeof(struct gdma_create_dma_region_resp));
 
 	create_req->length = umem->length;
-	create_req->offset_in_page = umem->address & (page_sz - 1);
+	create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz);
 	create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT;
 	create_req->page_count = num_pages_total;
 
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index 6a03ae645..0a5a8f3f8 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -161,7 +161,7 @@ static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32
 int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq);
 
 int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
-				 mana_handle_t *gdma_region);
+				 mana_handle_t *gdma_region, u64 virt, bool force_zero_offset);
 
 int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev,
 				  mana_handle_t gdma_region);
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index ee4d4f834..856d73ea2 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -127,7 +127,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 		goto err_free;
 	}
 
-	err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle);
+	err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle, iova, false);
 	if (err) {
 		ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n",
 			  err);
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 5d4c05dcd..02de90317 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -357,8 +357,8 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	}
 	qp->sq_umem = umem;
 
-	err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem,
-					   &qp->sq_gdma_region);
+	err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem, &qp->sq_gdma_region,
+					   ucmd.sq_buf_addr, true);
 	if (err) {
 		ibdev_dbg(&mdev->ib_dev,
 			  "Failed to create dma region for create qp-raw, %d\n",
diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c
index 372d36151..d9c1a2d5d 100644
--- a/drivers/infiniband/hw/mana/wq.c
+++ b/drivers/infiniband/hw/mana/wq.c
@@ -46,7 +46,8 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 	wq->wq_buf_size = ucmd.wq_buf_size;
 	wq->rx_object = INVALID_MANA_HANDLE;
 
-	err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region);
+	err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region,
+					   ucmd.wq_buf_addr, true);
 	if (err) {
 		ibdev_dbg(&mdev->ib_dev,
 			  "Failed to create dma region for create wq, %d\n",

base-commit: aafe4cc5096996873817ff4981a3744e8caf7808
--
2.43.0