Message ID | 25d903e0136ea1e65c612d8f6b8c18c1f010add7.1684397037.git.matsuda-daisuke@fujitsu.com
---|---
State | Not Applicable
Series | On-Demand Paging on SoftRoCE
On 5/18/23 03:21, Daisuke Matsuda wrote:
> rxe_mr_copy() is used widely to copy data to/from a user MR. requester uses
> it to load payloads of requesting packets; responder uses it to process
> Send, Write, and Read operations; completer uses it to copy data from
> response packets of Read and Atomic operations to a user MR.
>
> Allow these operations to be used with ODP by adding a subordinate function
> rxe_odp_mr_copy(). It is comprised of the following steps:
>  1. Check the driver page table (umem_odp->dma_list) to see if pages being
>     accessed are present with appropriate permission.
>  2. If necessary, trigger page fault to map the pages.
>  3. Update the MR xarray using PFNs in umem_odp->pfn_list.
>  4. Execute data copy to/from the pages.
>
> umem_mutex is used to ensure that dma_list (an array of addresses of an MR)
> is not changed while it is being checked and that mapped pages are not
> invalidated before data copy completes.
>
> Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> ---
>  drivers/infiniband/sw/rxe/rxe.c     |  10 +++
>  drivers/infiniband/sw/rxe/rxe_loc.h |   8 ++
>  drivers/infiniband/sw/rxe/rxe_mr.c  |   2 +-
>  drivers/infiniband/sw/rxe/rxe_odp.c | 109 ++++++++++++++++++++++++++++
>  4 files changed, 128 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index f2284d27229b..207a022156f0 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -79,6 +79,16 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>
>  		/* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
>  		rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
> +
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV;
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
> +
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>  	}
>  }
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index 93247d123642..4b95c8c46bdc 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -206,6 +206,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
>  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
>  int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
>  			 u64 iova, int access_flags, struct rxe_mr *mr);
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> +		    enum rxe_mr_copy_dir dir);
>  #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>  static inline int
>  rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> @@ -213,6 +215,12 @@ rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>  {
>  	return -EOPNOTSUPP;
>  }
> +static inline int
> +rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
> +		int length, enum rxe_mr_copy_dir dir)
> +{
> +	return -EOPNOTSUPP;
> +}
>
>  #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index cd368cd096c8..0e3cda59d702 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mr.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -319,7 +319,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
>  	}
>
>  	if (mr->odp_enabled)
> -		return -EOPNOTSUPP;
> +		return rxe_odp_mr_copy(mr, iova, addr, length, dir);
>  	else
>  		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
>  }
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index e5497d09c399..cbe5d0c3fcc4 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -174,3 +174,112 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
>
>  	return err;
>  }
> +
> +static inline bool rxe_is_pagefault_neccesary(struct ib_umem_odp *umem_odp,
> +					      u64 iova, int length, u32 perm)
> +{
> +	int idx;
> +	u64 addr;
> +	bool need_fault = false;
> +
> +	addr = iova & (~(BIT(umem_odp->page_shift) - 1));
> +
> +	/* Skim through all pages that are to be accessed. */
> +	while (addr < iova + length) {
> +		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
> +
> +		if (!(umem_odp->dma_list[idx] & perm)) {
> +			need_fault = true;
> +			break;
> +		}
> +
> +		addr += BIT(umem_odp->page_shift);
> +	}
> +	return need_fault;
> +}
> +
> +/* umem mutex must be locked before entering this function. */
> +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	const int max_tries = 3;
> +	int cnt = 0;
> +
> +	int err;
> +	u64 perm;
> +	bool need_fault;
> +
> +	if (unlikely(length < 1)) {
> +		mutex_unlock(&umem_odp->umem_mutex);
> +		return -EINVAL;
> +	}
> +
> +	perm = ODP_READ_ALLOWED_BIT;
> +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> +		perm |= ODP_WRITE_ALLOWED_BIT;
> +
> +	/*
> +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> +	 * that all pages in the range became present. Recheck the DMA address
> +	 * array, allowing max 3 tries for pagefault.
> +	 */
> +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> +							iova, length, perm))) {
> +		if (cnt >= max_tries)
> +			break;
> +
> +		mutex_unlock(&umem_odp->umem_mutex);
> +
> +		/* umem_mutex is locked on success. */
> +		err = rxe_odp_do_pagefault(mr, iova, length, flags);
> +		if (err < 0)
> +			return err;
> +
> +		cnt++;
> +	}
> +
> +	if (need_fault)
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> +		    enum rxe_mr_copy_dir dir)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	u32 flags = 0;
> +	int err;
> +
> +	if (unlikely(!mr->odp_enabled))
> +		return -EOPNOTSUPP;
> +
> +	switch (dir) {
> +	case RXE_TO_MR_OBJ:
> +		break;
> +
> +	case RXE_FROM_MR_OBJ:
> +		flags = RXE_PAGEFAULT_RDONLY;
> +		break;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	/* If pagefault is not required, umem mutex will be held until data
> +	 * copy to the MR completes. Otherwise, it is released and locked
> +	 * again in rxe_odp_map_range() to let invalidation handler do its
> +	 * work meanwhile.
> +	 */
> +	mutex_lock(&umem_odp->umem_mutex);
> +
> +	err = rxe_odp_map_range(mr, iova, length, flags);
> +	if (err)
> +		return err;
> +
> +	err = rxe_mr_copy_xarray(mr, iova, addr, length, dir);
> +
> +	mutex_unlock(&umem_odp->umem_mutex);
> +
> +	return err;
> +}

Reviewed-by: Bob Pearson <rpearsonhpe@gmail.com>
On Thu, May 18, 2023 at 05:21:51PM +0900, Daisuke Matsuda wrote:

> +/* umem mutex must be locked before entering this function. */
> +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	const int max_tries = 3;
> +	int cnt = 0;
> +
> +	int err;
> +	u64 perm;
> +	bool need_fault;
> +
> +	if (unlikely(length < 1)) {
> +		mutex_unlock(&umem_odp->umem_mutex);
> +		return -EINVAL;
> +	}
> +
> +	perm = ODP_READ_ALLOWED_BIT;
> +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> +		perm |= ODP_WRITE_ALLOWED_BIT;
> +
> +	/*
> +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> +	 * that all pages in the range became present. Recheck the DMA address
> +	 * array, allowing max 3 tries for pagefault.
> +	 */
> +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> +							iova, length, perm))) {
> +		if (cnt >= max_tries)
> +			break;

I don't think this makes sense..

You need to make this work more like mlx5 does, you take the spinlock
on the xarray, you search it for your index and whatever is there
tells what to do. Hold the spinlock while doing the copy. This is
enough locking for the fast path.

If there is no index present, or it is not writable and you need
write, then you unlock the spinlock and prefetch the missing entry and
try again, this time also holding the mutex so there isn't a race.

You shouldn't probe into parts of the umem to discover information
already stored in the xarray then do the same lookup into the xarray.

IIRC this also needs to keep track in the xarray on a per page basis
if the page is writable.

Jason
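For readers unfamiliar with the mlx5 scheme being referenced, the following is a rough sketch of that flow, not code from this series. rxe_odp_range_present() is a hypothetical helper, and mr->page_list is assumed to be the MR xarray used by the non-ODP path; rxe_odp_do_pagefault() and rxe_mr_copy_xarray() are the helpers the patch already uses.

/*
 * Sketch only: fast path under the xarray spinlock, slow path faults the
 * pages in and retries under umem_mutex so invalidation cannot race.
 */
static int rxe_odp_mr_copy_sketch(struct rxe_mr *mr, u64 iova, void *addr,
				  int length, enum rxe_mr_copy_dir dir)
{
	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
	u32 flags = (dir == RXE_FROM_MR_OBJ) ? RXE_PAGEFAULT_RDONLY : 0;
	int err;

	/* Fast path: the xarray spinlock keeps entries stable during the copy. */
	xa_lock(&mr->page_list);
	if (rxe_odp_range_present(mr, iova, length, flags)) {
		err = rxe_mr_copy_xarray(mr, iova, addr, length, dir);
		xa_unlock(&mr->page_list);
		return err;
	}
	xa_unlock(&mr->page_list);

	/* Slow path: fault the missing or read-only pages in. As in the patch,
	 * umem_mutex is held on successful return, so the invalidation handler
	 * cannot run until the retry below has finished.
	 */
	err = rxe_odp_do_pagefault(mr, iova, length, flags);
	if (err < 0)
		return err;

	xa_lock(&mr->page_list);
	if (rxe_odp_range_present(mr, iova, length, flags))
		err = rxe_mr_copy_xarray(mr, iova, addr, length, dir);
	else
		err = -EFAULT;
	xa_unlock(&mr->page_list);

	mutex_unlock(&umem_odp->umem_mutex);
	return err;
}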
On Tue, June 13, 2023 1:23 AM Jason Gunthorpe wrote:
>
> On Thu, May 18, 2023 at 05:21:51PM +0900, Daisuke Matsuda wrote:
>
> > +/* umem mutex must be locked before entering this function. */
> > +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> > +{
> > +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> > +	const int max_tries = 3;
> > +	int cnt = 0;
> > +
> > +	int err;
> > +	u64 perm;
> > +	bool need_fault;
> > +
> > +	if (unlikely(length < 1)) {
> > +		mutex_unlock(&umem_odp->umem_mutex);
> > +		return -EINVAL;
> > +	}
> > +
> > +	perm = ODP_READ_ALLOWED_BIT;
> > +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> > +		perm |= ODP_WRITE_ALLOWED_BIT;
> > +
> > +	/*
> > +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> > +	 * that all pages in the range became present. Recheck the DMA address
> > +	 * array, allowing max 3 tries for pagefault.
> > +	 */
> > +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> > +							iova, length, perm))) {
> > +		if (cnt >= max_tries)
> > +			break;
>
> I don't think this makes sense..
>
> You need to make this work more like mlx5 does, you take the spinlock
> on the xarray, you search it for your index and whatever is there
> tells what to do. Hold the spinlock while doing the copy. This is
> enough locking for the fast path.
>
> If there is no index present, or it is not writable and you need
> write, then you unlock the spinlock and prefetch the missing entry and
> try again, this time also holding the mutex so there isn't a race.
>
> You shouldn't probe into parts of the umem to discover information
> already stored in the xarray then do the same lookup into the xarray.
>
> IIRC this also needs to keep track in the xarray on a per page basis
> if the page is writable.

Thank you for the detailed explanation.
I agree the current implementation is inefficient. I think I can fix the
patches according to your comment. I'll submit a new series once it is
complete.

Thanks,
Daisuke

>
> Jason
On Tue, June 13, 2023 1:23 AM Jason Gunthorpe wrote:
> On Thu, May 18, 2023 at 05:21:51PM +0900, Daisuke Matsuda wrote:
>
> > +/* umem mutex must be locked before entering this function. */
> > +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> > +{
> > +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> > +	const int max_tries = 3;
> > +	int cnt = 0;
> > +
> > +	int err;
> > +	u64 perm;
> > +	bool need_fault;
> > +
> > +	if (unlikely(length < 1)) {
> > +		mutex_unlock(&umem_odp->umem_mutex);
> > +		return -EINVAL;
> > +	}
> > +
> > +	perm = ODP_READ_ALLOWED_BIT;
> > +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> > +		perm |= ODP_WRITE_ALLOWED_BIT;
> > +
> > +	/*
> > +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> > +	 * that all pages in the range became present. Recheck the DMA address
> > +	 * array, allowing max 3 tries for pagefault.
> > +	 */
> > +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> > +							iova, length, perm))) {
> > +		if (cnt >= max_tries)
> > +			break;
>
> I don't think this makes sense..

I have posted the new implementation, but it is somewhat different from
your suggestion. The reason is as below.

>
> You need to make this work more like mlx5 does, you take the spinlock
> on the xarray, you search it for your index and whatever is there
> tells what to do. Hold the spinlock while doing the copy. This is
> enough locking for the fast path.

This lock should be umem mutex. Actual page-out is executed in the kernel
after rxe_ib_invalidate_range() is done. It is possible the spinlock does
not block this flow if it is locked while the invalidation is running.
The umem mutex can serialize these accesses.

>
> If there is no index present, or it is not writable and you need
> write, then you unlock the spinlock and prefetch the missing entry and
> try again, this time also holding the mutex so there isn't a race.
>
> You shouldn't probe into parts of the umem to discover information
> already stored in the xarray then do the same lookup into the xarray.
>
> IIRC this also needs to keep track in the xarray on a per page basis
> if the page is writable.

An xarray entry can hold a pointer or a value from 0 to LONG_MAX.
That is not enough to store page address and its permission. If we try
to do everything with xarray, we need to allocate a new struct for each
page that holds a pointer to a page and a value to store r/w permission.
That is inefficient in terms of memory usage and implementation.

I think the xarray can be used to check presence of pages just like we
have been doing in the non-ODP case. On the other hand, the permission
should be fetched from umem_odp->pfn_list, which is updated every time
page fault is executed.

Thanks,
Daisuke

>
> Jason
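For context on the locking argument above: in this series the MMU notifier callback, rxe_ib_invalidate_range(), takes the same umem_mutex before tearing mappings down, which is why holding that mutex across the copy fences off page-out. Below is a simplified sketch of such a handler following the common ib_umem_odp pattern, not the exact code from the series; the xarray/MR bookkeeping a real driver would also update is omitted.

static bool rxe_ib_invalidate_range_sketch(struct mmu_interval_notifier *mni,
					   const struct mmu_notifier_range *range,
					   unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	u64 start, end;

	if (!mmu_notifier_range_blockable(range))
		return false;

	/* Same mutex the copy path holds: invalidation waits for it. */
	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);

	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/* Unmap the DMA mappings; the pages can be reclaimed afterwards. */
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}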
On Fri, Sep 08, 2023 at 06:35:56AM +0000, Daisuke Matsuda (Fujitsu) wrote:

> > IIRC this also needs to keep track in the xarray on a per page basis
> > if the page is writable.
>
> An xarray entry can hold a pointer or a value from 0 to LONG_MAX.
> That is not enough to store page address and its permission.

It is, this is a page list so you know the lower 12 bits are not used
and you can encode stuff there.

> If we try to do everything with xarray, we need to allocate a new struct
> for each page that holds a pointer to a page and a value to store r/w
> permission. That is inefficient in terms of memory usage and implementation.

No, just use the lower extra bits.

> I think the xarray can be used to check presence of pages just like we
> have been doing in the non-ODP case. On the other hand, the permission
> should be fetched from umem_odp->pfn_list, which is updated every time
> page fault is executed.

Definitely not.

Jason
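A minimal illustration of the low-bit encoding being suggested (all names here are hypothetical, not from the series): because each entry describes a page-aligned frame, one spare bit is enough to record per-page write permission in the same xarray slot, with no extra per-page allocation.

#include <linux/bits.h>
#include <linux/xarray.h>

/* Hypothetical: bit 0 of the value entry carries the write permission,
 * the PFN lives in the remaining bits.
 */
#define ODP_XA_WRITABLE		BIT(0)

static int odp_xa_store_page(struct xarray *xa, unsigned long idx,
			     unsigned long pfn, bool writable)
{
	unsigned long v = (pfn << 1) | (writable ? ODP_XA_WRITABLE : 0);

	/* xa_store() returns the old entry or an xa_err()-encoded error. */
	return xa_err(xa_store(xa, idx, xa_mk_value(v), GFP_KERNEL));
}

static bool odp_xa_page_ok(struct xarray *xa, unsigned long idx,
			   bool need_write)
{
	void *entry = xa_load(xa, idx);

	if (!entry)					/* page not present */
		return false;
	if (need_write && !(xa_to_value(entry) & ODP_XA_WRITABLE))
		return false;				/* present, read-only */
	return true;
}

With this encoding, the presence check and the permission check collapse into a single xa_load(), which is the point of Jason's objection to re-probing umem_odp->pfn_list.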
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index f2284d27229b..207a022156f0 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -79,6 +79,16 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
 
 		/* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
 		rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
+
+		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
+		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV;
+		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
 	}
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 93247d123642..4b95c8c46bdc 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -206,6 +206,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
 			 u64 iova, int access_flags, struct rxe_mr *mr);
+int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+		    enum rxe_mr_copy_dir dir);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int
 rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
@@ -213,6 +215,12 @@ rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 {
 	return -EOPNOTSUPP;
 }
+static inline int
+rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
+		int length, enum rxe_mr_copy_dir dir)
+{
+	return -EOPNOTSUPP;
+}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index cd368cd096c8..0e3cda59d702 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -319,7 +319,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
 	}
 
 	if (mr->odp_enabled)
-		return -EOPNOTSUPP;
+		return rxe_odp_mr_copy(mr, iova, addr, length, dir);
 	else
 		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index e5497d09c399..cbe5d0c3fcc4 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -174,3 +174,112 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
 
 	return err;
 }
+
+static inline bool rxe_is_pagefault_neccesary(struct ib_umem_odp *umem_odp,
+					      u64 iova, int length, u32 perm)
+{
+	int idx;
+	u64 addr;
+	bool need_fault = false;
+
+	addr = iova & (~(BIT(umem_odp->page_shift) - 1));
+
+	/* Skim through all pages that are to be accessed. */
+	while (addr < iova + length) {
+		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+
+		if (!(umem_odp->dma_list[idx] & perm)) {
+			need_fault = true;
+			break;
+		}
+
+		addr += BIT(umem_odp->page_shift);
+	}
+	return need_fault;
+}
+
+/* umem mutex must be locked before entering this function. */
+static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
+{
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+	const int max_tries = 3;
+	int cnt = 0;
+
+	int err;
+	u64 perm;
+	bool need_fault;
+
+	if (unlikely(length < 1)) {
+		mutex_unlock(&umem_odp->umem_mutex);
+		return -EINVAL;
+	}
+
+	perm = ODP_READ_ALLOWED_BIT;
+	if (!(flags & RXE_PAGEFAULT_RDONLY))
+		perm |= ODP_WRITE_ALLOWED_BIT;
+
+	/*
+	 * A successful return from rxe_odp_do_pagefault() does not guarantee
+	 * that all pages in the range became present. Recheck the DMA address
+	 * array, allowing max 3 tries for pagefault.
+	 */
+	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
+							iova, length, perm))) {
+		if (cnt >= max_tries)
+			break;
+
+		mutex_unlock(&umem_odp->umem_mutex);
+
+		/* umem_mutex is locked on success. */
+		err = rxe_odp_do_pagefault(mr, iova, length, flags);
+		if (err < 0)
+			return err;
+
+		cnt++;
+	}
+
+	if (need_fault)
+		return -EFAULT;
+
+	return 0;
+}
+
+int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+		    enum rxe_mr_copy_dir dir)
+{
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+	u32 flags = 0;
+	int err;
+
+	if (unlikely(!mr->odp_enabled))
+		return -EOPNOTSUPP;
+
+	switch (dir) {
+	case RXE_TO_MR_OBJ:
+		break;
+
+	case RXE_FROM_MR_OBJ:
+		flags = RXE_PAGEFAULT_RDONLY;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	/* If pagefault is not required, umem mutex will be held until data
+	 * copy to the MR completes. Otherwise, it is released and locked
+	 * again in rxe_odp_map_range() to let invalidation handler do its
+	 * work meanwhile.
+	 */
+	mutex_lock(&umem_odp->umem_mutex);
+
+	err = rxe_odp_map_range(mr, iova, length, flags);
+	if (err)
+		return err;
+
+	err = rxe_mr_copy_xarray(mr, iova, addr, length, dir);
+
+	mutex_unlock(&umem_odp->umem_mutex);
+
+	return err;
+}
rxe_mr_copy() is used widely to copy data to/from a user MR. requester uses
it to load payloads of requesting packets; responder uses it to process
Send, Write, and Read operations; completer uses it to copy data from
response packets of Read and Atomic operations to a user MR.

Allow these operations to be used with ODP by adding a subordinate function
rxe_odp_mr_copy(). It is comprised of the following steps:
 1. Check the driver page table (umem_odp->dma_list) to see if pages being
    accessed are present with appropriate permission.
 2. If necessary, trigger page fault to map the pages.
 3. Update the MR xarray using PFNs in umem_odp->pfn_list.
 4. Execute data copy to/from the pages.

umem_mutex is used to ensure that dma_list (an array of addresses of an MR)
is not changed while it is being checked and that mapped pages are not
invalidated before data copy completes.

Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe.c     |  10 +++
 drivers/infiniband/sw/rxe/rxe_loc.h |   8 ++
 drivers/infiniband/sw/rxe/rxe_mr.c  |   2 +-
 drivers/infiniband/sw/rxe/rxe_odp.c | 109 ++++++++++++++++++++++++++++
 4 files changed, 128 insertions(+), 1 deletion(-)
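On the user-space side, the capability bits set in rxe.c above are what applications see through ibv_query_device_ex(), and an ODP-backed MR is requested with IBV_ACCESS_ON_DEMAND. A minimal sketch using the standard libibverbs API, assuming ctx and pd are an already-opened device context and protection domain, and with error reporting trimmed:

#include <infiniband/verbs.h>
#include <stdlib.h>

/* Check the advertised ODP caps, then register an ODP MR whose pages are
 * mapped on demand by the driver at first access.
 */
static struct ibv_mr *reg_odp_mr(struct ibv_context *ctx, struct ibv_pd *pd,
				 void *buf, size_t len)
{
	struct ibv_device_attr_ex attr = {};

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return NULL;

	if (!(attr.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
	    !(attr.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_WRITE))
		return NULL;	/* ODP RC Write not supported by this device */

	return ibv_reg_mr(pd, buf, len,
			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
			  IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
}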