
[RFC,3/4] RDMA/umem: Add API to return optimal HW DMA addresses from SG list

Message ID 20181019233409.1104-4-shiraz.saleem@intel.com (mailing list archive)
State Superseded
Series Introduce APIs to get DMA addresses aligned to a HW supported page size

Commit Message

Shiraz Saleem Oct. 19, 2018, 11:34 p.m. UTC
This helper iterates over the SG list and returns suitable
HW-aligned DMA addresses within a driver-supported
page size. The implementation is intended to work for
HW that supports either a single page size or mixed
page sizes in an MR. This avoids the need for
driver-specific algorithms to achieve the same thing
and for redundant walks of the SG list.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c | 68 ++++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_umem.h         | 19 ++++++++++++
 2 files changed, 87 insertions(+)
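
For context, a driver would consume the two new helpers roughly as follows
(a minimal sketch, not part of the patch; hw_supported_pgsz, mr and
driver_fill_page_table() are hypothetical driver-side details):

	struct sg_phys_iter iter;

	ib_umem_start_phys_iter(umem, &iter);
	while (ib_umem_next_phys_iter(umem, &iter, hw_supported_pgsz)) {
		/* iter.phyaddr is aligned to a supported HW page size and
		 * iter.len bytes of the MR are covered starting there.
		 */
		driver_fill_page_table(mr, iter.phyaddr, iter.len);
	}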

Comments

Jason Gunthorpe Oct. 22, 2018, 9:40 p.m. UTC | #1
On Fri, Oct 19, 2018 at 06:34:08PM -0500, Shiraz Saleem wrote:
> This helper iterates the SG list and returns suitable
> HW aligned DMA addresses within a driver supported
> page size. The implementation is intended to work for
> HW that support single page sizes or mixed page sizes
> in an MR. This avoids the need for having driver specific
> algorithms to achieve the same thing and redundant walks
> of the SG list.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/core/umem.c | 68 ++++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h         | 19 ++++++++++++
>  2 files changed, 87 insertions(+)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 04071b5..cba79ab 100644
> +++ b/drivers/infiniband/core/umem.c
> @@ -160,6 +160,74 @@ unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
>  }
>  EXPORT_SYMBOL(ib_umem_find_single_pg_size);
>  
> +void ib_umem_start_phys_iter(struct ib_umem *umem,
> +			     struct sg_phys_iter *sg_phys_iter)
> +{
> +	memset(sg_phys_iter, 0, sizeof(struct sg_phys_iter));
> +	sg_phys_iter->sg = umem->sg_head.sgl;
> +}
> +EXPORT_SYMBOL(ib_umem_start_phys_iter);

I think the supported_pgsz should be an input here; it doesn't make
sense to change the sizes during iteration, does it? Would it even
work right?

> +
> +/**
> + * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
> + * @umem: umem struct
> + * @sg_phys_iter: SG HW address iterator
> + * @supported_pgsz: bitmask of HW supported page sizes
> + *
> + * This helper iterates over the SG list and returns the HW
> + * address aligned to a supported HW page size.
> + *
> + * The algorithm differs slightly between HW that supports single
> + * page sizes vs mixed page sizes in an MR. For example, if an
> + * MR of size 4M-4K, starts at an offset PAGE_SIZE (ex: 4K) into
> + * a 2M page; HW that supports multiple page sizes (ex: 4K, 2M)
> + * would get 511 4K pages and one 2M page. Single page support
> + * HW would get back two 2M pages or 1023 4K pages.

That doesn't seem quite right: the first and last pages should always
be the largest needed to get to alignment, as HW always has an
offset/length sort of scheme to make this efficient.

So I would expect 4M-4K to always return two 2M pages if the HW
supports 2M, even if it supports smaller sizes.

For this to work the algorithm needs to know start/end...

Maybe we should not support multiple page sizes yet; do we have HW
that can even do that?
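
As an editorial aside, the offset/length scheme can be illustrated like
this (a sketch using linux/sizes.h constants; dma_addr stands in for a
2M-aligned DMA address and is not from the patch):

	/* MR of length 4M - 4K that starts 4K into a 2M-aligned buffer. */
	unsigned long buf  = dma_addr;      /* 2M-aligned start of the buffer */
	unsigned long iova = buf + SZ_4K;   /* MR begins 4K into that buffer  */
	unsigned long len  = SZ_4M - SZ_4K;

	/* With 2M pages plus a first-byte offset, HW only needs two entries
	 * to cover [iova, iova + len), i.e. [buf + 4K, buf + 4M).
	 */
	unsigned long pages[2] = { buf, buf + SZ_2M };
	unsigned long fbo      = iova - buf;    /* first byte offset = 4K */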

> + */
> +bool ib_umem_next_phys_iter(struct ib_umem *umem,
> +			    struct sg_phys_iter *sg_phys_iter,
> +			    unsigned long supported_pgsz)
> +{
> +	unsigned long pg_mask, offset;
> +	int pg_bit;

unsigned

> +
> +	if (!sg_phys_iter->sg || !supported_pgsz)
> +		return false;

> +	if (sg_phys_iter->remaining) {
> +		sg_phys_iter->phyaddr += sg_phys_iter->len;
> +	} else {
> +		sg_phys_iter->phyaddr = sg_dma_address(sg_phys_iter->sg);
> +		sg_phys_iter->remaining = sg_dma_len(sg_phys_iter->sg);

Why not put the
		sg_phys_iter->sg = sg_next(sg_phys_iter->sg);

Here?

> +	}
> +
> +	/* Single page support in MR */
> +	if (hweight_long(supported_pgsz) == 1) {
> +		pg_bit = fls64(supported_pgsz) - 1;

Then this could be precomputed once

> +	} else {
> +		/* Mixed page support in MR*/
> +		pg_bit = ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
> +					     supported_pgsz);
> +	}
> +
> +	/* page bit cannot be less than the lowest supported HW size */
> +	if (WARN_ON(pg_bit < __ffs(supported_pgsz)))
> +		return false;

and this

> +
> +	pg_mask = ~((1 << pg_bit) - 1);
> +
> +	offset = sg_phys_iter->phyaddr & ~pg_mask;
> +	sg_phys_iter->phyaddr = sg_phys_iter->phyaddr & pg_mask;
> +	sg_phys_iter->len = min_t(int, sg_phys_iter->remaining,
> +				  (1 << (pg_bit)) - offset);
> +	sg_phys_iter->remaining -= sg_phys_iter->len;
> +	if (!sg_phys_iter->remaining)
> +		sg_phys_iter->sg = sg_next(sg_phys_iter->sg);
> +
> +	return true;
> +}
> +EXPORT_SYMBOL(ib_umem_next_phys_iter);
Shiraz Saleem Oct. 25, 2018, 10:20 p.m. UTC | #2
On Mon, Oct 22, 2018 at 03:40:20PM -0600, Jason Gunthorpe wrote:
> On Fri, Oct 19, 2018 at 06:34:08PM -0500, Shiraz Saleem wrote:
> > This helper iterates the SG list and returns suitable
> > HW aligned DMA addresses within a driver supported
> > page size. The implementation is intended to work for
> > HW that support single page sizes or mixed page sizes
> > in an MR. This avoids the need for having driver specific
> > algorithms to achieve the same thing and redundant walks
> > of the SG list.
> >

[..]
> >  
> > +void ib_umem_start_phys_iter(struct ib_umem *umem,
> > +			     struct sg_phys_iter *sg_phys_iter)
> > +{
> > +	memset(sg_phys_iter, 0, sizeof(struct sg_phys_iter));
> > +	sg_phys_iter->sg = umem->sg_head.sgl;
> > +}
> > +EXPORT_SYMBOL(ib_umem_start_phys_iter);
> 
> I think the supported_pgsz should be an input here, it doesn't make
> sense to change the sizes during iteration, does it? Would it even
> work right?

Yes, the sizes shouldn't change during iteration. I'll move it here.
 
> > +
> > +/**
> > + * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
> > + * @umem: umem struct
> > + * @sg_phys_iter: SG HW address iterator
> > + * @supported_pgsz: bitmask of HW supported page sizes
> > + *
> > + * This helper iterates over the SG list and returns the HW
> > + * address aligned to a supported HW page size.
> > + *
> > + * The algorithm differs slightly between HW that supports single
> > + * page sizes vs mixed page sizes in an MR. For example, if an
> > + * MR of size 4M-4K, starts at an offset PAGE_SIZE (ex: 4K) into
> > + * a 2M page; HW that supports multiple page sizes (ex: 4K, 2M)
> > + * would get 511 4K pages and one 2M page. Single page support
> > + * HW would get back two 2M pages or 1023 4K pages.
> 
> That doesn't seem quite right, the first and last pages should always
> be the largest needed to get to alignment, as HW always has an
> offset/length sort of scheme to make this efficient.
> 
> So I would expect 4M-4k to always return two 2M pages if the HW
> supports 2M, even if supports smaller sizes.
> 
> For this to work the algorithm needs to know start/end...
> 
> Maybe we should not support multiple page sizes yet, do we have HW
> that can even do that?


Sorry, I think my comment is confusing.
There is a typo in my description: it's not multiple page sizes but mixed
page sizes. In which case, I think it should be 511 4K pages and one 2M page
for my example of 4M-4K. This is how I understood it based on your
description here.
https://patchwork.kernel.org/patch/10499753/#22186847

For HW like i40iw, which can only do a single page size in an MR, we would call
ib_umem_find_single_pg_size first and we should get back two 2M pages, except
when we have to reduce the page size to 4K due to a start-offset misalignment
between the user-space address and the physical buffer. This algorithm would
then just use the best page size passed in.

 
> > +	unsigned long pg_mask, offset;
> > +	int pg_bit;
> 
> unsigned

OK.
 
> > +
> > +	if (!sg_phys_iter->sg || !supported_pgsz)
> > +		return false;
> 
> > +	if (sg_phys_iter->remaining) {
> > +		sg_phys_iter->phyaddr += sg_phys_iter->len;
> > +	} else {
> > +		sg_phys_iter->phyaddr = sg_dma_address(sg_phys_iter->sg);
> > +		sg_phys_iter->remaining = sg_dma_len(sg_phys_iter->sg);
> 
> Why not put the
> 		sg_phys_iter->sg = sg_next(sg_phys_iter->sg);
> 
> Here?

I think we might have a problem moving it here. For example, an MR composed of
3 contiguous 4K pages starting at a 4K boundary would be represented by a single SGE.

> +	sg_phys_iter->len = min_t(int, sg_phys_iter->remaining,
> +				  (1 << (pg_bit)) - offset);
> +	sg_phys_iter->remaining -= sg_phys_iter->len;

When we execute this code, pg_bit = 12, sg_phys_iter->len = 4K and sg_phys_iter->remaining = 8K.
Since we moved the bumping of sg_phys_iter->sg here, it would be NULL on the next iteration and we would
exit without processing the remaining 8K from the SGE.
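
To spell that scenario out (an editorial trace of the posted code, not part
of the mail): a single SGE covering 12K at a 4K boundary, with pg_bit = 12:

	/* call 1: SGE loaded, remaining = 12K -> len = 4K, remaining = 8K
	 * call 2: remaining != 0 -> phyaddr += 4K, len = 4K, remaining = 4K
	 * call 3: remaining != 0 -> phyaddr += 4K, len = 4K, remaining = 0,
	 *         sg = sg_next(sg) == NULL, so the next call returns false
	 *
	 * If sg were advanced where it is first loaded instead, call 2 would
	 * already see sg == NULL and bail out with 8K still unprocessed.
	 */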
 
> > +	}
> > +
> > +	/* Single page support in MR */
> > +	if (hweight_long(supported_pgsz) == 1) {
> > +		pg_bit = fls64(supported_pgsz) - 1;
> 
> Then this could be precomputed once
> 
> > +	} else {
> > +		/* Mixed page support in MR*/
> > +		pg_bit = ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
> > +					     supported_pgsz);
> > +	}
> > +
> > +	/* page bit cannot be less than the lowest supported HW size */
> > +	if (WARN_ON(pg_bit < __ffs(supported_pgsz)))
> > +		return false;
> 
> and this


I think there is an opportunity to pre-compute, especially for the single page size case.

Shiraz
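
(Editorial sketch of that pre-computation, assuming supported_pgsz moves into
the start helper as suggested above and that a pg_bit field is added to
struct sg_phys_iter; this is not the code that was posted:)

	void ib_umem_start_phys_iter(struct ib_umem *umem,
				     struct sg_phys_iter *sg_phys_iter,
				     unsigned long supported_pgsz)
	{
		memset(sg_phys_iter, 0, sizeof(struct sg_phys_iter));
		sg_phys_iter->sg = umem->sg_head.sgl;

		/* For HW with a single supported page size the page bit never
		 * changes, so compute it once here instead of per iteration.
		 */
		if (hweight_long(supported_pgsz) == 1)
			sg_phys_iter->pg_bit = fls64(supported_pgsz) - 1;
	}

ib_umem_next_phys_iter() would then only call ib_umem_find_pg_bit() when more
than one page size is supported.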
Jason Gunthorpe Oct. 26, 2018, 2:40 a.m. UTC | #3
On Thu, Oct 25, 2018 at 05:20:43PM -0500, Shiraz Saleem wrote:
> > > +
> > > +/**
> > > + * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
> > > + * @umem: umem struct
> > > + * @sg_phys_iter: SG HW address iterator
> > > + * @supported_pgsz: bitmask of HW supported page sizes
> > > + *
> > > + * This helper iterates over the SG list and returns the HW
> > > + * address aligned to a supported HW page size.
> > > + *
> > > + * The algorithm differs slightly between HW that supports single
> > > + * page sizes vs mixed page sizes in an MR. For example, if an
> > > + * MR of size 4M-4K, starts at an offset PAGE_SIZE (ex: 4K) into
> > > + * a 2M page; HW that supports multiple page sizes (ex: 4K, 2M)
> > > + * would get 511 4K pages and one 2M page. Single page support
> > > + * HW would get back two 2M pages or 1023 4K pages.
> > 
> > That doesn't seem quite right, the first and last pages should always
> > be the largest needed to get to alignment, as HW always has an
> > offset/length sort of scheme to make this efficient.
> > 
> > So I would expect 4M-4k to always return two 2M pages if the HW
> > supports 2M, even if supports smaller sizes.
> > 
> > For this to work the algorithm needs to know start/end...
> > 
> > Maybe we should not support multiple page sizes yet, do we have HW
> > that can even do that?
> 
> 
> Sorry. I think my comment is confusing.
> There is a typo in my description. Its not multiple page size but mixed
> page size. In which case, I think it should be 511 4K and one 2M page
> for my example of 4M-4k. This is how I understood it based on your
> description here.
> https://patchwork.kernel.org/patch/10499753/#22186847

It depends on whether the HW can do the start/end offset thing, i.e. if you can
start and end on a partial page size.

Since IB requires this, I assume that all HW can do it for all page
sizes, and we can pretty much ignore the alignment of the start of the
first sgl and the end of the last sgl.

Jason
Shiraz Saleem Oct. 30, 2018, 11:25 p.m. UTC | #4
On Thu, Oct 25, 2018 at 08:40:29PM -0600, Jason Gunthorpe wrote:
> On Thu, Oct 25, 2018 at 05:20:43PM -0500, Shiraz Saleem wrote:
> > > > +
> > > > +/**
> > > > + * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
> > > > + * @umem: umem struct
> > > > + * @sg_phys_iter: SG HW address iterator
> > > > + * @supported_pgsz: bitmask of HW supported page sizes
> > > > + *
> > > > + * This helper iterates over the SG list and returns the HW
> > > > + * address aligned to a supported HW page size.
> > > > + *
> > > > + * The algorithm differs slightly between HW that supports single
> > > > + * page sizes vs mixed page sizes in an MR. For example, if an
> > > > + * MR of size 4M-4K, starts at an offset PAGE_SIZE (ex: 4K) into
> > > > + * a 2M page; HW that supports multiple page sizes (ex: 4K, 2M)
> > > > + * would get 511 4K pages and one 2M page. Single page support
> > > > + * HW would get back two 2M pages or 1023 4K pages.
> > > 
> > > That doesn't seem quite right, the first and last pages should always
> > > be the largest needed to get to alignment, as HW always has an
> > > offset/length sort of scheme to make this efficient.
> > > 
> > > So I would expect 4M-4k to always return two 2M pages if the HW
> > > supports 2M, even if supports smaller sizes.
> > > 
> > > For this to work the algorithm needs to know start/end...
> > > 
> > > Maybe we should not support multiple page sizes yet, do we have HW
> > > that can even do that?
> > 
> > 
> > Sorry. I think my comment is confusing.
> > There is a typo in my description. Its not multiple page size but mixed
> > page size. In which case, I think it should be 511 4K and one 2M page
> > for my example of 4M-4k. This is how I understood it based on your
> > description here.
> > https://patchwork.kernel.org/patch/10499753/#22186847
> 
> I depends if the HW can do the start/end offset thing, ie if you can
> start and end on a partial page size.
> 
> Since IB requires this I assume that all HW can do it for all page
> sizes, and we can pretty much ignore the alignment of the start of the
> first sgl and the end of the last sgl.
> 

OK. I will tweak the algorithm accordingly.
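
(Editorial sketch of one way such a tweak could look: pick a single best page
bit up front from the IOVA and the SGE boundaries, then let the iterator slice
at that fixed granularity. This is a conservative sketch that does not yet
relax the first/last SGE edges as discussed above, and best_supported_pgsz()
is a hypothetical helper, not the follow-up that was actually posted:)

	static unsigned long best_supported_pgsz(struct ib_umem *umem,
						 unsigned long supported_pgsz,
						 unsigned long iova)
	{
		struct scatterlist *sg;
		unsigned long mask = iova;
		int i;

		/* Any set low bit in an SGE address, an SGE length or the
		 * IOVA limits the usable page size for the whole MR.
		 */
		for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
			mask |= sg_dma_address(sg) | sg_dma_len(sg);

		if (!mask)
			return rounddown_pow_of_two(supported_pgsz);

		/* Largest supported size no bigger than the common alignment
		 * (assumes supported_pgsz includes PAGE_SIZE, so the masked
		 * value is never zero).
		 */
		return rounddown_pow_of_two(supported_pgsz &
					    GENMASK(__ffs(mask), 0));
	}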

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 04071b5..cba79ab 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -160,6 +160,74 @@  unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
 }
 EXPORT_SYMBOL(ib_umem_find_single_pg_size);
 
+void ib_umem_start_phys_iter(struct ib_umem *umem,
+			     struct sg_phys_iter *sg_phys_iter)
+{
+	memset(sg_phys_iter, 0, sizeof(struct sg_phys_iter));
+	sg_phys_iter->sg = umem->sg_head.sgl;
+}
+EXPORT_SYMBOL(ib_umem_start_phys_iter);
+
+/**
+ * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
+ * @umem: umem struct
+ * @sg_phys_iter: SG HW address iterator
+ * @supported_pgsz: bitmask of HW supported page sizes
+ *
+ * This helper iterates over the SG list and returns the HW
+ * address aligned to a supported HW page size.
+ *
+ * The algorithm differs slightly between HW that supports single
+ * page sizes vs mixed page sizes in an MR. For example, if an
+ * MR of size 4M-4K, starts at an offset PAGE_SIZE (ex: 4K) into
+ * a 2M page; HW that supports multiple page sizes (ex: 4K, 2M)
+ * would get 511 4K pages and one 2M page. Single page support
+ * HW would get back two 2M pages or 1023 4K pages.
+ */
+bool ib_umem_next_phys_iter(struct ib_umem *umem,
+			    struct sg_phys_iter *sg_phys_iter,
+			    unsigned long supported_pgsz)
+{
+	unsigned long pg_mask, offset;
+	int pg_bit;
+
+	if (!sg_phys_iter->sg || !supported_pgsz)
+		return false;
+
+	if (sg_phys_iter->remaining) {
+		sg_phys_iter->phyaddr += sg_phys_iter->len;
+	} else {
+		sg_phys_iter->phyaddr = sg_dma_address(sg_phys_iter->sg);
+		sg_phys_iter->remaining = sg_dma_len(sg_phys_iter->sg);
+	}
+
+	/* Single page support in MR */
+	if (hweight_long(supported_pgsz) == 1) {
+		pg_bit = fls64(supported_pgsz) - 1;
+	} else {
+		/* Mixed page support in MR*/
+		pg_bit = ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
+					     supported_pgsz);
+	}
+
+	/* page bit cannot be less than the lowest supported HW size */
+	if (WARN_ON(pg_bit < __ffs(supported_pgsz)))
+		return false;
+
+	pg_mask = ~((1 << pg_bit) - 1);
+
+	offset = sg_phys_iter->phyaddr & ~pg_mask;
+	sg_phys_iter->phyaddr = sg_phys_iter->phyaddr & pg_mask;
+	sg_phys_iter->len = min_t(int, sg_phys_iter->remaining,
+				  (1 << (pg_bit)) - offset);
+	sg_phys_iter->remaining -= sg_phys_iter->len;
+	if (!sg_phys_iter->remaining)
+		sg_phys_iter->sg = sg_next(sg_phys_iter->sg);
+
+	return true;
+}
+EXPORT_SYMBOL(ib_umem_next_phys_iter);
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 24ba6c6..8114fd1 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -55,6 +55,13 @@  struct ib_umem {
 	int             npages;
 };
 
+struct sg_phys_iter {
+	struct scatterlist *sg;
+	unsigned long phyaddr;
+	size_t len;
+	unsigned int remaining;
+};
+
 /* Returns the offset of the umem start relative to the first page. */
 static inline int ib_umem_offset(struct ib_umem *umem)
 {
@@ -88,6 +95,11 @@  int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		      size_t length);
 unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
 					  unsigned long supported_pgsz);
+void ib_umem_start_phys_iter(struct ib_umem *umem,
+			     struct sg_phys_iter *sg_phys_iter);
+bool ib_umem_next_phys_iter(struct ib_umem *umem,
+			    struct sg_phys_iter *sg_phys_iter,
+			    unsigned long supported_pgsz);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -108,6 +120,13 @@  static inline int ib_umem_find_single_pg_size(struct ib_umem *umem,
 					      unsigned long supported_pgsz) {
 	return -EINVAL;
 }
+static inline void ib_umem_start_phys_iter(struct ib_umem *umem,
+					   struct sg_phys_iter *sg_phys_iter) { }
+static inline bool ib_umem_next_phys_iter(struct ib_umem *umem,
+					  struct sg_phys_iter *sg_phys_iter,
+					  unsigned long supported_pgsz) {
+	return false;
+}
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */