
[RFC,2/4] RDMA/umem: Add API to find best driver supported page size in an MR

Message ID: 20181019233409.1104-3-shiraz.saleem@intel.com
State: Superseded
Series: Introduce APIs to get DMA addresses aligned to a HW supported page size

Commit Message

Shiraz Saleem Oct. 19, 2018, 11:34 p.m. UTC
This helper iterates through the SG list to find the best page size to use
from a bitmap of HW supported page sizes. Drivers that support multiple
page sizes, but not mixed pages in an MR can call this API.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c | 95 ++++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_umem.h         |  7 ++++
 2 files changed, 102 insertions(+)
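
For context, a minimal sketch of how a driver might consume the new helper once the
umem is pinned. This is illustrative only: the SZ_4K | SZ_2M | SZ_1G bitmap and the
example_* name are made up and not taken from any real driver.

/* Hypothetical driver-side sketch (not part of this patch): pick one
 * HW page size for an already-pinned MR.
 */
#include <linux/errno.h>
#include <linux/sizes.h>
#include <rdma/ib_umem.h>

static int example_pick_mr_page_size(struct ib_umem *umem,
				     unsigned long *pg_size)
{
	/* bitmap of page sizes this (imaginary) device can program per MR */
	unsigned long pgsz = ib_umem_find_single_pg_size(umem,
							 SZ_4K | SZ_2M | SZ_1G);

	if (!pgsz)	/* only happens if the bitmap passed in were empty */
		return -EINVAL;

	*pg_size = pgsz;	/* e.g. SZ_2M if the whole SGL allows it */
	return 0;
}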

Comments

Jason Gunthorpe Oct. 22, 2018, 9:32 p.m. UTC | #1
On Fri, Oct 19, 2018 at 06:34:07PM -0500, Shiraz Saleem wrote:
> This helper iterates through the SG list to find the best page size to use
> from a bitmap of HW supported page sizes. Drivers that support multiple
> page sizes, but not mixed pages in an MR can call this API.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/core/umem.c | 95 ++++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h         |  7 ++++
>  2 files changed, 102 insertions(+)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 486d6d7..04071b5 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -66,6 +66,101 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
>  }
>  
>  /**
> + * ib_umem_find_pg_bit - Find the page bit to use for phyaddr
> + *
> + * @phyaddr: Physical address after DMA translation
> + * @supported_pgsz: bitmask of HW supported page sizes
> + */
> +static int ib_umem_find_pg_bit(unsigned long phyaddr,
> +			       unsigned long supported_pgsz)

This returns an unsigned value

> +{
> +	unsigned long num_zeroes;
> +	int pg_bit;

unsigned

> +
> +	/* Trailing zero bits in the address */
> +	num_zeroes = __ffs(phyaddr);
> +
> +	/* Find page bit such that phyaddr is aligned to the highest supported
> +	 * HW page size
> +	 */
> +	pg_bit = fls64(supported_pgsz & (BIT_ULL(num_zeroes + 1) - 1)) - 1;
> +
> +	return pg_bit;
> +}
> +
> +/**
> + * ib_umem_find_single_pg_size - Find best HW page size to use for this MR
> + * @umem: umem struct
> + * @supported_pgsz: bitmask of HW supported page sizes
> + *
> + * This helper is intended for HW that support multiple page
> + * sizes but can do only a single page size in an MR.
> + */
> +unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
> +					  unsigned long supported_pgsz)
> +{
> +	struct scatterlist *sg;
> +	unsigned long dma_addr_start, dma_addr_end;
> +	unsigned long uvirt_offset, phy_offset;
> +	unsigned long pg_mask, bitmap;
> +	int pg_bit_start, pg_bit_end, pg_bit_sg_chunk;
> +	int lowest_pg_bit, best_pg_bit;
> +	int i;
> +
> +	if (!supported_pgsz)
> +		return 0;
> +
> +	lowest_pg_bit = __ffs(supported_pgsz);
> +	best_pg_bit = fls64(supported_pgsz) - 1;
> +
> +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.orig_nents, i) {
> +		dma_addr_start = sg_dma_address(sg);
> +		dma_addr_end = sg_dma_address(sg) + sg_dma_len(sg);
> +		pg_bit_start = ib_umem_find_pg_bit(dma_addr_start, supported_pgsz);
> +		pg_bit_end = ib_umem_find_pg_bit(dma_addr_end, supported_pgsz);

This seems convoluted.. 

Shouldn't this just be another ffs on sg_dma_len() to compute the
maximum possible page size?

	pg_bit_start = ib_umem_find_pg_bit(dma_addr_start,
   	   supported_pgsz & (BIT_ULL(__ffs(sg_dma_len(sg) + 1)) - 1));

Or so? then bit_end isn't needed and nor is the while loop below
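
Reading the suggestion as "keep only page sizes that the chunk length is a multiple
of" (i.e. with the +1 outside the __ffs() call), a sketch of the per-SGE computation
might look like the below, written as if it sat next to ib_umem_find_pg_bit() above;
example_sg_pg_bit() is a made-up name.

/* Sketch of the suggested simplification (illustrative only): restrict
 * the supported-size bitmap to sizes the chunk length is aligned to,
 * then let the existing helper also account for the start address.
 */
static int example_sg_pg_bit(struct scatterlist *sg,
			     unsigned long supported_pgsz)
{
	/* sizes up to 2^__ffs(len), i.e. sizes the length is a multiple of */
	unsigned long len_limited = supported_pgsz &
				    (BIT_ULL(__ffs(sg_dma_len(sg)) + 1) - 1);

	/* if no supported size divides the length this propagates the
	 * helper's -1, so a real caller would have to guard that case
	 */
	return ib_umem_find_pg_bit(sg_dma_address(sg), len_limited);
}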

> +		if (!i) {
> +			pg_bit_sg_chunk = max_t(int, pg_bit_start, pg_bit_end);
> +			bitmap = supported_pgsz;
> +			/* The start offset of the MR into a first _large_ page
> +			 * should line up exactly for the user-space virtual buf
> +			 * and physical buffer, in order to upgrade the page bit
> +			 */
> +			while (pg_bit_sg_chunk > PAGE_SHIFT) {

Hurm.. This doesn't look quite right.

This is a HW restriction where we must have RDMA_ADDR & PAGE_SIZE-1 ==
0, for all the interior pages, right? ie the HW is computing the
offset into the DMA list naively?

First of all, umem->address is the wrong thing to use: the RDMA
virtual MR base address could be zero (i.e. IB_ZERO_BASED was used).
There should be an accessor helper for this.

Second this should only be checked once, not every loop. If we know
the RDMA MR base address, the length of the first sgl, and the page
size, we can do a single computation to reduce the page size to make
it sufficiently aligned to the MR base.
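
As a hedged sketch of that single computation (not part of this patch;
ib_umem_start_va() is a hypothetical accessor standing in for "umem->address, or the
IOVA base when IB_ZERO_BASED was used"):

/* Clamp the candidate page bit once so that the MR base VA and the
 * start of the first chunk agree in their low bits (i.e. the virtual
 * and physical offsets into the first large page match).
 */
static unsigned int example_limit_by_va(struct ib_umem *umem,
					unsigned long first_dma_addr,
					unsigned int pg_bit)
{
	unsigned long va = ib_umem_start_va(umem);	/* hypothetical */
	unsigned long diff = va ^ (first_dma_addr + ib_umem_offset(umem));

	if (!diff)
		return pg_bit;	/* offsets line up at every size */

	/* Offsets agree modulo 2^k exactly when the low k bits of diff
	 * are zero, i.e. k <= __ffs(diff); the caller still rounds the
	 * result down to a size present in the supported bitmap.
	 */
	return min_t(unsigned int, pg_bit, __ffs(diff));
}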

> +				pg_mask = ~((1 << pg_bit_sg_chunk) - 1);
> +				uvirt_offset = umem->address & ~pg_mask;
> +				phy_offset = (dma_addr_start + ib_umem_offset(umem)) &
> +					      ~pg_mask;
> +				if (uvirt_offset == phy_offset)
> +					break;
> +
> +				/* Retry with next supported page size */
> +				clear_bit(pg_bit_sg_chunk, &bitmap);
> +				pg_bit_sg_chunk = fls64(bitmap) - 1;
> +			}
> +		} else if (i == (umem->sg_head.orig_nents - 1)) {
> +			/* last SG chunk: Does not matter if MR ends at an
> +			 * unaligned offset.
> +			 */
> +			pg_bit_sg_chunk = pg_bit_start;
> +		} else {
> +			pg_bit_sg_chunk = min_t(int, pg_bit_start, pg_bit_end);
> +		}
> +
> +		best_pg_bit = min_t(int, best_pg_bit, pg_bit_sg_chunk);
> +		if (best_pg_bit == lowest_pg_bit)
> +			break;
> +	}

In an ideal world we would build a bitmap of the possible page sizes
in the SGL at the same time as inserting pages into the scatter table,
then this second (rather expensive looking) iteration would not be
needed..

But get_user_pages is so expensive already this overhead is probably
OK for now.
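
A rough sketch of that ideal-world bookkeeping, for illustration only (the
example_* names are made up, and it ignores the first/last-chunk relaxations
discussed above): keep a running alignment mask while chunks are added to the
scatter table, then derive the page-size bitmap from it at the end.

struct example_pgsz_accum {
	unsigned long align_mask;	/* OR of every chunk address and length */
};

/* called wherever a DMA chunk is added to the umem's scatter table */
static void example_accum_chunk(struct example_pgsz_accum *acc,
				unsigned long dma_addr, unsigned long len)
{
	/* each chunk start and length constrains the usable page size */
	acc->align_mask |= dma_addr | len;
}

/* supported page sizes compatible with everything seen so far, as a bitmap */
static unsigned long example_accum_pgsz(const struct example_pgsz_accum *acc,
					unsigned long supported_pgsz)
{
	if (!acc->align_mask)
		return supported_pgsz;

	/* largest usable page bit is the lowest set bit in the mask */
	return supported_pgsz & (BIT_ULL(__ffs(acc->align_mask) + 1) - 1);
}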

Jason
Shiraz Saleem Oct. 25, 2018, 12:05 a.m. UTC | #2
On Mon, Oct 22, 2018 at 03:32:23PM -0600, Jason Gunthorpe wrote:
> On Fri, Oct 19, 2018 at 06:34:07PM -0500, Shiraz Saleem wrote:
> > This helper iterates through the SG list to find the best page size to use
> > from a bitmap of HW supported page sizes. Drivers that support multiple
> > page sizes, but not mixed pages in an MR can call this API.
> > 
> > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> >  drivers/infiniband/core/umem.c | 95 ++++++++++++++++++++++++++++++++++++++++++
> >  include/rdma/ib_umem.h         |  7 ++++
> >  2 files changed, 102 insertions(+)
> > 
> > +static int ib_umem_find_pg_bit(unsigned long phyaddr,
> > +			       unsigned long supported_pgsz)
> 
> This returns an unsigned value
> 
> > +{
> > +	unsigned long num_zeroes;
> > +	int pg_bit;
> 
> unsigned

Right.

[..] 
> > +
> > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.orig_nents, i) {
> > +		dma_addr_start = sg_dma_address(sg);
> > +		dma_addr_end = sg_dma_address(sg) + sg_dma_len(sg);
> > +		pg_bit_start = ib_umem_find_pg_bit(dma_addr_start, supported_pgsz);
> > +		pg_bit_end = ib_umem_find_pg_bit(dma_addr_end, supported_pgsz);
> 
> This seems convoluted.. 
> 
> Shouldn't this just be another ffs on sg_dma_len() to compute the
> maximum possible page size?
> 
> 	pg_bit_start = ib_umem_find_pg_bit(dma_addr_start,
>    	   supported_pgsz & (BIT_ULL(__ffs(sg_dma_len(sg) + 1)) - 1));
> 
> Or so? then bit_end isn't needed and nor is the while loop below

Looks simpler. But would it work correctly for the last SGE?
For HW that supports 2M and 4K, for example, if the last SGE starts at a 2M-aligned addr
and has a length of 200 4K pages, the pg_bit computed by the equation above would be 12
when it should be 21. The same applies to the first SGE, I think.
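
Working the numbers for that example (taking supported_pgsz = BIT(12) | BIT(21) for
4K + 2M support, and reading the suggested mask with the +1 outside __ffs()):

  sg_dma_len = 200 * 4096 = 0xC8000, so __ffs(sg_dma_len) = 15
  BIT_ULL(15 + 1) - 1 = 0xFFFF, and (BIT(12) | BIT(21)) & 0xFFFF = BIT(12)
  fls64(BIT(12)) - 1 = 12, even though the 2M-aligned start would allow 21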
 
> > +		if (!i) {
> > +			pg_bit_sg_chunk = max_t(int, pg_bit_start, pg_bit_end);
> > +			bitmap = supported_pgsz;
> > +			/* The start offset of the MR into a first _large_ page
> > +			 * should line up exactly for the user-space virtual buf
> > +			 * and physical buffer, in order to upgrade the page bit
> > +			 */
> > +			while (pg_bit_sg_chunk > PAGE_SHIFT) {
> 
> Hurm.. This doesn't look quite right.
> 
> This is a HW restriction where we must have RDMA_ADDR & PAGE_SIZE-1 ==
> 0, for all the interior pages, right? ie the HW is computing the
> offset into the DMA list naively?

Yes. For VA-based MRs, the least significant bits of the VA (12 bits for a 4K page
size, 21 bits for a 2M page size, etc.) indicate the offset.

> First of all, umem->address is the wrong thing to use: the RDMA
> virtual MR base address could be zero (i.e. IB_ZERO_BASED was used).
> There should be an accessor helper for this.

Yes. Zero-based VA was not considered.

> Second this should only be checked once, not every loop. If we know
> the RDMA MR base address, the length of the first sgl, and the page
> size, we can do a single computation to reduce the page size to make
> it sufficiently aligned to the MR base.

I am not on this page yet (no pun intended :)). But I will spend some
time figuring out how this can be done.

Shiraz
Jason Gunthorpe Oct. 26, 2018, 2:34 a.m. UTC | #3
On Wed, Oct 24, 2018 at 07:05:11PM -0500, Shiraz Saleem wrote:
> On Mon, Oct 22, 2018 at 03:32:23PM -0600, Jason Gunthorpe wrote:
> > On Fri, Oct 19, 2018 at 06:34:07PM -0500, Shiraz Saleem wrote:
> > > This helper iterates through the SG list to find the best page size to use
> > > from a bitmap of HW supported page sizes. Drivers that support multiple
> > > page sizes, but not mixed pages in an MR can call this API.
> > > 
> > > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > >  drivers/infiniband/core/umem.c | 95 ++++++++++++++++++++++++++++++++++++++++++
> > >  include/rdma/ib_umem.h         |  7 ++++
> > >  2 files changed, 102 insertions(+)
> > > 
> > > +static int ib_umem_find_pg_bit(unsigned long phyaddr,
> > > +			       unsigned long supported_pgsz)
> > 
> > This returns an unsigned value
> > 
> > > +{
> > > +	unsigned long num_zeroes;
> > > +	int pg_bit;
> > 
> > unsigned
> 
> Right.
> 
> [..] 
> > > +
> > > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.orig_nents, i) {
> > > +		dma_addr_start = sg_dma_address(sg);
> > > +		dma_addr_end = sg_dma_address(sg) + sg_dma_len(sg);
> > > +		pg_bit_start = ib_umem_find_pg_bit(dma_addr_start, supported_pgsz);
> > > +		pg_bit_end = ib_umem_find_pg_bit(dma_addr_end, supported_pgsz);
> > 
> > This seems convoluted.. 
> > 
> > Shouldn't this just be another ffs on sg_dma_len() to compute the
> > maximum possible page size?
> > 
> > 	pg_bit_start = ib_umem_find_pg_bit(dma_addr_start,
> >    	   supported_pgsz & (BIT_ULL(__ffs(sg_dma_len(sg) + 1)) - 1));
> > 
> > Or so? then bit_end isn't needed and nor is the while loop below
> 
> Looks simpler. But would it work correctly for the last SGE?

No, the last SGE checks alignment of the start only.

The first SGE checks alignment of the end only (presumably? It depends on the HW,
I guess).

> > > +		if (!i) {
> > > +			pg_bit_sg_chunk = max_t(int, pg_bit_start, pg_bit_end);
> > > +			bitmap = supported_pgsz;
> > > +			/* The start offset of the MR into a first _large_ page
> > > +			 * should line up exactly for the user-space virtual buf
> > > +			 * and physical buffer, in order to upgrade the page bit
> > > +			 */
> > > +			while (pg_bit_sg_chunk > PAGE_SHIFT) {
> > 
> > Hurm.. This doesn't look quite right.
> > 
> > This is a HW restriction where we must have RDMA_ADDR & PAGE_SIZE-1 ==
> > 0, for all the interior pages, right? ie the HW is computing the
> > offset into the DMA list naively?
> 
> Yes. For VA-based MRs, the least significant bits of the VA (12 bits for a 4K page
> size, 21 bits for a 2M page size, etc.) indicate the offset.

I wonder if all HW works this way or if we need a flag.

> > First of all, umem->address is the wrong thing to use: the RDMA
> > virtual MR base address could be zero (i.e. IB_ZERO_BASED was used).
> > There should be an accessor helper for this.
> 
> Yes. Zero-based VA was not considered.
> 
> > Second this should only be checked once, not every loop. If we know
> > the RDMA MR base address, the length of the first sgl, and the page
> > size, we can do a single computation to reduce the page size to make
> > it sufficiently aligned to the MR base.
> 
> I am not on this page yet (no pun intended :)). But I will spend some
> time figuring out how this can be done.

Basically, compute the 'end' of the first page in virtual space;
that is the initial minimum alignment requirement.

Ah.. but I suppose this has to be checked again any time the page size
is reduced, since the page forcing the reduction may not satisfy the
requirement.

Still, this restriction should be doable without the inner while loop.
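
One way to fold all of these constraints into a single pass with no inner while
loop, as a hedged sketch only (ib_umem_start_va() is a hypothetical accessor for the
ZERO_BASED case, and this is not the code in this patch):

/* One pass, one running mask.  Every constraint -- VA/PA offsets
 * matching at the MR start, and every non-final chunk ending on a
 * page boundary -- is "these low bits must be zero", so reducing the
 * page size never needs to be re-checked against the VA requirement.
 */
static unsigned long example_single_pg_size(struct ib_umem *umem,
					    unsigned long supported_pgsz)
{
	unsigned long va = ib_umem_start_va(umem);	/* hypothetical */
	unsigned long pgoff = ib_umem_offset(umem);	/* offset into first page */
	unsigned long mask = 0;
	struct scatterlist *sg;
	int i;

	if (!supported_pgsz)
		return 0;

	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.orig_nents, i) {
		/* bits where VA and DMA address disagree rule out the
		 * corresponding and all larger page sizes
		 */
		mask |= (sg_dma_address(sg) + pgoff) ^ va;
		va += sg_dma_len(sg) - pgoff;
		/* every chunk end except the last must also be aligned */
		if (i != umem->sg_head.orig_nents - 1)
			mask |= va;
		pgoff = 0;
	}

	/* keep only supported sizes whose low bits are clear in 'mask' */
	if (mask)
		supported_pgsz &= BIT_ULL(__ffs(mask) + 1) - 1;

	return supported_pgsz ? BIT_ULL(fls64(supported_pgsz) - 1) : 0;
}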

Jason

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 486d6d7..04071b5 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -66,6 +66,101 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 }
 
 /**
+ * ib_umem_find_pg_bit - Find the page bit to use for phyaddr
+ *
+ * @phyaddr: Physical address after DMA translation
+ * @supported_pgsz: bitmask of HW supported page sizes
+ */
+static int ib_umem_find_pg_bit(unsigned long phyaddr,
+			       unsigned long supported_pgsz)
+{
+	unsigned long num_zeroes;
+	int pg_bit;
+
+	/* Trailing zero bits in the address */
+	num_zeroes = __ffs(phyaddr);
+
+	/* Find page bit such that phyaddr is aligned to the highest supported
+	 * HW page size
+	 */
+	pg_bit = fls64(supported_pgsz & (BIT_ULL(num_zeroes + 1) - 1)) - 1;
+
+	return pg_bit;
+}
+
+/**
+ * ib_umem_find_single_pg_size - Find best HW page size to use for this MR
+ * @umem: umem struct
+ * @supported_pgsz: bitmask of HW supported page sizes
+ *
+ * This helper is intended for HW that support multiple page
+ * sizes but can do only a single page size in an MR.
+ */
+unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
+					  unsigned long supported_pgsz)
+{
+	struct scatterlist *sg;
+	unsigned long dma_addr_start, dma_addr_end;
+	unsigned long uvirt_offset, phy_offset;
+	unsigned long pg_mask, bitmap;
+	int pg_bit_start, pg_bit_end, pg_bit_sg_chunk;
+	int lowest_pg_bit, best_pg_bit;
+	int i;
+
+	if (!supported_pgsz)
+		return 0;
+
+	lowest_pg_bit = __ffs(supported_pgsz);
+	best_pg_bit = fls64(supported_pgsz) - 1;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.orig_nents, i) {
+		dma_addr_start = sg_dma_address(sg);
+		dma_addr_end = sg_dma_address(sg) + sg_dma_len(sg);
+		pg_bit_start = ib_umem_find_pg_bit(dma_addr_start, supported_pgsz);
+		pg_bit_end = ib_umem_find_pg_bit(dma_addr_end, supported_pgsz);
+
+		if (!i) {
+			pg_bit_sg_chunk = max_t(int, pg_bit_start, pg_bit_end);
+			bitmap = supported_pgsz;
+			/* The start offset of the MR into a first _large_ page
+			 * should line up exactly for the user-space virtual buf
+			 * and physical buffer, in order to upgrade the page bit
+			 */
+			while (pg_bit_sg_chunk > PAGE_SHIFT) {
+				pg_mask = ~((1 << pg_bit_sg_chunk) - 1);
+				uvirt_offset = umem->address & ~pg_mask;
+				phy_offset = (dma_addr_start + ib_umem_offset(umem)) &
+					      ~pg_mask;
+				if (uvirt_offset == phy_offset)
+					break;
+
+				/* Retry with next supported page size */
+				clear_bit(pg_bit_sg_chunk, &bitmap);
+				pg_bit_sg_chunk = fls64(bitmap) - 1;
+			}
+		} else if (i == (umem->sg_head.orig_nents - 1)) {
+			/* last SG chunk: Does not matter if MR ends at an
+			 * unaligned offset.
+			 */
+			pg_bit_sg_chunk = pg_bit_start;
+		} else {
+			pg_bit_sg_chunk = min_t(int, pg_bit_start, pg_bit_end);
+		}
+
+		best_pg_bit = min_t(int, best_pg_bit, pg_bit_sg_chunk);
+		if (best_pg_bit == lowest_pg_bit)
+			break;
+	}
+
+	/* best page bit cannot be less than the lowest supported HW size */
+	if (best_pg_bit < lowest_pg_bit)
+		return BIT_ULL(lowest_pg_bit);
+
+	return BIT_ULL(best_pg_bit);
+}
+EXPORT_SYMBOL(ib_umem_find_single_pg_size);
+
+/**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
  * If access flags indicate ODP memory, avoid pinning. Instead, stores
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 5d3755e..24ba6c6 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -86,6 +86,8 @@ void ib_umem_release(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
 int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		      size_t length);
+unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
+					  unsigned long supported_pgsz);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -102,6 +104,11 @@ static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offs
 		      		    size_t length) {
 	return -EINVAL;
 }
+static inline int ib_umem_find_single_pg_size(struct ib_umem *umem,
+					      unsigned long supported_pgsz) {
+	return -EINVAL;
+}
+
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
 #endif /* IB_UMEM_H */