[rdma-next,v1,3/6] RDMA/umem: Add API to return aligned memory blocks from SGL

Message ID: 20190219145745.13476-4-shiraz.saleem@intel.com (mailing list archive)
State: Changes Requested
Delegated to: Jason Gunthorpe
Series: Add APIs to get contiguous memory blocks aligned to a HW supported page size

Commit Message

Shiraz Saleem Feb. 19, 2019, 2:57 p.m. UTC
This helper iterates over the SG list and returns contiguous
memory blocks aligned to a HW supported page size.
The implementation is intended to work for HW that supports
single page sizes or mixed page sizes in an MR. Drivers
can use this helper to retrieve the DMA addresses aligned
to their best supported page size.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c | 90 ++++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_umem.h         | 23 +++++++++++
 2 files changed, 113 insertions(+)
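
As a rough illustration of how a driver would consume this API, here is a
minimal sketch assuming the interface exactly as posted in this revision;
the driver function, page array, and supported page-size mask below are
hypothetical:

static int drv_build_page_list(struct ib_umem *umem, u64 *pages,
			       unsigned long drv_supported_pgsz)
{
	struct sg_phys_iter iter;
	int i = 0;

	ib_umem_start_phys_iter(umem, &iter, drv_supported_pgsz);
	while (ib_umem_next_phys_iter(umem, &iter)) {
		/* iter.phyaddr is aligned to 1 << iter.pg_bit; iter.offset
		 * is non-zero only for the first block of the MR.
		 */
		pages[i++] = iter.phyaddr;
	}

	return i;
}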

Comments

Jason Gunthorpe Feb. 20, 2019, 4 a.m. UTC | #1
On Tue, Feb 19, 2019 at 08:57:42AM -0600, Shiraz Saleem wrote:
>  
> +struct sg_phys_iter {

This should have umem in the name

> +	struct scatterlist *sg;
> +	unsigned long phyaddr;
> +	unsigned long len;
> +	unsigned long offset;
> +	unsigned long supported_pgsz;
> +	unsigned long remaining;
> +	unsigned int pg_bit;
> +	u8 mixed_pg_support;

This should also be clear if it is a *dma* or *page* iterator, and
which fields are public/private.

Jason
Leon Romanovsky Feb. 20, 2019, 2:55 p.m. UTC | #2
On Tue, Feb 19, 2019 at 08:57:42AM -0600, Shiraz Saleem wrote:
> This helper iterates over the SG list and returns contiguous
> memory blocks aligned to a HW supported page size.
> The implementation is intended to work for HW that supports
> single page sizes or mixed page sizes in an MR. Drivers
> can use this helper to retrieve the DMA addresses aligned
> to their best supported page size.
>
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> ---
>  drivers/infiniband/core/umem.c | 90 ++++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h         | 23 +++++++++++
>  2 files changed, 113 insertions(+)
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index d3b572e..8c3ec1b 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -214,6 +214,96 @@ unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
>  }
>  EXPORT_SYMBOL(ib_umem_find_single_pg_size);
>
> +static unsigned int ib_umem_find_mixed_pg_bit(struct scatterlist *sgl_head,
> +					      struct sg_phys_iter *sg_phys_iter)
> +{
> +	unsigned long dma_addr_start, dma_addr_end;
> +
> +	dma_addr_start = sg_dma_address(sg_phys_iter->sg);
> +	dma_addr_end = sg_dma_address(sg_phys_iter->sg) +
> +		       sg_dma_len(sg_phys_iter->sg);
> +
> +	if (sg_phys_iter->sg == sgl_head)
> +		return ib_umem_find_pg_bit(dma_addr_end,
> +					   sg_phys_iter->supported_pgsz);
> +	else if (sg_is_last(sg_phys_iter->sg))
> +		return ib_umem_find_pg_bit(dma_addr_start,
> +					   sg_phys_iter->supported_pgsz);
> +	else
> +		return ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
> +					   sg_phys_iter->supported_pgsz);

It is very confusing, at least for me, the usage of first argument of
ib_umem_find_pg_bit(). In previous patch, you wrote that it is
phys address, but here you are mixing physical and DMA addresses.

Thanks
Shiraz Saleem Feb. 23, 2019, 6:47 p.m. UTC | #3
On Tue, Feb 19, 2019 at 09:00:39PM -0700, Jason Gunthorpe wrote:
> On Tue, Feb 19, 2019 at 08:57:42AM -0600, Shiraz Saleem wrote:
> >  
> > +struct sg_phys_iter {
> 
> This should have umem in the name
> 
> > +	struct scatterlist *sg;
> > +	unsigned long phyaddr;
> > +	unsigned long len;
> > +	unsigned long offset;
> > +	unsigned long supported_pgsz;
> > +	unsigned long remaining;
> > +	unsigned int pg_bit;
> > +	u8 mixed_pg_support;
> 
> This should also be clear if it is a *dma* or *page* iterator, and
> which fields are public/private.
> 

Sure. I'll update in the next rev.
Shiraz Saleem Feb. 23, 2019, 6:53 p.m. UTC | #4
On Wed, Feb 20, 2019 at 04:55:39PM +0200, Leon Romanovsky wrote:
> On Tue, Feb 19, 2019 at 08:57:42AM -0600, Shiraz Saleem wrote:
> > This helper iterates over the SG list and returns contiguous
> > memory blocks aligned to a HW supported page size.
> > The implementation is intended to work for HW that supports
> > single page sizes or mixed page sizes in an MR. Drivers
> > can use this helper to retrieve the DMA addresses aligned
> > to their best supported page size.
> >
> > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > ---
> >  drivers/infiniband/core/umem.c | 90 ++++++++++++++++++++++++++++++++++++++++++
> >  include/rdma/ib_umem.h         | 23 +++++++++++
> >  2 files changed, 113 insertions(+)
> >
> > diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> > index d3b572e..8c3ec1b 100644
> > --- a/drivers/infiniband/core/umem.c
> > +++ b/drivers/infiniband/core/umem.c
> > @@ -214,6 +214,96 @@ unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
> >  }
> >  EXPORT_SYMBOL(ib_umem_find_single_pg_size);
> >
> > +static unsigned int ib_umem_find_mixed_pg_bit(struct scatterlist *sgl_head,
> > +					      struct sg_phys_iter *sg_phys_iter)
> > +{
> > +	unsigned long dma_addr_start, dma_addr_end;
> > +
> > +	dma_addr_start = sg_dma_address(sg_phys_iter->sg);
> > +	dma_addr_end = sg_dma_address(sg_phys_iter->sg) +
> > +		       sg_dma_len(sg_phys_iter->sg);
> > +
> > +	if (sg_phys_iter->sg == sgl_head)
> > +		return ib_umem_find_pg_bit(dma_addr_end,
> > +					   sg_phys_iter->supported_pgsz);
> > +	else if (sg_is_last(sg_phys_iter->sg))
> > +		return ib_umem_find_pg_bit(dma_addr_start,
> > +					   sg_phys_iter->supported_pgsz);
> > +	else
> > +		return ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
> > +					   sg_phys_iter->supported_pgsz);
> 
> It is very confusing, at least for me, the usage of first argument of
> ib_umem_find_pg_bit(). In previous patch, you wrote that it is
> phys address, but here you are mixing physical and DMA addresses.
> 

The helper ib_umem_find_pg_bit() really is generic enough to work with any addr. I should, however,
not tie it to any particular type in the description.

Shiraz
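
The address type does not matter to a helper like this because only the
low-order alignment bits of the address and the bitmask of supported sizes
feed into the result. A rough, purely illustrative sketch of that kind of
computation (not the code from patch 2/6) could look like:

static unsigned int example_find_pg_bit(unsigned long addr,
					unsigned long supported_pgsz)
{
	/* Alignment of addr as a power of two; treat 0 as fully aligned. */
	unsigned long align = addr ? (addr & -addr) : ULONG_MAX;
	/* Supported page sizes that addr is already aligned to. */
	unsigned long fits = supported_pgsz & (align | (align - 1));

	/* Largest fitting size, else fall back to the smallest supported. */
	return fits ? fls64(fits) - 1 : __ffs(supported_pgsz);
}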
Gal Pressman March 10, 2019, 12:59 p.m. UTC | #5
On 19-Feb-19 16:57, Shiraz Saleem wrote:
> This helper iterates over the SG list and returns contiguous
> memory blocks aligned to a HW supported page size.
> The implementation is intended to work for HW that supports
> single page sizes or mixed page sizes in an MR. Drivers
> can use this helper to retrieve the DMA addresses aligned
> to their best supported page size.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>

Hi Shiraz,
General question about this patch.
I'm going to need a very similar iterator API, except it shouldn't go through an
ib_umem object. Instead, a scatterlist would be given explicitly as a parameter.

Do you think there's a way to make this code more generic and then use it for
umem as a specific use case? I'll be glad to help with any changes needed.

Thanks,
Gal
Shiraz Saleem March 12, 2019, 8:34 p.m. UTC | #6
>Subject: Re: [PATCH rdma-next v1 3/6] RDMA/umem: Add API to return aligned
>memory blocks from SGL
>
>On 19-Feb-19 16:57, Shiraz Saleem wrote:
>> This helper iterates over the SG list and returns contiguous memory
>> blocks aligned to a HW supported page size.
>> The implementation is intended to work for HW that supports single page
>> sizes or mixed page sizes in an MR. Drivers can use this helper to
>> retrieve the DMA addresses aligned to their best supported page size.
>>
>> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
>> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
>> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>
>Hi Shiraz,
>General question about this patch.
>I'm going to need a very similar iterator API, except it shouldn't go through an
>ib_umem object. Instead, a scatterlist would be given explicitly as a parameter.
>
>Do you think there's a way to make this code more generic and then use it for
>umem as a specific use case? I'll be glad to help with any changes needed.
>

I think so. And this is part of my exploration for the next rev.

Presuming you want the iterator on a DMA mapped SGL too? 
What do you intend to use it for? Are you looking to the helper to
break the SGEs to provide EFA_PAGE_SIZE aligned dma addr when
PAGE_SIZE > EFA_PAGE_SIZE?

Shiraz
Gal Pressman March 13, 2019, 7:07 a.m. UTC | #7
On 12-Mar-19 22:34, Saleem, Shiraz wrote:
>> Subject: Re: [PATCH rdma-next v1 3/6] RDMA/umem: Add API to return aligned
>> memory blocks from SGL
>>
>> On 19-Feb-19 16:57, Shiraz Saleem wrote:
>>> This helper iterates over the SG list and returns contiguous memory
>>> blocks aligned to a HW supported page size.
>>> The implementation is intended to work for HW that supports single page
>>> sizes or mixed page sizes in an MR. Drivers can use this helper to
>>> retrieve the DMA addresses aligned to their best supported page size.
>>>
>>> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
>>> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
>>> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>>
>> Hi Shiraz,
>> General question about this patch.
>> I'm going to need a very similar iterator API, except it shouldn't go through an
>> ib_umem object. Instead, a scatterlist would be given explicitly as a parameter.
>>
>> Do you think there's a way to make this code more generic and then use it for
>> umem as a specific use case? I'll be glad to help with any changes needed.
>>
> 
> I think so. And this is part of my exploration for the next rev.
> 
> Presuming you want the iterator on a DMA mapped SGL too? 
> What do you intend to use it for? Are you looking to the helper to
> break the SGEs to provide EFA_PAGE_SIZE aligned dma addr when
> PAGE_SIZE > EFA_PAGE_SIZE?

Exactly.
I currently have a hard-coded loop in my code that iterates over the mapped SG list
in EFA_PAGE_SIZE (4kB) aligned strides; your new API is exactly what EFA needs.

Let me know if I can help in any way and please CC me on your next submission :).

Thanks!
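
The kind of hard-coded loop described above, walking a DMA-mapped SG list in
fixed 4kB strides, might look roughly like the following sketch (purely
illustrative, not EFA's actual code; the function name and page array are
assumptions):

static unsigned int example_fill_pages(struct scatterlist *sgl, int nents,
				       u64 *pages)
{
	struct scatterlist *sg;
	unsigned int i, off, n = 0;

	/* Emit the DMA address of each 4kB chunk of every SGE. */
	for_each_sg(sgl, sg, nents, i)
		for (off = 0; off < sg_dma_len(sg); off += SZ_4K)
			pages[n++] = sg_dma_address(sg) + off;

	return n;
}

The proposed ib_umem_next_phys_iter() API would replace such a loop and would
also allow larger strides when the device supports bigger page sizes.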

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index d3b572e..8c3ec1b 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -214,6 +214,96 @@  unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
 }
 EXPORT_SYMBOL(ib_umem_find_single_pg_size);
 
+static unsigned int ib_umem_find_mixed_pg_bit(struct scatterlist *sgl_head,
+					      struct sg_phys_iter *sg_phys_iter)
+{
+	unsigned long dma_addr_start, dma_addr_end;
+
+	dma_addr_start = sg_dma_address(sg_phys_iter->sg);
+	dma_addr_end = sg_dma_address(sg_phys_iter->sg) +
+		       sg_dma_len(sg_phys_iter->sg);
+
+	if (sg_phys_iter->sg == sgl_head)
+		return ib_umem_find_pg_bit(dma_addr_end,
+					   sg_phys_iter->supported_pgsz);
+	else if (sg_is_last(sg_phys_iter->sg))
+		return ib_umem_find_pg_bit(dma_addr_start,
+					   sg_phys_iter->supported_pgsz);
+	else
+		return ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
+					   sg_phys_iter->supported_pgsz);
+}
+void ib_umem_start_phys_iter(struct ib_umem *umem,
+			     struct sg_phys_iter *sg_phys_iter,
+			     unsigned long supported_pgsz)
+{
+	memset(sg_phys_iter, 0, sizeof(struct sg_phys_iter));
+	sg_phys_iter->sg = umem->sg_head.sgl;
+	sg_phys_iter->supported_pgsz = supported_pgsz;
+
+	/* Single page support in MR */
+	if (hweight_long(supported_pgsz) == 1)
+		sg_phys_iter->pg_bit = fls64(supported_pgsz) - 1;
+	else
+		sg_phys_iter->mixed_pg_support = true;
+}
+EXPORT_SYMBOL(ib_umem_start_phys_iter);
+
+/**
+ * ib_umem_next_phys_iter - Iterate SG entries in aligned memory blocks
+ * @umem: umem struct
+ * @sg_phys_iter: SG phys iterator
+ * @supported_pgsz: bitmask of HW supported page sizes
+ *
+ * This helper iterates over the SG list and returns memory
+ * blocks aligned to a HW supported page size.
+ *
+ * Each true result returns a contiguous aligned memory block
+ * such that:
+ * - pg_bit indicates the alignment of this block such that
+ *   phyaddr & ((1 << pg_bit) - 1) == 0
+ * - All blocks except the starting block have a zero offset.
+ *   For the starting block, offset indicates the first valid byte
+ *   in the MR; HW should not permit access to bytes earlier than offset.
+ * - For all blocks except the last, len + offset equals 1 << pg_bit.
+ *
+ * False is returned when the iteration is complete and all blocks have been seen.
+ *
+ */
+bool ib_umem_next_phys_iter(struct ib_umem *umem,
+			    struct sg_phys_iter *sg_phys_iter)
+{
+	unsigned long pg_mask;
+
+	if (!sg_phys_iter->supported_pgsz || !sg_phys_iter->sg)
+		return false;
+
+	if (sg_phys_iter->remaining) {
+		sg_phys_iter->phyaddr += (sg_phys_iter->len + sg_phys_iter->offset);
+	} else {
+		sg_phys_iter->phyaddr = sg_dma_address(sg_phys_iter->sg);
+		sg_phys_iter->remaining = sg_dma_len(sg_phys_iter->sg);
+	}
+
+	/* Mixed page support in MR */
+	if (sg_phys_iter->mixed_pg_support)
+		sg_phys_iter->pg_bit = ib_umem_find_mixed_pg_bit(umem->sg_head.sgl,
+								 sg_phys_iter);
+
+	pg_mask = ~(BIT_ULL(sg_phys_iter->pg_bit) - 1);
+
+	sg_phys_iter->offset = sg_phys_iter->phyaddr & ~pg_mask;
+	sg_phys_iter->phyaddr = sg_phys_iter->phyaddr & pg_mask;
+	sg_phys_iter->len = min_t(unsigned long, sg_phys_iter->remaining,
+			BIT_ULL(sg_phys_iter->pg_bit) - sg_phys_iter->offset);
+	sg_phys_iter->remaining -= sg_phys_iter->len;
+	if (!sg_phys_iter->remaining)
+		sg_phys_iter->sg = sg_next(sg_phys_iter->sg);
+
+	return true;
+}
+EXPORT_SYMBOL(ib_umem_next_phys_iter);
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 4e186a3..49bd444 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -57,6 +57,17 @@  struct ib_umem {
 	int npages;
 };
 
+struct sg_phys_iter {
+	struct scatterlist *sg;
+	unsigned long phyaddr;
+	unsigned long len;
+	unsigned long offset;
+	unsigned long supported_pgsz;
+	unsigned long remaining;
+	unsigned int pg_bit;
+	u8 mixed_pg_support;
+};
+
 /* Returns the offset of the umem start relative to the first page. */
 static inline int ib_umem_offset(struct ib_umem *umem)
 {
@@ -91,6 +102,11 @@  int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
 					  unsigned long supported_pgsz,
 					  unsigned long uvirt_addr);
+void ib_umem_start_phys_iter(struct ib_umem *umem,
+			     struct sg_phys_iter *sg_phys_iter,
+			     unsigned long supported_pgsz);
+bool ib_umem_next_phys_iter(struct ib_umem *umem,
+			    struct sg_phys_iter *sg_phys_iter);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -113,6 +129,13 @@  static inline int ib_umem_find_single_pg_size(struct ib_umem *umem,
 					      unsigned long uvirt_addr) {
 	return -EINVAL;
 }
+static inline void ib_umem_start_phys_iter(struct ib_umem *umem,
+					   struct sg_phys_iter *sg_phys_iter,
+					   unsigned long supported_pgsz) { }
+static inline bool ib_umem_next_phys_iter(struct ib_umem *umem,
+					  struct sg_phys_iter *sg_phys_iter) {
+	return false;
+}
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */