[RFC,2/2] IB/mlx5: Implement Fast Indirect Memory Registration Feature

Message ID 1412693281-6161-3-git-send-email-sagig@mellanox.com (mailing list archive)
State Rejected

Commit Message

Sagi Grimberg Oct. 7, 2014, 2:48 p.m. UTC
This patch implements:
- ib_alloc/free_indir_reg_list() routines
- ib_create_mr() extension for IB_MR_INDIRECT_REG
- ib_post_send() extension for IB_WR_REG_INDIR_MR
  and work completion of IB_WC_REG_INDIR_MR
- Expose mlx5 indirect registration device capabilities

* Nit change in mr_align() static routine to handle void*
instead of __be64.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c      |    2 +
 drivers/infiniband/hw/mlx5/main.c    |    4 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   20 +++++++
 drivers/infiniband/hw/mlx5/mr.c      |   70 ++++++++++++++++++++++-
 drivers/infiniband/hw/mlx5/qp.c      |  104 ++++++++++++++++++++++++++++++++++
 5 files changed, 198 insertions(+), 2 deletions(-)
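
For orientation, here is a minimal sketch of how a ULP might drive the
proposed verbs. This is hypothetical consumer code, not part of the patch:
the wr.wr.indir_reg field names follow this RFC, while pd, qp, device, sg,
nfrags, iova and total_len are assumed to exist in the caller, and error
handling is omitted:

        struct ib_mr_init_attr mr_attr = {
                .max_reg_descriptors = 32,
                .flags = IB_MR_INDIRECT_REG,
        };
        struct ib_indir_reg_list *irl;
        struct ib_send_wr wr, *bad_wr;
        struct ib_mr *mr;
        int i, ret;

        mr = ib_create_mr(pd, &mr_attr);
        irl = ib_alloc_indir_reg_list(device, 32);

        /* Arbitrarily aligned fragments; one ib_sge per fragment. */
        for (i = 0; i < nfrags; i++)
                irl->sg_list[i] = sg[i];

        memset(&wr, 0, sizeof(wr));
        wr.opcode = IB_WR_REG_INDIR_MR;
        wr.wr.indir_reg.mkey = mr->lkey;
        wr.wr.indir_reg.indir_list = irl;
        wr.wr.indir_reg.indir_list_len = nfrags;
        wr.wr.indir_reg.iova_start = iova;
        wr.wr.indir_reg.length = total_len;
        wr.wr.indir_reg.access_flags = IB_ACCESS_LOCAL_WRITE;
        ret = ib_post_send(qp, &wr, &bad_wr);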

Comments

Or Gerlitz Oct. 12, 2014, 7:39 p.m. UTC | #1
On 10/7/2014 4:48 PM, Sagi Grimberg wrote:
>
> * Nit change in mr_align() static routine to handle void*
> instead of __be64.

nit comment... any reason not to put this in a different patch, unrelated
to this series?

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c


> +static void set_indir_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
> +				  struct ib_send_wr *wr)
> +{
> +	u64 mask;
> +	u32 list_len = wr->wr.indir_reg.indir_list_len;
> +
> +	memset(umr, 0, sizeof(*umr));
> +
> +	umr->klm_octowords = get_klm_octo(list_len * 2);
> +	mask = MLX5_MKEY_MASK_LEN		|
> +		MLX5_MKEY_MASK_PAGE_SIZE	|
> +		MLX5_MKEY_MASK_START_ADDR	|
> +		MLX5_MKEY_MASK_EN_RINVAL	|
> +		MLX5_MKEY_MASK_KEY		|
> +		MLX5_MKEY_MASK_LR		|
> +		MLX5_MKEY_MASK_LW		|
> +		MLX5_MKEY_MASK_RR		|
> +		MLX5_MKEY_MASK_RW		|
> +		MLX5_MKEY_MASK_A		|
> +		MLX5_MKEY_MASK_FREE;
> +
> +	umr->mkey_mask = cpu_to_be64(mask);
> +}

here you basically replicate the majority of the code from 
set_reg_umr_segment - share the common part...


Sagi Grimberg Oct. 13, 2014, 8:32 a.m. UTC | #2
On 10/12/2014 10:39 PM, Or Gerlitz wrote:
>
> On 10/7/2014 4:48 PM, Sagi Grimberg wrote:
>>
>> * Nit change in mr_align() static routine to handle void*
>> instead of __be64.
>
> nit comment... any reason not to put this in a different patch, unrelated
> to this series?

I can, but there is no use for this outside of this patch's context.

>
> diff --git a/drivers/infiniband/hw/mlx5/qp.c
> b/drivers/infiniband/hw/mlx5/qp.c
> --- a/drivers/infiniband/hw/mlx5/qp.c
> +++ b/drivers/infiniband/hw/mlx5/qp.c
>
>
>> +static void set_indir_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
>> +                  struct ib_send_wr *wr)
>> +{
>> +    u64 mask;
>> +    u32 list_len = wr->wr.indir_reg.indir_list_len;
>> +
>> +    memset(umr, 0, sizeof(*umr));
>> +
>> +    umr->klm_octowords = get_klm_octo(list_len * 2);
>> +    mask = MLX5_MKEY_MASK_LEN        |
>> +        MLX5_MKEY_MASK_PAGE_SIZE    |
>> +        MLX5_MKEY_MASK_START_ADDR    |
>> +        MLX5_MKEY_MASK_EN_RINVAL    |
>> +        MLX5_MKEY_MASK_KEY        |
>> +        MLX5_MKEY_MASK_LR        |
>> +        MLX5_MKEY_MASK_LW        |
>> +        MLX5_MKEY_MASK_RR        |
>> +        MLX5_MKEY_MASK_RW        |
>> +        MLX5_MKEY_MASK_A        |
>> +        MLX5_MKEY_MASK_FREE;
>> +
>> +    umr->mkey_mask = cpu_to_be64(mask);
>> +}
>
> here you basically replicate the majority of the code from
> set_reg_umr_segment - share the common part...

I've thought about it, but I know Eli prefers not to centralize the
umr control segment (and mkey context) setting routines at this point.
I am not against doing it at all, but let's let Eli comment here.

Eli?

Sagi.
Or Gerlitz Oct. 13, 2014, 12:57 p.m. UTC | #3
On 10/13/2014 11:32 AM, Sagi Grimberg wrote:
> mask = MLX5_MKEY_MASK_LEN        |
> +        MLX5_MKEY_MASK_PAGE_SIZE    |
> +        MLX5_MKEY_MASK_START_ADDR    |
> +        MLX5_MKEY_MASK_EN_RINVAL    |
> +        MLX5_MKEY_MASK_KEY        |
> +        MLX5_MKEY_MASK_LR        |
> +        MLX5_MKEY_MASK_LW        |
> +        MLX5_MKEY_MASK_RR        |
> +        MLX5_MKEY_MASK_RW        |
> +        MLX5_MKEY_MASK_A        |
> +        MLX5_MKEY_MASK_FREE; 
At least let's not repeat this mask = MLX5_MKEY_MASK_YYY | [...] exactly
twice. BTW, I think Eli is OOO this week, and I'm not sure we want to stop
this train over this small suggestion I made here.
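
For illustration, the duplication could be factored out along these lines
(a sketch: the macro name is hypothetical, and it assumes both call sites
really want all of the shared bits):

        #define MLX5_MKEY_MASK_UMR_COMMON (MLX5_MKEY_MASK_LEN        | \
                                           MLX5_MKEY_MASK_START_ADDR | \
                                           MLX5_MKEY_MASK_KEY        | \
                                           MLX5_MKEY_MASK_LR         | \
                                           MLX5_MKEY_MASK_LW         | \
                                           MLX5_MKEY_MASK_RR         | \
                                           MLX5_MKEY_MASK_RW         | \
                                           MLX5_MKEY_MASK_FREE)

        static void set_indir_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
                                          struct ib_send_wr *wr)
        {
                u32 list_len = wr->wr.indir_reg.indir_list_len;

                memset(umr, 0, sizeof(*umr));
                umr->klm_octowords = get_klm_octo(list_len * 2);
                /* OR in only the bits specific to indirect registration. */
                umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_UMR_COMMON |
                                             MLX5_MKEY_MASK_PAGE_SIZE  |
                                             MLX5_MKEY_MASK_EN_RINVAL  |
                                             MLX5_MKEY_MASK_A);
        }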

Or Gerlitz Oct. 13, 2014, 1 p.m. UTC | #4
On 10/13/2014 11:32 AM, Sagi Grimberg wrote:
> On 10/12/2014 10:39 PM, Or Gerlitz wrote:
>>
>> On 10/7/2014 4:48 PM, Sagi Grimberg wrote:
>>>
>>> * Nit change in mr_align() static routine to handle void*
>>> instead of __be64.
>>
>> nit comment... any reason not to put this in a different patch, unrelated
>> to this series?
>
> I can, but there is no use for this outside of this patch's context.
If you really want it to be this way, let it be.
Bart Van Assche Oct. 14, 2014, 5:41 a.m. UTC | #5
On 10/07/14 16:48, Sagi Grimberg wrote:
> -static __be64 *mr_align(__be64 *ptr, int align)
> +static void *mr_align(void *ptr, int align)
>   {
>   	unsigned long mask = align - 1;
>
> -	return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
> +	return (void *)(((unsigned long)ptr + mask) & ~mask);
>   }

Maybe I'm missing something, but why doesn't this function use the 
ALIGN() macro defined in <linux/kernel.h> ? Are you aware that a type 
with name uintptr_t has been defined in <linux/types.h> that is intended 
for casting a pointer to an int ?
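
For what it's worth, a sketch of the helper on top of those stock
definitions (ALIGN() requires a power-of-two alignment, which
MLX5_UMR_ALIGN is):

        static void *mr_align(void *ptr, int align)
        {
                /* PTR_ALIGN() from <linux/kernel.h> would work as well. */
                return (void *)ALIGN((uintptr_t)ptr, align);
        }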

 > +struct ib_indir_reg_list *
 > +mlx5_ib_alloc_indir_reg_list(struct ib_device *device,
 > +			     unsigned int max_indir_list_len)

There are three k[zc]alloc() calls in this function. Can these be 
combined into one without increasing the chance too much that this will 
increase the order of the allocation ?

 > +	dsize = sizeof(*mirl->klms) * max_indir_list_len;
 > +	mirl->mapped_ilist = kzalloc(dsize + MLX5_UMR_ALIGN - 1,
 > +				      GFP_KERNEL);

The value of the constant MLX5_UMR_ALIGN is 2048. Does that mean this 
code will always allocate 2 KB more than needed ? Is this really the 
minimum alignment required for this data structure ? How likely is it 
that this code will trigger a higher order allocation ?

 > +	if (unlikely((*seg == qp->sq.qend)))

A minor comment: one pair of parentheses can be left out from the above 
code.

 > +					mlx5_ib_warn(dev, "\n");

Will the warning generated by this code allow a user to find out which 
code triggered that warning ?

Bart.

Sagi Grimberg Oct. 14, 2014, 10:50 a.m. UTC | #6
On 10/14/2014 8:41 AM, Bart Van Assche wrote:
> On 10/07/14 16:48, Sagi Grimberg wrote:
>> -static __be64 *mr_align(__be64 *ptr, int align)
>> +static void *mr_align(void *ptr, int align)
>>   {
>>       unsigned long mask = align - 1;
>>
>> -    return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
>> +    return (void *)(((unsigned long)ptr + mask) & ~mask);
>>   }
>
> Maybe I'm missing something, but why doesn't this function use the
> ALIGN() macro defined in <linux/kernel.h> ? Are you aware that a type
> with name uintptr_t has been defined in <linux/types.h> that is intended
> for casting a pointer to an int ?
>
>  > +struct ib_indir_reg_list *
>  > +mlx5_ib_alloc_indir_reg_list(struct ib_device *device,
>  > +                 unsigned int max_indir_list_len)
>
> There are three k[zc]alloc() calls in this function. Can these be
> combined into one without increasing the chance too much that this will
> increase the order of the allocation ?
>
>  > +    dsize = sizeof(*mirl->klms) * max_indir_list_len;
>  > +    mirl->mapped_ilist = kzalloc(dsize + MLX5_UMR_ALIGN - 1,
>  > +                      GFP_KERNEL);
>
> The value of the constant MLX5_UMR_ALIGN is 2048. Does that mean this
> code will always allocate 2 KB more than needed ? Is this really the
> minimum alignment required for this data structure ? How likely is it
> that this code will trigger a higher order allocation ?
>
>  > +    if (unlikely((*seg == qp->sq.qend)))
>
> A minor comment: one pair of parentheses can be left out from the above
> code.
>
>  > +                    mlx5_ib_warn(dev, "\n");
>
> Will the warning generated by this code allow a user to find out which
> code triggered that warning ?
>
> Bart.
>

Hey Bart,

Thanks for the feedback!
Unfortunately I'm caught up in other things at the moment, but I'll
address the comments soon enough... And also I still need to review
your v2 of SRP multichannel. I promise to do it ASAP.

Sagi.
Sagi Grimberg Oct. 19, 2014, 7:34 p.m. UTC | #7
On 10/14/2014 8:41 AM, Bart Van Assche wrote:
> On 10/07/14 16:48, Sagi Grimberg wrote:
>> -static __be64 *mr_align(__be64 *ptr, int align)
>> +static void *mr_align(void *ptr, int align)
>>   {
>>       unsigned long mask = align - 1;
>>
>> -    return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
>> +    return (void *)(((unsigned long)ptr + mask) & ~mask);
>>   }
>
> Maybe I'm missing something, but why doesn't this function use the
> ALIGN() macro defined in <linux/kernel.h> ? Are you aware that a type
> with name uintptr_t has been defined in <linux/types.h> that is intended
> for casting a pointer to an int ?

Heh, you are right, obviously.

I'll fix that to use ALIGN(ptr, align).

>
>  > +struct ib_indir_reg_list *
>  > +mlx5_ib_alloc_indir_reg_list(struct ib_device *device,
>  > +                 unsigned int max_indir_list_len)
>
> There are three k[zc]alloc() calls in this function. Can these be
> combined into one without increasing the chance too much that this will
> increase the order of the allocation ?

Well, I have 3 allocations:
1. mlx5_indir_reg_list container
2. sg_list: buffer of sg elements for the user to pass
3. mapped_ilist: buffer for HW representation of sg elements.

mapped_ilist buffer has some constraints on it (see below), so I can't
combine it with the other two. I assume I can rearrange the structure to
combine the first 2 allocations together, but it can trigger a higher-order
allocation (unless I use vzalloc, but I'm not sure what the benefit is
here...).

>
>  > +    dsize = sizeof(*mirl->klms) * max_indir_list_len;
>  > +    mirl->mapped_ilist = kzalloc(dsize + MLX5_UMR_ALIGN - 1,
>  > +                      GFP_KERNEL);
>
> The value of the constant MLX5_UMR_ALIGN is 2048. Does that mean this
> code will always allocate 2 KB more than needed ? Is this really the
> minimum alignment required for this data structure ? How likely is it
> that this code will trigger a higher order allocation ?

Well, here I need a variable-size physically contiguous allocation with
a start address aligned to 2K (HW requirement). So yes, with this code,
if the user allocates more than PAGE_SIZE-2K worth of ib_sges, a
higher-order allocation will be triggered.

I allocate 2K more to ensure I can align to 2K, then I set mirl->klms to
point at the aligned address. Any idea how I can do this without
allocating 2K more?

>
>  > +    if (unlikely((*seg == qp->sq.qend)))
>
> A minor comment: one pair of parentheses can be left out from the above
> code.

Thanks, will fix.

>
>  > +                    mlx5_ib_warn(dev, "\n");
>
> Will the warning generated by this code allow a user to find out which
> code triggered that warning ?

Not at all - and it should be fixed. Will do.

Sagi.
Bart Van Assche Oct. 20, 2014, 7:46 a.m. UTC | #8
On 10/19/14 21:34, Sagi Grimberg wrote:
> On 10/14/2014 8:41 AM, Bart Van Assche wrote:
>>  > +struct ib_indir_reg_list *
>>  > +mlx5_ib_alloc_indir_reg_list(struct ib_device *device,
>>  > +                 unsigned int max_indir_list_len)
>>
>> There are three k[zc]alloc() calls in this function. Can these be
>> combined into one without increasing the chance too much that this will
>> increase the order of the allocation ?
>
> Well, I have 3 allocations:
> 1. mlx5_indir_reg_list container
> 2. sg_list: buffer of sg elements for the user to pass
> 3. mapped_ilist: buffer for HW representation of sg elements.
>
> mapped_ilist buffer has some constraints on it (see below), so I can't
> combine it with the other two. I assume I can rearrange the structure to
> combine the first 2 allocations together, but it can trigger a higher-order
> allocation (unless I use vzalloc, but I'm not sure what the benefit is
> here...).

Since vmalloc() is about 1000 times slower than kmalloc(), I think we
should try to avoid vmalloc() and its variants. Combining multiple
kmalloc() calls into a single kmalloc() call helps performance and
reduces the number of error paths. But since the time needed for a
single kmalloc() call is only a few microseconds, combining multiple
kmalloc() calls reduces performance if it increases the allocation
order from one page to multiple pages.
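
For illustration, folding allocations 1 and 2 from Sagi's list into one
kzalloc() might look like this (a sketch against the structures in this
patch, assuming the sg_list array may simply trail the container):

        struct mlx5_ib_indir_reg_list *mirl;

        /* One allocation for the container plus the trailing sg_list
         * array; mapped_ilist keeps its own allocation because of the
         * 2K alignment constraint discussed below. */
        mirl = kzalloc(sizeof(*mirl) +
                       max_indir_list_len * sizeof(*mirl->ib_irl.sg_list),
                       GFP_KERNEL);
        if (!mirl)
                return ERR_PTR(-ENOMEM);
        mirl->ib_irl.sg_list = (struct ib_sge *)(mirl + 1);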

>>  > +    dsize = sizeof(*mirl->klms) * max_indir_list_len;
>>  > +    mirl->mapped_ilist = kzalloc(dsize + MLX5_UMR_ALIGN - 1,
>>  > +                      GFP_KERNEL);
>>
>> The value of the constant MLX5_UMR_ALIGN is 2048. Does that mean this
>> code will always allocate 2 KB more than needed ? Is this really the
>> minimum alignment required for this data structure ? How likely is it
>> that this code will trigger a higher order allocation ?
>
> Well, here I need a variable-size physically contiguous allocation with
> a start address aligned to 2K (HW requirement). So yes, with this code,
> if the user allocates more than PAGE_SIZE-2K worth of ib_sges, a
> higher-order allocation will be triggered.
>
> I allocate 2K more to ensure I can align to 2K, then I set mirl->klms to
> point at the aligned address. Any idea how I can do this without
> allocating 2K more?

Specifying a minimum memory alignment requirement is possible with 
kmem_cache_create(). However, kmem_caches only support allocation of 
fixed size objects.

In mm/slab_common.c one can see that for SLAB and SLUB, allocations
of >= 256 bytes are guaranteed to be aligned on a boundary equal to
the smallest power of two that is larger than or equal to the
allocation size. Unfortunately this does not hold for the SLOB
allocator. So I think the only change that can be made in this code
without complicating it too much is changing the "MLX5_UMR_ALIGN - 1"
into "max(MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0)".

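Concretely, that suggestion would turn the allocation into something like
this (a sketch, untested; kzalloc() already guarantees
ARCH_KMALLOC_MINALIGN, so only the remaining padding is needed to reach a
2K boundary):

        dsize = sizeof(*mirl->klms) * max_indir_list_len;
        mirl->mapped_ilist = kzalloc(dsize +
                                     max_t(int, MLX5_UMR_ALIGN -
                                                ARCH_KMALLOC_MINALIGN, 0),
                                     GFP_KERNEL);
        if (!mirl->mapped_ilist) {
                err = -ENOMEM;
                goto err_mapped_list;
        }
        mirl->klms = mr_align(mirl->mapped_ilist, MLX5_UMR_ALIGN);
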
Bart.
Sagi Grimberg Oct. 21, 2014, 9:32 a.m. UTC | #9
On 10/20/2014 10:46 AM, Bart Van Assche wrote:
> On 10/19/14 21:34, Sagi Grimberg wrote:
>> On 10/14/2014 8:41 AM, Bart Van Assche wrote:
>>>  > +struct ib_indir_reg_list *
>>>  > +mlx5_ib_alloc_indir_reg_list(struct ib_device *device,
>>>  > +                 unsigned int max_indir_list_len)
>>>
>>> There are three k[zc]alloc() calls in this function. Can these be
>>> combined into one without increasing the chance too much that this will
>>> increase the order of the allocation ?
>>
>> Well, I have 3 allocations:
>> 1. mlx5_indir_reg_list container
>> 2. sg_list: buffer of sg elements for the user to pass
>> 3. mapped_ilist: buffer for HW representation of sg elements.
>>
>> mapped_ilist buffer has some constraints on it (see below), so I can't
>> combine it with the other two. I assume I can rearrange the structure to
>> combine the first 2 allocations together, but it can trigger a higher-order
>> allocation (unless I use vzalloc, but I'm not sure what the benefit is
>> here...).
>
> Since vmalloc() is about 1000 times slower than kmalloc(), I think we
> should try to avoid vmalloc() and its variants. Combining multiple
> kmalloc() calls into a single kmalloc() call helps performance and
> reduces the number of error paths. But since the time needed for a
> single kmalloc() call is only a few microseconds, combining multiple
> kmalloc() calls reduces performance if it increases the allocation
> order from one page to multiple pages.

Right, so I think keeping these separate is OK. Agreed?

>
>>>  > +    dsize = sizeof(*mirl->klms) * max_indir_list_len;
>>>  > +    mirl->mapped_ilist = kzalloc(dsize + MLX5_UMR_ALIGN - 1,
>>>  > +                      GFP_KERNEL);
>>>
>>> The value of the constant MLX5_UMR_ALIGN is 2048. Does that mean this
>>> code will always allocate 2 KB more than needed ? Is this really the
>>> minimum alignment required for this data structure ? How likely is it
>>> that this code will trigger a higher order allocation ?
>>
>> Well, here I need a variable-size physically contiguous allocation with
>> a start address aligned to 2K (HW requirement). So yes, with this code,
>> if the user allocates more than PAGE_SIZE-2K worth of ib_sges, a
>> higher-order allocation will be triggered.
>>
>> I allocate 2K more to ensure I can align to 2K, then I set mirl->klms to
>> point at the aligned address. Any idea how I can do this without
>> allocating 2K more?
>
> Specifying a minimum memory alignment requirement is possible with
> kmem_cache_create(). However, kmem_caches only support allocation of
> fixed size objects.
>
> In mm/slab_common.c one can see that for SLAB and SLUB, allocations
> of >= 256 bytes are guaranteed to be aligned on a boundary equal to
> the smallest power of two that is larger than or equal to the
> allocation size. Unfortunately this does not hold for the SLOB
> allocator. So I think the only change that can be made in this code
> without complicating it too much is changing the "MLX5_UMR_ALIGN - 1"
> into "max(MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0)".

I can give that a go...

Thanks!

P.S.
Since you didn't give any comments on the API itself, should I conclude
that you find it clear and agree with it?

Sagi.
Bart Van Assche Oct. 21, 2014, 10:54 a.m. UTC | #10
On 10/21/14 11:32, Sagi Grimberg wrote:
> Since you didn't give any comments on the API itself, should I conclude
> that you find it clear and agree with it?

Mostly. The only aspect I think is missing from the API is the
alignment requirement for the addresses in ib_indir_reg_list.sg_list.
Is byte alignment sufficient? If so, will all HCA vendors be able to
support this, or do we perhaps need to add a member in the device
attributes structure that declares what the alignment requirements are?

Bart.

Sagi Grimberg Oct. 21, 2014, 10:59 a.m. UTC | #11
On 10/21/2014 1:54 PM, Bart Van Assche wrote:
> On 10/21/14 11:32, Sagi Grimberg wrote:
>> Since you didn't give any comments on the API itself, should I conclude
>> that you find it clear and agree with it?
>
> Mostly. The only aspect I think is missing from the API is the
> alignment requirement for the addresses in ib_indir_reg_list.sg_list.
> Is byte alignment sufficient?

Yes.

> If so, will
> all HCA vendors be able to support this, or do we perhaps need to add a
> member in the device attributes structure that declares what the
> alignment requirements are?
>

This feature is designed to remove any alignment constraints on memory
registration. I don't see any point in having the API reintroduce such
constraints instead.

Sagi.
Eli Cohen Oct. 21, 2014, 1:12 p.m. UTC | #12
On Mon, Oct 13, 2014 at 11:32:09AM +0300, Sagi Grimberg wrote:
> >
> >>+static void set_indir_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
> >>+                  struct ib_send_wr *wr)
> >>+{
> >>+    u64 mask;
> >>+    u32 list_len = wr->wr.indir_reg.indir_list_len;
> >>+
> >>+    memset(umr, 0, sizeof(*umr));
> >>+
> >>+    umr->klm_octowords = get_klm_octo(list_len * 2);
> >>+    mask = MLX5_MKEY_MASK_LEN        |
> >>+        MLX5_MKEY_MASK_PAGE_SIZE    |
> >>+        MLX5_MKEY_MASK_START_ADDR    |
> >>+        MLX5_MKEY_MASK_EN_RINVAL    |
> >>+        MLX5_MKEY_MASK_KEY        |
> >>+        MLX5_MKEY_MASK_LR        |
> >>+        MLX5_MKEY_MASK_LW        |
> >>+        MLX5_MKEY_MASK_RR        |
> >>+        MLX5_MKEY_MASK_RW        |
> >>+        MLX5_MKEY_MASK_A        |
> >>+        MLX5_MKEY_MASK_FREE;
> >>+
> >>+    umr->mkey_mask = cpu_to_be64(mask);
> >>+}
> >
> >here you basically replicate the majority of the code from
> >set_reg_umr_segment - share the common part...
> 
> I've thought about it, but I know Eli prefers not to centralize the
> umr control segment (and mkey context) setting routines at this point.
> I am not against doing it at all, but let's let Eli comment here.
> 

We have several versions of this mask building, but they are not
identical. Note that we build the mask at compile time, so it does not
affect code size, and keeping them separate will make future changes
easier.
Eli Cohen Oct. 21, 2014, 2:20 p.m. UTC | #13
On Sun, Oct 19, 2014 at 10:34:04PM +0300, Sagi Grimberg wrote:
> >
> > > +                    mlx5_ib_warn(dev, "\n");
> >
> >Will the warning generated by this code allow a user to find out which
> >code triggered that warning ?
> 
> Not at all - and it should be fixed. Will do.
> 

This macro expands to the function name and line number using __func__
and __LINE__, so better leave it as is if you care to know what failed.
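
For reference, the macro in mlx5_ib.h is roughly of this shape (quoted
from memory, so treat it as approximate):

        #define mlx5_ib_warn(dev, format, arg...)                          \
                pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name,  \
                        __func__, __LINE__, current->pid, ##arg)

So even an empty format string identifies the failing function and line;
a short message, e.g. mlx5_ib_warn(dev, "failed to set indir reg wr\n"),
would only make the log more self-describing.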
Sagi Grimberg Oct. 21, 2014, 2:30 p.m. UTC | #14
On 10/21/2014 5:20 PM, Eli Cohen wrote:
> On Sun, Oct 19, 2014 at 10:34:04PM +0300, Sagi Grimberg wrote:
>>>
>>>> +                    mlx5_ib_warn(dev, "\n");
>>>
>>> Will the warning generated by this code allow a user to find out which
>>> code triggered that warning ?
>>
>> Not at all - and it should be fixed. Will do.
>>
>
> This macro expands to function name and line number using __func__ and
> __LINE__ so better leave it as is if you care to know what failed.
>

Yes, but we can also give a sentence describing what failed in the
function; the sources are not always available...
I don't mind modifying it.

Patch

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index e405627..7ca730c 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -111,6 +111,8 @@  static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
 	case IB_WR_FAST_REG_MR:
 		return IB_WC_FAST_REG_MR;
 
+	case IB_WR_REG_INDIR_MR:
+		return IB_WC_REG_INDIR_MR;
 	default:
 		pr_warn("unknown completion status\n");
 		return 0;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d8907b2..d834b77 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -194,6 +194,7 @@  static int mlx5_ib_query_device(struct ib_device *ibdev,
 	if (flags & MLX5_DEV_CAP_FLAG_XRC)
 		props->device_cap_flags |= IB_DEVICE_XRC;
 	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+	props->device_cap_flags |= IB_DEVICE_INDIR_REGISTRATION;
 	if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) {
 		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
 		/* At this stage no support for signature handover */
@@ -231,6 +232,7 @@  static int mlx5_ib_query_device(struct ib_device *ibdev,
 	props->max_srq_wr	   = dev->mdev->caps.max_srq_wqes - 1;
 	props->max_srq_sge	   = max_rq_sg - 1;
 	props->max_fast_reg_page_list_len = (unsigned int)-1;
+	props->max_indir_reg_mr_list_len = (unsigned int)-1;
 	props->local_ca_ack_delay  = dev->mdev->caps.local_ca_ack_delay;
 	props->atomic_cap	   = IB_ATOMIC_NONE;
 	props->masked_atomic_cap   = IB_ATOMIC_NONE;
@@ -1354,6 +1356,8 @@  static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
 	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
 	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
+	dev->ib_dev.alloc_indir_reg_list = mlx5_ib_alloc_indir_reg_list;
+	dev->ib_dev.free_indir_reg_list  = mlx5_ib_free_indir_reg_list;
 
 	if (mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC) {
 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 386780f..3b6ed0f 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -275,6 +275,13 @@  struct mlx5_ib_fast_reg_page_list {
 	dma_addr_t			map;
 };
 
+struct mlx5_ib_indir_reg_list {
+	struct ib_indir_reg_list        ib_irl;
+	void                           *mapped_ilist;
+	struct mlx5_klm                *klms;
+	dma_addr_t                      map;
+};
+
 struct mlx5_ib_umr_context {
 	enum ib_wc_status	status;
 	struct completion	done;
@@ -444,6 +451,12 @@  static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_pag
 	return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl);
 }
 
+static inline struct mlx5_ib_indir_reg_list *
+to_mindir_list(struct ib_indir_reg_list *ib_irl)
+{
+	return container_of(ib_irl, struct mlx5_ib_indir_reg_list, ib_irl);
+}
+
 struct mlx5_ib_ah {
 	struct ib_ah		ibah;
 	struct mlx5_av		av;
@@ -511,6 +524,13 @@  struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
 struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
 							       int page_list_len);
 void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+struct ib_indir_reg_list *
+mlx5_ib_alloc_indir_reg_list(struct ib_device *device,
+			     unsigned int max_indir_list_len);
+void mlx5_ib_free_indir_reg_list(struct ib_device *device,
+				 struct ib_indir_reg_list *indir_list);
+
 struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc,
 				 struct ib_fmr_attr *fmr_attr);
 int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 80b3c63..6fb7cc3 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -47,11 +47,11 @@  enum {
 	MLX5_UMR_ALIGN	= 2048
 };
 
-static __be64 *mr_align(__be64 *ptr, int align)
+static void *mr_align(void *ptr, int align)
 {
 	unsigned long mask = align - 1;
 
-	return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
+	return (void *)(((unsigned long)ptr + mask) & ~mask);
 }
 
 static int order2idx(struct mlx5_ib_dev *dev, int order)
@@ -1059,6 +1059,9 @@  struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
 		++mr->sig->sigerr_count;
 	}
 
+	if (mr_init_attr->flags & IB_MR_INDIRECT_REG)
+		access_mode = MLX5_ACCESS_MODE_KLM;
+
 	in->seg.flags = MLX5_PERM_UMR_EN | access_mode;
 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in),
 				    NULL, NULL, NULL);
@@ -1248,3 +1251,66 @@  int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 done:
 	return ret;
 }
+
+struct ib_indir_reg_list *
+mlx5_ib_alloc_indir_reg_list(struct ib_device *device,
+			     unsigned int max_indir_list_len)
+{
+	struct device *ddev = device->dma_device;
+	struct mlx5_ib_indir_reg_list *mirl = NULL;
+	int dsize;
+	int err;
+
+	mirl = kzalloc(sizeof(*mirl), GFP_KERNEL);
+	if (!mirl)
+		return ERR_PTR(-ENOMEM);
+
+	mirl->ib_irl.sg_list = kcalloc(max_indir_list_len,
+				       sizeof(*mirl->ib_irl.sg_list),
+				       GFP_KERNEL);
+	if (!mirl->ib_irl.sg_list) {
+		err = -ENOMEM;
+		goto err_sg_list;
+	}
+
+	dsize = sizeof(*mirl->klms) * max_indir_list_len;
+	mirl->mapped_ilist = kzalloc(dsize + MLX5_UMR_ALIGN - 1,
+				      GFP_KERNEL);
+	if (!mirl->mapped_ilist) {
+		err = -ENOMEM;
+		goto err_mapped_list;
+	}
+
+	mirl->klms = mr_align(mirl->mapped_ilist, MLX5_UMR_ALIGN);
+	mirl->map = dma_map_single(ddev, mirl->klms,
+				   dsize, DMA_TO_DEVICE);
+	if (dma_mapping_error(ddev, mirl->map)) {
+		err = -ENOMEM;
+		goto err_dma_map;
+	}
+
+	return &mirl->ib_irl;
+err_dma_map:
+	kfree(mirl->mapped_ilist);
+err_mapped_list:
+	kfree(mirl->ib_irl.sg_list);
+err_sg_list:
+	kfree(mirl);
+
+	return ERR_PTR(err);
+}
+
+void
+mlx5_ib_free_indir_reg_list(struct ib_device *device,
+			    struct ib_indir_reg_list *indir_list)
+{
+	struct mlx5_ib_indir_reg_list *mirl = to_mindir_list(indir_list);
+	struct device *ddev = device->dma_device;
+	int dsize;
+
+	dsize = sizeof(*mirl->klms) * indir_list->max_indir_list_len;
+	dma_unmap_single(ddev, mirl->map, dsize, DMA_TO_DEVICE);
+	kfree(mirl->mapped_ilist);
+	kfree(mirl->ib_irl.sg_list);
+	kfree(mirl);
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index d7f35e9..a9c74e6 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -65,6 +65,7 @@  static const u32 mlx5_ib_opcode[] = {
 	[IB_WR_SEND_WITH_INV]			= MLX5_OPCODE_SEND_INVAL,
 	[IB_WR_LOCAL_INV]			= MLX5_OPCODE_UMR,
 	[IB_WR_FAST_REG_MR]			= MLX5_OPCODE_UMR,
+	[IB_WR_REG_INDIR_MR]			= MLX5_OPCODE_UMR,
 	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= MLX5_OPCODE_ATOMIC_MASKED_CS,
 	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= MLX5_OPCODE_ATOMIC_MASKED_FA,
 	[MLX5_IB_WR_UMR]			= MLX5_OPCODE_UMR,
@@ -2346,6 +2347,96 @@  static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,
 	return 0;
 }
 
+static void set_indir_mkey_segment(struct mlx5_mkey_seg *seg,
+				   struct ib_send_wr *wr, u32 pdn)
+{
+	u32 list_len = wr->wr.indir_reg.indir_list_len;
+
+	memset(seg, 0, sizeof(*seg));
+
+	seg->flags = get_umr_flags(wr->wr.indir_reg.access_flags) |
+				   MLX5_ACCESS_MODE_KLM;
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
+			   mlx5_mkey_variant(wr->wr.indir_reg.mkey));
+	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | pdn);
+	seg->len = cpu_to_be64(wr->wr.indir_reg.length);
+	seg->start_addr = cpu_to_be64(wr->wr.indir_reg.iova_start);
+	seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(list_len * 2)));
+}
+
+static void set_indir_data_seg(struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+			       u32 pa_key, void **seg, int *size)
+{
+	struct mlx5_wqe_data_seg *data = *seg;
+	struct mlx5_ib_indir_reg_list *mirl;
+	struct ib_sge *sg_list = wr->wr.indir_reg.indir_list->sg_list;
+	u32 list_len = wr->wr.indir_reg.indir_list_len;
+	int i;
+
+	mirl = to_mindir_list(wr->wr.indir_reg.indir_list);
+	for (i = 0; i < list_len; i++) {
+		mirl->klms[i].va = cpu_to_be64(sg_list[i].addr);
+		mirl->klms[i].key = cpu_to_be32(sg_list[i].lkey);
+		mirl->klms[i].bcount = cpu_to_be32(sg_list[i].length);
+	}
+
+	data->byte_count = cpu_to_be32(ALIGN(sizeof(struct mlx5_klm) *
+				       list_len, 64));
+	data->lkey = cpu_to_be32(pa_key);
+	data->addr = cpu_to_be64(mirl->map);
+	*seg += sizeof(*data);
+	*size += sizeof(*data) / 16;
+}
+
+static void set_indir_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+				  struct ib_send_wr *wr)
+{
+	u64 mask;
+	u32 list_len = wr->wr.indir_reg.indir_list_len;
+
+	memset(umr, 0, sizeof(*umr));
+
+	umr->klm_octowords = get_klm_octo(list_len * 2);
+	mask = MLX5_MKEY_MASK_LEN		|
+		MLX5_MKEY_MASK_PAGE_SIZE	|
+		MLX5_MKEY_MASK_START_ADDR	|
+		MLX5_MKEY_MASK_EN_RINVAL	|
+		MLX5_MKEY_MASK_KEY		|
+		MLX5_MKEY_MASK_LR		|
+		MLX5_MKEY_MASK_LW		|
+		MLX5_MKEY_MASK_RR		|
+		MLX5_MKEY_MASK_RW		|
+		MLX5_MKEY_MASK_A		|
+		MLX5_MKEY_MASK_FREE;
+
+	umr->mkey_mask = cpu_to_be64(mask);
+}
+
+static int set_indir_reg_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+			    void **seg, int *size)
+{
+	struct mlx5_ib_pd *pd = get_pd(qp);
+
+	if (unlikely(wr->send_flags & IB_SEND_INLINE))
+		return -EINVAL;
+
+	set_indir_umr_segment(*seg, wr);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+
+	set_indir_mkey_segment(*seg, wr, pd->pdn);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+
+	set_indir_data_seg(wr, qp, pd->pa_lkey, seg, size);
+
+	return 0;
+}
+
 static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
 {
 	__be32 *p = NULL;
@@ -2557,6 +2648,19 @@  int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 				num_sge = 0;
 				break;
 
+			case IB_WR_REG_INDIR_MR:
+				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+				qp->sq.wr_data[idx] = IB_WR_REG_INDIR_MR;
+				ctrl->imm = cpu_to_be32(wr->wr.indir_reg.mkey);
+				err = set_indir_reg_wr(wr, qp, &seg, &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				num_sge = 0;
+				break;
+
 			case IB_WR_REG_SIG_MR:
 				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
 				mr = to_mmr(wr->wr.sig_handover.sig_mr);