diff mbox series

[V4,for-next,1/3] RDMA/hns: Add mtr support for mixed multihop addressing

Message ID 1559285867-29529-2-git-send-email-oulijun@huawei.com (mailing list archive)
State Superseded
Headers show
Series Add mix multihop addressing for supporting 32K | expand

Commit Message

Lijun Ou May 31, 2019, 6:57 a.m. UTC
Currently, the MTT(memory translate table) design required a buffer
space must has the same hopnum, but the hip08 hw can support mixed
hopnum config in a buffer space.

This patch adds the MTR(memory translate region) design for supporting
mixed multihop.

Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
---
V1->V2:
1. Remove the package for kernel primitives and directly use it
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  36 +++
 drivers/infiniband/hw/hns/hns_roce_hem.c    | 460 ++++++++++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_hem.h    |  14 +
 drivers/infiniband/hw/hns/hns_roce_mr.c     | 121 ++++++++
 4 files changed, 631 insertions(+)

Comments

Jason Gunthorpe June 7, 2019, 4:48 p.m. UTC | #1
On Fri, May 31, 2019 at 02:57:45PM +0800, Lijun Ou wrote:
> +
> +static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev,
> +			      struct hns_roce_mtr *mtr, dma_addr_t *bufs,
> +			      struct hns_roce_buf_region *r)
> +{
> +	int offset;
> +	int count;
> +	int npage;
> +	u64 *mtts;
> +	int end;
> +	int i;
> +
> +	offset = r->offset;
> +	end = offset + r->count;
> +	npage = 0;
> +	while (offset < end) {
> +		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
> +						  offset, &count, NULL);
> +		if (!mtts)
> +			return -ENOBUFS;
> +
> +		/* Save page addr, low 12 bits : 0 */
> +		for (i = 0; i < count; i++) {
> +			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
> +				mtts[i] = cpu_to_le64(bufs[npage] >>
> +							PAGE_ADDR_SHIFT);
> +			else
> +				mtts[i] = cpu_to_le64(bufs[npage]);
> +
> +			npage++;
> +		}
> +		offset += count;
> +	}
> +
> +	/* Memory barrier */
> +	mb();

Didn't we talk about this already? Comments for all memory barriers
have to be very good.

Be really sure you are using the right barrier type for the right
thing, because I won't take patches that get this stuff wrong.

> +int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
> +		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
> +{
> +	u64 *mtts = mtt_buf;
> +	int mtt_count;
> +	int total = 0;
> +	u64 *addr;
> +	int npage;
> +	int left;
> +
> +	if (mtts == NULL || mtt_max < 1)
> +		goto done;
> +
> +	left = mtt_max;
> +	while (left > 0) {
> +		mtt_count = 0;
> +		addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
> +						  offset + total,
> +						  &mtt_count, NULL);
> +		if (!addr || !mtt_count)
> +			goto done;
> +
> +		npage = min(mtt_count, left);
> +		memcpy(&mtts[total], addr, BA_BYTE_LEN * npage);
> +		left -= npage;
> +		total += npage;
> +	}
> +
> +done:
> +	if (base_addr)
> +		*base_addr = mtr->hem_list.root_ba;
> +
> +	return total;
> +}
> +EXPORT_SYMBOL_GPL(hns_roce_mtr_find);

Why is there an EXPROT_SYMBOL in a IB driver? I see many in
hns. Please send a patch to remove all of them and respin this.

Jason
wangxi June 8, 2019, 2:24 a.m. UTC | #2
在 2019/6/8 0:48, Jason Gunthorpe 写道:
> On Fri, May 31, 2019 at 02:57:45PM +0800, Lijun Ou wrote:
>> +
>> +static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev,
>> +			      struct hns_roce_mtr *mtr, dma_addr_t *bufs,
>> +			      struct hns_roce_buf_region *r)
>> +{
>> +	int offset;
>> +	int count;
>> +	int npage;
>> +	u64 *mtts;
>> +	int end;
>> +	int i;
>> +
>> +	offset = r->offset;
>> +	end = offset + r->count;
>> +	npage = 0;
>> +	while (offset < end) {
>> +		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
>> +						  offset, &count, NULL);
>> +		if (!mtts)
>> +			return -ENOBUFS;
>> +
>> +		/* Save page addr, low 12 bits : 0 */
>> +		for (i = 0; i < count; i++) {
>> +			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
>> +				mtts[i] = cpu_to_le64(bufs[npage] >>
>> +							PAGE_ADDR_SHIFT);
>> +			else
>> +				mtts[i] = cpu_to_le64(bufs[npage]);
>> +
>> +			npage++;
>> +		}
>> +		offset += count;
>> +	}
>> +
>> +	/* Memory barrier */
>> +	mb();
> 
> Didn't we talk about this already? Comments for all memory barriers
> have to be very good.
> 
> Be really sure you are using the right barrier type for the right
> thing, because I won't take patches that get this stuff wrong.
> 
ok,thanks, I will append more comments at here.
>> +int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
>> +		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
>> +{
>> +	u64 *mtts = mtt_buf;
>> +	int mtt_count;
>> +	int total = 0;
>> +	u64 *addr;
>> +	int npage;
>> +	int left;
>> +
>> +	if (mtts == NULL || mtt_max < 1)
>> +		goto done;
>> +
>> +	left = mtt_max;
>> +	while (left > 0) {
>> +		mtt_count = 0;
>> +		addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
>> +						  offset + total,
>> +						  &mtt_count, NULL);
>> +		if (!addr || !mtt_count)
>> +			goto done;
>> +
>> +		npage = min(mtt_count, left);
>> +		memcpy(&mtts[total], addr, BA_BYTE_LEN * npage);
>> +		left -= npage;
>> +		total += npage;
>> +	}
>> +
>> +done:
>> +	if (base_addr)
>> +		*base_addr = mtr->hem_list.root_ba;
>> +
>> +	return total;
>> +}
>> +EXPORT_SYMBOL_GPL(hns_roce_mtr_find);
> 
> Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> hns. Please send a patch to remove all of them and respin this.
> 
There are 2 modules in our ib driver, one is hns_roce.ko, another
is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
this function defined in hns_roce.ko, and invoked in hns_roce_hw_v2.ko.
> Jason
> _______________________________________________
> Linuxarm mailing list
> Linuxarm@huawei.com
> http://hulk.huawei.com/mailman/listinfo/linuxarm
> .
>
wangxi June 8, 2019, 7:05 a.m. UTC | #3
在 2019/6/8 0:48, Jason Gunthorpe 写道:
> On Fri, May 31, 2019 at 02:57:45PM +0800, Lijun Ou wrote:
>> +
>> +static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev,
>> +			      struct hns_roce_mtr *mtr, dma_addr_t *bufs,
>> +			      struct hns_roce_buf_region *r)
>> +{
>> +	int offset;
>> +	int count;
>> +	int npage;
>> +	u64 *mtts;
>> +	int end;
>> +	int i;
>> +
>> +	offset = r->offset;
>> +	end = offset + r->count;
>> +	npage = 0;
>> +	while (offset < end) {
>> +		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
>> +						  offset, &count, NULL);
>> +		if (!mtts)
>> +			return -ENOBUFS;
>> +
>> +		/* Save page addr, low 12 bits : 0 */
>> +		for (i = 0; i < count; i++) {
>> +			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
>> +				mtts[i] = cpu_to_le64(bufs[npage] >>
>> +							PAGE_ADDR_SHIFT);
>> +			else
>> +				mtts[i] = cpu_to_le64(bufs[npage]);
>> +
>> +			npage++;
>> +		}
>> +		offset += count;
>> +	}
>> +
>> +	/* Memory barrier */
>> +	mb();
> 
> Didn't we talk about this already? Comments for all memory barriers
> have to be very good.
> 
> Be really sure you are using the right barrier type for the right
> thing, because I won't take patches that get this stuff wrong.
> 
Very thanks for pointing out this problem. After analyzing the code flow,
I found that here just need a wmb, and there is already a wmb operation
after this function is called. so I will delete it directly.

>> +int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
>> +		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
>> +{
>> +	u64 *mtts = mtt_buf;
>> +	int mtt_count;
>> +	int total = 0;
>> +	u64 *addr;
>> +	int npage;
>> +	int left;
>> +
>> +	if (mtts == NULL || mtt_max < 1)
>> +		goto done;
>> +
>> +	left = mtt_max;
>> +	while (left > 0) {
>> +		mtt_count = 0;
>> +		addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
>> +						  offset + total,
>> +						  &mtt_count, NULL);
>> +		if (!addr || !mtt_count)
>> +			goto done;
>> +
>> +		npage = min(mtt_count, left);
>> +		memcpy(&mtts[total], addr, BA_BYTE_LEN * npage);
>> +		left -= npage;
>> +		total += npage;
>> +	}
>> +
>> +done:
>> +	if (base_addr)
>> +		*base_addr = mtr->hem_list.root_ba;
>> +
>> +	return total;
>> +}
>> +EXPORT_SYMBOL_GPL(hns_roce_mtr_find);
> 
> Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> hns. Please send a patch to remove all of them and respin this.
> 
> Jason
> _______________________________________________
> Linuxarm mailing list
> Linuxarm@huawei.com
> http://hulk.huawei.com/mailman/listinfo/linuxarm
> .
>
Jason Gunthorpe June 10, 2019, 1:27 p.m. UTC | #4
On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:

> > Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> > hns. Please send a patch to remove all of them and respin this.
> > 
> There are 2 modules in our ib driver, one is hns_roce.ko, another
> is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
> this function defined in hns_roce.ko, and invoked in
> hns_roce_hw_v2.ko.

seems unnecessarily complicated

Jason
wangxi June 11, 2019, 2:37 a.m. UTC | #5
在 2019/6/10 21:27, Jason Gunthorpe 写道:
> On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
> 
>>> Why is there an EXPROT_SYMBOL in a IB driver? I see many in
>>> hns. Please send a patch to remove all of them and respin this.
>>>
>> There are 2 modules in our ib driver, one is hns_roce.ko, another
>> is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
>> this function defined in hns_roce.ko, and invoked in
>> hns_roce_hw_v2.ko.
> 
> seems unnecessarily complicated
> 
> Jason
> .
> 
Hi,Jason,

The hns ib driver was originally designed for the hip06. When designing the
driver for the new hardware hip08, in order to maximize the reuse of the
existing hip06 code, the common part of the code is separated into the
hns_roce.ko, and the hardware difference code is defined into hns_roce_hw_v1.ko
for hip06 and hns_roce_hw_v2.ko for hip08.

The mtr code is designed as a public part in this patchset, so it is defined
in hns_roce.ko. It can be used for hi16xx series hardware with mixed mutihop
addressing feature. Currently, hip08 supports this feature, so it is be called
in hns_roce_hw_v2.ko.

Xi Wang
Leon Romanovsky June 11, 2019, 5:56 a.m. UTC | #6
On Tue, Jun 11, 2019 at 10:37:48AM +0800, wangxi wrote:
>
>
> 在 2019/6/10 21:27, Jason Gunthorpe 写道:
> > On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
> >
> >>> Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> >>> hns. Please send a patch to remove all of them and respin this.
> >>>
> >> There are 2 modules in our ib driver, one is hns_roce.ko, another
> >> is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
> >> this function defined in hns_roce.ko, and invoked in
> >> hns_roce_hw_v2.ko.
> >
> > seems unnecessarily complicated
> >
> > Jason
> > .
> >
> Hi,Jason,
>
> The hns ib driver was originally designed for the hip06. When designing the
> driver for the new hardware hip08, in order to maximize the reuse of the
> existing hip06 code, the common part of the code is separated into the
> hns_roce.ko, and the hardware difference code is defined into hns_roce_hw_v1.ko
> for hip06 and hns_roce_hw_v2.ko for hip08.
>
> The mtr code is designed as a public part in this patchset, so it is defined
> in hns_roce.ko. It can be used for hi16xx series hardware with mixed mutihop
> addressing feature. Currently, hip08 supports this feature, so it is be called
> in hns_roce_hw_v2.ko.

Combine v1 and v2 into one driver (.ko) and change initialization to
call v1 or v2 accordingly. The rest is handled by ib_device_ops
structure.

Thanks

>
> Xi Wang
>
John Garry June 12, 2019, 8:51 a.m. UTC | #7
On 11/06/2019 06:56, Leon Romanovsky wrote:
> On Tue, Jun 11, 2019 at 10:37:48AM +0800, wangxi wrote:
>>
>>
>> 在 2019/6/10 21:27, Jason Gunthorpe 写道:
>>> On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
>>>
>>>>> Why is there an EXPROT_SYMBOL in a IB driver? I see many in
>>>>> hns. Please send a patch to remove all of them and respin this.
>>>>>
>>>> There are 2 modules in our ib driver, one is hns_roce.ko, another
>>>> is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
>>>> this function defined in hns_roce.ko, and invoked in
>>>> hns_roce_hw_v2.ko.
>>>
>>> seems unnecessarily complicated
>>>
>>> Jason
>>> .
>>>
>> Hi,Jason,
>>
>> The hns ib driver was originally designed for the hip06. When designing the
>> driver for the new hardware hip08, in order to maximize the reuse of the
>> existing hip06 code, the common part of the code is separated into the
>> hns_roce.ko, and the hardware difference code is defined into hns_roce_hw_v1.ko
>> for hip06 and hns_roce_hw_v2.ko for hip08.
>>
>> The mtr code is designed as a public part in this patchset, so it is defined
>> in hns_roce.ko. It can be used for hi16xx series hardware with mixed mutihop
>> addressing feature. Currently, hip08 supports this feature, so it is be called
>> in hns_roce_hw_v2.ko.
>
> Combine v1 and v2 into one driver (.ko) and change initialization to
> call v1 or v2 accordingly. The rest is handled by ib_device_ops
> structure.
>

Is there a rule which says that a driver cannot export symbols? Module 
stacking is useful for more complex drivers, in that a hw-specific 
implementation may plug into common driver. This helps code reuse.

In addition to this, v1 hw is a platform device driver and depends on 
HNS, while v2 hw is for a PCI device and depends on PCI && HNS3. 
Attempts to combine into a single ko would introduce messy dependencies 
and ifdefs.

Thanks,
John


> Thanks
>
>>
>> Xi Wang
>>
> _______________________________________________
> Linuxarm mailing list
> Linuxarm@huawei.com
> http://hulk.huawei.com/mailman/listinfo/linuxarm
>
Jason Gunthorpe June 12, 2019, 11:52 a.m. UTC | #8
On Wed, Jun 12, 2019 at 09:51:59AM +0100, John Garry wrote:
> On 11/06/2019 06:56, Leon Romanovsky wrote:
> > On Tue, Jun 11, 2019 at 10:37:48AM +0800, wangxi wrote:
> > > 
> > > 
> > > 在 2019/6/10 21:27, Jason Gunthorpe 写道:
> > > > On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
> > > > 
> > > > > > Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> > > > > > hns. Please send a patch to remove all of them and respin this.
> > > > > > 
> > > > > There are 2 modules in our ib driver, one is hns_roce.ko, another
> > > > > is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
> > > > > this function defined in hns_roce.ko, and invoked in
> > > > > hns_roce_hw_v2.ko.
> > > > 
> > > > seems unnecessarily complicated
> > > > 
> > > > Jason
> > > > .
> > > > 
> > > Hi,Jason,
> > > 
> > > The hns ib driver was originally designed for the hip06. When designing the
> > > driver for the new hardware hip08, in order to maximize the reuse of the
> > > existing hip06 code, the common part of the code is separated into the
> > > hns_roce.ko, and the hardware difference code is defined into hns_roce_hw_v1.ko
> > > for hip06 and hns_roce_hw_v2.ko for hip08.
> > > 
> > > The mtr code is designed as a public part in this patchset, so it is defined
> > > in hns_roce.ko. It can be used for hi16xx series hardware with mixed mutihop
> > > addressing feature. Currently, hip08 supports this feature, so it is be called
> > > in hns_roce_hw_v2.ko.
> > 
> > Combine v1 and v2 into one driver (.ko) and change initialization to
> > call v1 or v2 accordingly. The rest is handled by ib_device_ops
> > structure.
> > 
> 
> Is there a rule which says that a driver cannot export symbols? Module
> stacking is useful for more complex drivers, in that a hw-specific
> implementation may plug into common driver. This helps code reuse.

Generally we do not like to see leaf drivers be so complicated that
they need to export symbols. A multi-module driver is generally an
over engineered thing to do, few drivers would be so big for that to
make any sense.

> In addition to this, v1 hw is a platform device driver and depends on HNS,
> while v2 hw is for a PCI device and depends on PCI && HNS3. Attempts to
> combine into a single ko would introduce messy dependencies and ifdefs.

I suspect it would not be any different from how it is today. Do
everything the same, just have one module not three. module_init/etc
already take care of conditional compilation of the entire .c file via
Makefile

Jason
John Garry June 12, 2019, 1:29 p.m. UTC | #9
>>> Combine v1 and v2 into one driver (.ko) and change initialization to
>>> call v1 or v2 accordingly. The rest is handled by ib_device_ops
>>> structure.
>>>
>>
>> Is there a rule which says that a driver cannot export symbols? Module
>> stacking is useful for more complex drivers, in that a hw-specific
>> implementation may plug into common driver. This helps code reuse.
>

Hi Jason,

> Generally we do not like to see leaf drivers be so complicated that
> they need to export symbols.A multi-module driver is generally an
> over engineered thing to do, few drivers would be so big for that to
> make any sense.
>

I wouldn't say that any driver needs to have multiple modules simply 
because it is big.

But in the case of this driver, the requirement of supporting different 
hw revisions, dependencies on different network drivers, and also the 
need to support either platform or pci device driver was the motivation.

>> In addition to this, v1 hw is a platform device driver and depends on HNS,
>> while v2 hw is for a PCI device and depends on PCI && HNS3. Attempts to
>> combine into a single ko would introduce messy dependencies and ifdefs.
>
> I suspect it would not be any different from how it is today. Do
> everything the same, just have one module not three. module_init/etc
> already take care of conditional compilation of the entire .c file via
> Makefile
>

We can try it, but I'm not sure how much the complexity of the driver 
will be reduced.

Cheers,
John

> Jason
>
> .
>
Salil Mehta June 12, 2019, 3:12 p.m. UTC | #10
> From: linux-rdma-owner@vger.kernel.org
> [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> Sent: Wednesday, June 12, 2019 12:52 PM
> To: John Garry <john.garry@huawei.com>
> Cc: Leon Romanovsky <leon@kernel.org>; wangxi (M) <wangxi11@huawei.com>;
> linux-rdma@vger.kernel.org; dledford@redhat.com; Linuxarm
> <linuxarm@huawei.com>
> Subject: Re: [PATCH V4 for-next 1/3] RDMA/hns: Add mtr support for mixed
> multihop addressing
> 
> On Wed, Jun 12, 2019 at 09:51:59AM +0100, John Garry wrote:
> > On 11/06/2019 06:56, Leon Romanovsky wrote:
> > > On Tue, Jun 11, 2019 at 10:37:48AM +0800, wangxi wrote:
> > > >
> > > >
> > > > 在 2019/6/10 21:27, Jason Gunthorpe 写道:
> > > > > On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
> > > > >
> > > > > > > Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> > > > > > > hns. Please send a patch to remove all of them and respin this.
> > > > > > >
> > > > > > There are 2 modules in our ib driver, one is hns_roce.ko, another
> > > > > > is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
> > > > > > this function defined in hns_roce.ko, and invoked in
> > > > > > hns_roce_hw_v2.ko.

[...]

> > In addition to this, v1 hw is a platform device driver and depends on HNS,
> > while v2 hw is for a PCI device and depends on PCI && HNS3. Attempts to
> > combine into a single ko would introduce messy dependencies and ifdefs.
> 
> I suspect it would not be any different from how it is today. Do
> everything the same, just have one module not three. module_init/etc
> already take care of conditional compilation of the entire .c file via
> Makefile

Okay. I see your point. 

As I understand, the code within v1 and v2 will almost never be required on
the same system since they belong to different types of hardware (platform/PCI).
Dependency between V1, V2 and common code is something like below

         [V1]    [V2]
          |         |
          |_______|
               |
         hns_roce_xxx

Therefore, we could always make V1+[hns_roce_xxx] OR V2+[hns_roce_xxx] as
single module .ko at compile time by making them optional in Makefile.

Salil.
Jason Gunthorpe June 12, 2019, 4:22 p.m. UTC | #11
On Wed, Jun 12, 2019 at 03:12:24PM +0000, Salil Mehta wrote:
> > From: linux-rdma-owner@vger.kernel.org
> > [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> > Sent: Wednesday, June 12, 2019 12:52 PM
> > To: John Garry <john.garry@huawei.com>
> > Cc: Leon Romanovsky <leon@kernel.org>; wangxi (M) <wangxi11@huawei.com>;
> > linux-rdma@vger.kernel.org; dledford@redhat.com; Linuxarm
> > <linuxarm@huawei.com>
> > Subject: Re: [PATCH V4 for-next 1/3] RDMA/hns: Add mtr support for mixed
> > multihop addressing
> > 
> > On Wed, Jun 12, 2019 at 09:51:59AM +0100, John Garry wrote:
> > > On 11/06/2019 06:56, Leon Romanovsky wrote:
> > > > On Tue, Jun 11, 2019 at 10:37:48AM +0800, wangxi wrote:
> > > > >
> > > > >
> > > > > 在 2019/6/10 21:27, Jason Gunthorpe 写道:
> > > > > > On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
> > > > > >
> > > > > > > > Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> > > > > > > > hns. Please send a patch to remove all of them and respin this.
> > > > > > > >
> > > > > > > There are 2 modules in our ib driver, one is hns_roce.ko, another
> > > > > > > is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
> > > > > > > this function defined in hns_roce.ko, and invoked in
> > > > > > > hns_roce_hw_v2.ko.
> 
> [...]
> 
> > > In addition to this, v1 hw is a platform device driver and depends on HNS,
> > > while v2 hw is for a PCI device and depends on PCI && HNS3. Attempts to
> > > combine into a single ko would introduce messy dependencies and ifdefs.
> > 
> > I suspect it would not be any different from how it is today. Do
> > everything the same, just have one module not three. module_init/etc
> > already take care of conditional compilation of the entire .c file via
> > Makefile
> 
> Okay. I see your point. 
> 
> As I understand, the code within v1 and v2 will almost never be required on
> the same system since they belong to different types of hardware (platform/PCI).
> Dependency between V1, V2 and common code is something like below

platform and PCI can co-exist in the kernel, there is no reason to
worry about them co-existing in the same driver other than loaded code
size, which doesn't seem like a big deal here.

About the only thing that might give me pause is that v2 depends on a
different ethernet module than v1.. But that depends alot on size too,
there is no harm to load a useless ethernet module if it is not large.

Jason
Salil Mehta June 13, 2019, 10:14 a.m. UTC | #12
> From: linux-rdma-owner@vger.kernel.org
> [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> Sent: Wednesday, June 12, 2019 5:23 PM
> To: Salil Mehta <salil.mehta@huawei.com>
> 
> On Wed, Jun 12, 2019 at 03:12:24PM +0000, Salil Mehta wrote:
> > > From: linux-rdma-owner@vger.kernel.org
> > > [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> > > Sent: Wednesday, June 12, 2019 12:52 PM
> > > To: John Garry <john.garry@huawei.com>
> > > Cc: Leon Romanovsky <leon@kernel.org>; wangxi (M) <wangxi11@huawei.com>;
> > > linux-rdma@vger.kernel.org; dledford@redhat.com; Linuxarm
> > > <linuxarm@huawei.com>
> > > Subject: Re: [PATCH V4 for-next 1/3] RDMA/hns: Add mtr support for mixed
> > > multihop addressing
> > >
> > > On Wed, Jun 12, 2019 at 09:51:59AM +0100, John Garry wrote:
> > > > On 11/06/2019 06:56, Leon Romanovsky wrote:
> > > > > On Tue, Jun 11, 2019 at 10:37:48AM +0800, wangxi wrote:
> > > > > >
> > > > > >
> > > > > > 在 2019/6/10 21:27, Jason Gunthorpe 写道:
> > > > > > > On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
> > > > > > >
> > > > > > > > > Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> > > > > > > > > hns. Please send a patch to remove all of them and respin this.
> > > > > > > > >
> > > > > > > > There are 2 modules in our ib driver, one is hns_roce.ko, another
> > > > > > > > is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
> > > > > > > > this function defined in hns_roce.ko, and invoked in
> > > > > > > > hns_roce_hw_v2.ko.
> >
> > [...]
> >
> > > > In addition to this, v1 hw is a platform device driver and depends on HNS,
> > > > while v2 hw is for a PCI device and depends on PCI && HNS3. Attempts to
> > > > combine into a single ko would introduce messy dependencies and ifdefs.
> > >
> > > I suspect it would not be any different from how it is today. Do
> > > everything the same, just have one module not three. module_init/etc
> > > already take care of conditional compilation of the entire .c file via
> > > Makefile
> >
> > Okay. I see your point.
> >
> > As I understand, the code within v1 and v2 will almost never be required on
> > the same system since they belong to different types of hardware (platform/PCI).
> > Dependency between V1, V2 and common code is something like below
> 
> platform and PCI can co-exist in the kernel, there is no reason to
> worry about them co-existing in the same driver other than loaded code
> size, which doesn't seem like a big deal here.


Ok. I take your point. If we have an options then we would rather prefer to
reduce the size as well other than converting it to a single .ko.


> About the only thing that might give me pause is that v2 depends on a
> different ethernet module than v1.. But that depends alot on size too,
> there is no harm to load a useless ethernet module if it is not large.

Yes, you are right. That is another concern. It would be very messy
to load HNS & HNS3 Ethernet driver together when we know they will not
be used and have big differences in their hardware. Also, HNS3 Ethernet
driver is big in size.

As such,
           V1 RoCE ---> HNS Ethernet Driver
           V2 RoCE ---> HNS3 Ethernet Driver


Salil.
Jason Gunthorpe June 13, 2019, 11:59 p.m. UTC | #13
On Thu, Jun 13, 2019 at 10:14:18AM +0000, Salil Mehta wrote:
> > From: linux-rdma-owner@vger.kernel.org
> > [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> > Sent: Wednesday, June 12, 2019 5:23 PM
> > To: Salil Mehta <salil.mehta@huawei.com>
> > 
> > On Wed, Jun 12, 2019 at 03:12:24PM +0000, Salil Mehta wrote:
> > > > From: linux-rdma-owner@vger.kernel.org
> > > > [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Jason Gunthorpe
> > > > Sent: Wednesday, June 12, 2019 12:52 PM
> > > > To: John Garry <john.garry@huawei.com>
> > > > Cc: Leon Romanovsky <leon@kernel.org>; wangxi (M) <wangxi11@huawei.com>;
> > > > linux-rdma@vger.kernel.org; dledford@redhat.com; Linuxarm
> > > > <linuxarm@huawei.com>
> > > > Subject: Re: [PATCH V4 for-next 1/3] RDMA/hns: Add mtr support for mixed
> > > > multihop addressing
> > > >
> > > > On Wed, Jun 12, 2019 at 09:51:59AM +0100, John Garry wrote:
> > > > > On 11/06/2019 06:56, Leon Romanovsky wrote:
> > > > > > On Tue, Jun 11, 2019 at 10:37:48AM +0800, wangxi wrote:
> > > > > > >
> > > > > > >
> > > > > > > 在 2019/6/10 21:27, Jason Gunthorpe 写道:
> > > > > > > > On Sat, Jun 08, 2019 at 10:24:15AM +0800, wangxi wrote:
> > > > > > > >
> > > > > > > > > > Why is there an EXPROT_SYMBOL in a IB driver? I see many in
> > > > > > > > > > hns. Please send a patch to remove all of them and respin this.
> > > > > > > > > >
> > > > > > > > > There are 2 modules in our ib driver, one is hns_roce.ko, another
> > > > > > > > > is hns_roce_hw_v2.ko. all extern symbols are named like hns_roce_xxx,
> > > > > > > > > this function defined in hns_roce.ko, and invoked in
> > > > > > > > > hns_roce_hw_v2.ko.
> > >
> > > [...]
> > >
> > > > > In addition to this, v1 hw is a platform device driver and depends on HNS,
> > > > > while v2 hw is for a PCI device and depends on PCI && HNS3. Attempts to
> > > > > combine into a single ko would introduce messy dependencies and ifdefs.
> > > >
> > > > I suspect it would not be any different from how it is today. Do
> > > > everything the same, just have one module not three. module_init/etc
> > > > already take care of conditional compilation of the entire .c file via
> > > > Makefile
> > >
> > > Okay. I see your point.
> > >
> > > As I understand, the code within v1 and v2 will almost never be required on
> > > the same system since they belong to different types of hardware (platform/PCI).
> > > Dependency between V1, V2 and common code is something like below
> > 
> > platform and PCI can co-exist in the kernel, there is no reason to
> > worry about them co-existing in the same driver other than loaded code
> > size, which doesn't seem like a big deal here.
> 
> 
> Ok. I take your point. If we have an options then we would rather prefer to
> reduce the size as well other than converting it to a single .ko.

Generally modules should be reasonable, in this case the hns RDMA
driver looks to be around maybe 100k, which is not really big enough to
be worth splitting IMHO.
> 
> > About the only thing that might give me pause is that v2 depends on a
> > different ethernet module than v1.. But that depends alot on size too,
> > there is no harm to load a useless ethernet module if it is not large.
> 
> Yes, you are right. That is another concern. It would be very messy
> to load HNS & HNS3 Ethernet driver together when we know they will not
> be used and have big differences in their hardware. Also, HNS3 Ethernet
> driver is big in size.
> 
> As such,
>            V1 RoCE ---> HNS Ethernet Driver
>            V2 RoCE ---> HNS3 Ethernet Driver

This is a reasonable explanation

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index ce23338..0ec5a24 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -341,6 +341,29 @@  struct hns_roce_mtt {
 	enum hns_roce_mtt_type	mtt_type;
 };
 
+struct hns_roce_buf_region {
+	int offset; /* page offset */
+	u32 count; /* page count*/
+	int hopnum; /* addressing hop num */
+};
+
+#define HNS_ROCE_MAX_BT_REGION	3
+#define HNS_ROCE_MAX_BT_LEVEL	3
+struct hns_roce_hem_list {
+	struct list_head root_bt;
+	/* link all bt dma mem by hop config */
+	struct list_head mid_bt[HNS_ROCE_MAX_BT_REGION][HNS_ROCE_MAX_BT_LEVEL];
+	struct list_head btm_bt; /* link all bottom bt in @mid_bt */
+	dma_addr_t root_ba; /* pointer to the root ba table */
+	int bt_pg_shift;
+};
+
+/* memory translate region */
+struct hns_roce_mtr {
+	struct hns_roce_hem_list hem_list;
+	int buf_pg_shift;
+};
+
 struct hns_roce_mw {
 	struct ib_mw		ibmw;
 	u32			pdn;
@@ -1111,6 +1134,19 @@  void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev,
 int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev,
 			   struct hns_roce_mtt *mtt, struct hns_roce_buf *buf);
 
+void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
+		       int buf_pg_shift);
+int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			dma_addr_t **bufs, struct hns_roce_buf_region *regions,
+			int region_cnt);
+void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_mtr *mtr);
+
+/* hns roce hw need current block and next block addr from mtt */
+#define MTT_MIN_COUNT	 2
+int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr);
+
 int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index 157c84a..8006ef4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -1157,3 +1157,463 @@  void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev)
 					   &hr_dev->mr_table.mtt_cqe_table);
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table);
 }
+
+struct roce_hem_item {
+	struct list_head list; /* link all hems in the same bt level */
+	struct list_head sibling; /* link all hems in last hop for mtt */
+	void *addr;
+	dma_addr_t dma_addr;
+	size_t count; /* max ba numbers */
+	int start; /* start buf offset in this hem */
+	int end; /* end buf offset in this hem */
+};
+
+static struct roce_hem_item *hem_list_alloc_item(struct hns_roce_dev *hr_dev,
+						   int start, int end,
+						   int count, bool exist_bt,
+						   int bt_level)
+{
+	struct roce_hem_item *hem;
+
+	hem = kzalloc(sizeof(*hem), GFP_KERNEL);
+	if (!hem)
+		return NULL;
+
+	if (exist_bt) {
+		hem->addr = dma_alloc_coherent(hr_dev->dev,
+						   count * BA_BYTE_LEN,
+						   &hem->dma_addr, GFP_KERNEL);
+		if (!hem->addr) {
+			kfree(hem);
+			return NULL;
+		}
+	}
+
+	hem->count = count;
+	hem->start = start;
+	hem->end = end;
+	INIT_LIST_HEAD(&hem->list);
+	INIT_LIST_HEAD(&hem->sibling);
+
+	return hem;
+}
+
+static void hem_list_free_item(struct hns_roce_dev *hr_dev,
+			       struct roce_hem_item *hem, bool exist_bt)
+{
+	if (exist_bt)
+		dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN,
+				  hem->addr, hem->dma_addr);
+	kfree(hem);
+}
+
+static void hem_list_free_all(struct hns_roce_dev *hr_dev,
+			      struct list_head *head, bool exist_bt)
+{
+	struct roce_hem_item *hem, *temp_hem;
+
+	list_for_each_entry_safe(hem, temp_hem, head, list) {
+		list_del(&hem->list);
+		hem_list_free_item(hr_dev, hem, exist_bt);
+	}
+}
+
+static void hem_list_link_bt(struct hns_roce_dev *hr_dev, void *base_addr,
+			     u64 table_addr)
+{
+	*(u64 *)(base_addr) = table_addr;
+}
+
+/* assign L0 table address to hem from root bt */
+static void hem_list_assign_bt(struct hns_roce_dev *hr_dev,
+			       struct roce_hem_item *hem, void *cpu_addr,
+			       u64 phy_addr)
+{
+	hem->addr = cpu_addr;
+	hem->dma_addr = (dma_addr_t)phy_addr;
+}
+
+static inline bool hem_list_page_is_in_range(struct roce_hem_item *hem,
+					     int offset)
+{
+	return (hem->start <= offset && offset <= hem->end);
+}
+
+static struct roce_hem_item *hem_list_search_item(struct list_head *ba_list,
+						    int page_offset)
+{
+	struct roce_hem_item *hem, *temp_hem;
+	struct roce_hem_item *found = NULL;
+
+	list_for_each_entry_safe(hem, temp_hem, ba_list, list) {
+		if (hem_list_page_is_in_range(hem, page_offset)) {
+			found = hem;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static bool hem_list_is_bottom_bt(int hopnum, int bt_level)
+{
+	/*
+	 * hopnum    base address table levels
+	 * 0		L0(buf)
+	 * 1		L0 -> buf
+	 * 2		L0 -> L1 -> buf
+	 * 3		L0 -> L1 -> L2 -> buf
+	 */
+	return bt_level >= (hopnum ? hopnum - 1 : hopnum);
+}
+
+/**
+ * calc base address entries num
+ * @hopnum: num of mutihop addressing
+ * @bt_level: base address table level
+ * @unit: ba entries per bt page
+ */
+static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit)
+{
+	u32 step;
+	int max;
+	int i;
+
+	if (hopnum <= bt_level)
+		return 0;
+	/*
+	 * hopnum  bt_level   range
+	 * 1	      0       unit
+	 * ------------
+	 * 2	      0       unit * unit
+	 * 2	      1       unit
+	 * ------------
+	 * 3	      0       unit * unit * unit
+	 * 3	      1       unit * unit
+	 * 3	      2       unit
+	 */
+	step = 1;
+	max = hopnum - bt_level;
+	for (i = 0; i < max; i++)
+		step = step * unit;
+
+	return step;
+}
+
+/**
+ * calc the root ba entries which could cover all regions
+ * @regions: buf region array
+ * @region_cnt: array size of @regions
+ * @unit: ba entries per bt page
+ */
+int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
+				   int region_cnt, int unit)
+{
+	struct hns_roce_buf_region *r;
+	int total = 0;
+	int step;
+	int i;
+
+	for (i = 0; i < region_cnt; i++) {
+		r = (struct hns_roce_buf_region *)&regions[i];
+		if (r->hopnum > 1) {
+			step = hem_list_calc_ba_range(r->hopnum, 1, unit);
+			if (step > 0)
+				total += (r->count + step - 1) / step;
+		} else {
+			total += r->count;
+		}
+	}
+
+	return total;
+}
+
+static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev,
+				 const struct hns_roce_buf_region *r, int unit,
+				 int offset, struct list_head *mid_bt,
+				 struct list_head *btm_bt)
+{
+	struct roce_hem_item *hem_ptrs[HNS_ROCE_MAX_BT_LEVEL] = { NULL };
+	struct list_head temp_list[HNS_ROCE_MAX_BT_LEVEL];
+	struct roce_hem_item *cur, *pre;
+	const int hopnum = r->hopnum;
+	int start_aligned;
+	int distance;
+	int ret = 0;
+	int max_ofs;
+	int level;
+	u32 step;
+	int end;
+
+	if (hopnum <= 1)
+		return 0;
+
+	if (hopnum > HNS_ROCE_MAX_BT_LEVEL) {
+		dev_err(hr_dev->dev, "invalid hopnum %d!\n", hopnum);
+		return -EINVAL;
+	}
+
+	if (offset < r->offset) {
+		dev_err(hr_dev->dev, "invalid offset %d,min %d!\n",
+			offset, r->offset);
+		return -EINVAL;
+	}
+
+	distance = offset - r->offset;
+	max_ofs = r->offset + r->count - 1;
+	for (level = 0; level < hopnum; level++)
+		INIT_LIST_HEAD(&temp_list[level]);
+
+	/* config L1 bt to last bt and link them to corresponding parent */
+	for (level = 1; level < hopnum; level++) {
+		cur = hem_list_search_item(&mid_bt[level], offset);
+		if (cur) {
+			hem_ptrs[level] = cur;
+			continue;
+		}
+
+		step = hem_list_calc_ba_range(hopnum, level, unit);
+		if (step < 1) {
+			ret = -EINVAL;
+			goto err_exit;
+		}
+
+		start_aligned = (distance / step) * step + r->offset;
+		end = min_t(int, start_aligned + step - 1, max_ofs);
+		cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit,
+					  true, level);
+		if (!cur) {
+			ret = -ENOMEM;
+			goto err_exit;
+		}
+		hem_ptrs[level] = cur;
+		list_add(&cur->list, &temp_list[level]);
+		if (hem_list_is_bottom_bt(hopnum, level))
+			list_add(&cur->sibling, &temp_list[0]);
+
+		/* link bt to parent bt */
+		if (level > 1) {
+			pre = hem_ptrs[level - 1];
+			step = (cur->start - pre->start) / step * BA_BYTE_LEN;
+			hem_list_link_bt(hr_dev, pre->addr + step,
+					 cur->dma_addr);
+		}
+	}
+
+	list_splice(&temp_list[0], btm_bt);
+	for (level = 1; level < hopnum; level++)
+		list_splice(&temp_list[level], &mid_bt[level]);
+
+	return 0;
+
+err_exit:
+	for (level = 1; level < hopnum; level++)
+		hem_list_free_all(hr_dev, &temp_list[level], true);
+
+	return ret;
+}
+
+static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev,
+				  struct hns_roce_hem_list *hem_list, int unit,
+				  const struct hns_roce_buf_region *regions,
+				  int region_cnt)
+{
+	struct roce_hem_item *hem, *temp_hem, *root_hem;
+	struct list_head temp_list[HNS_ROCE_MAX_BT_REGION];
+	const struct hns_roce_buf_region *r;
+	struct list_head temp_root;
+	struct list_head temp_btm;
+	void *cpu_base;
+	u64 phy_base;
+	int ret = 0;
+	int offset;
+	int total;
+	int step;
+	int i;
+
+	r = &regions[0];
+	root_hem = hem_list_search_item(&hem_list->root_bt, r->offset);
+	if (root_hem)
+		return 0;
+
+	INIT_LIST_HEAD(&temp_root);
+	total = r->offset;
+	/* indicate to last region */
+	r = &regions[region_cnt - 1];
+	root_hem = hem_list_alloc_item(hr_dev, total, r->offset + r->count - 1,
+				       unit, true, 0);
+	if (!root_hem)
+		return -ENOMEM;
+	list_add(&root_hem->list, &temp_root);
+
+	hem_list->root_ba = root_hem->dma_addr;
+
+	INIT_LIST_HEAD(&temp_btm);
+	for (i = 0; i < region_cnt; i++)
+		INIT_LIST_HEAD(&temp_list[i]);
+
+	total = 0;
+	for (i = 0; i < region_cnt && total < unit; i++) {
+		r = &regions[i];
+		if (!r->count)
+			continue;
+
+		/* all regions's mid[x][0] shared the root_bt's trunk */
+		cpu_base = root_hem->addr + total * BA_BYTE_LEN;
+		phy_base = root_hem->dma_addr + total * BA_BYTE_LEN;
+
+		/* if hopnum is 0 or 1, cut a new fake hem from the root bt
+		 * which's address share to all regions.
+		 */
+		if (hem_list_is_bottom_bt(r->hopnum, 0)) {
+			hem = hem_list_alloc_item(hr_dev, r->offset,
+						  r->offset + r->count - 1,
+						  r->count, false, 0);
+			if (!hem) {
+				ret = -ENOMEM;
+				goto err_exit;
+			}
+			hem_list_assign_bt(hr_dev, hem, cpu_base, phy_base);
+			list_add(&hem->list, &temp_list[i]);
+			list_add(&hem->sibling, &temp_btm);
+			total += r->count;
+		} else {
+			step = hem_list_calc_ba_range(r->hopnum, 1, unit);
+			if (step < 1) {
+				ret = -EINVAL;
+				goto err_exit;
+			}
+			/* if exist mid bt, link L1 to L0 */
+			list_for_each_entry_safe(hem, temp_hem,
+					  &hem_list->mid_bt[i][1], list) {
+				offset = hem->start / step * BA_BYTE_LEN;
+				hem_list_link_bt(hr_dev, cpu_base + offset,
+						 hem->dma_addr);
+				total++;
+			}
+		}
+	}
+
+	list_splice(&temp_btm, &hem_list->btm_bt);
+	list_splice(&temp_root, &hem_list->root_bt);
+	for (i = 0; i < region_cnt; i++)
+		list_splice(&temp_list[i], &hem_list->mid_bt[i][0]);
+
+	return 0;
+
+err_exit:
+	for (i = 0; i < region_cnt; i++)
+		hem_list_free_all(hr_dev, &temp_list[i], false);
+
+	hem_list_free_all(hr_dev, &temp_root, true);
+
+	return ret;
+}
+
+/* construct the base address table and link them by address hop config */
+int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_hem_list *hem_list,
+			      const struct hns_roce_buf_region *regions,
+			      int region_cnt)
+{
+	const struct hns_roce_buf_region *r;
+	int ofs, end;
+	int ret = 0;
+	int unit;
+	int i;
+
+	if (region_cnt > HNS_ROCE_MAX_BT_REGION) {
+		dev_err(hr_dev->dev, "invalid region region_cnt %d!\n",
+			region_cnt);
+		return -EINVAL;
+	}
+
+	unit = (1 << hem_list->bt_pg_shift) / BA_BYTE_LEN;
+	for (i = 0; i < region_cnt; i++) {
+		r = &regions[i];
+		if (!r->count)
+			continue;
+
+		end = r->offset + r->count;
+		for (ofs = r->offset; ofs < end; ofs += unit) {
+			ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs,
+						    hem_list->mid_bt[i],
+						    &hem_list->btm_bt);
+			if (ret) {
+				dev_err(hr_dev->dev,
+					"alloc hem trunk fail ret=%d!\n", ret);
+				goto err_alloc;
+			}
+		}
+	}
+
+	ret = hem_list_alloc_root_bt(hr_dev, hem_list, unit, regions,
+				     region_cnt);
+	if (ret)
+		dev_err(hr_dev->dev, "alloc hem root fail ret=%d!\n", ret);
+	else
+		return 0;
+
+err_alloc:
+	hns_roce_hem_list_release(hr_dev, hem_list);
+
+	return ret;
+}
+
+void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_hem_list *hem_list)
+{
+	int i, j;
+
+	for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
+		for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++)
+			hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j],
+					  j != 0);
+
+	hem_list_free_all(hr_dev, &hem_list->root_bt, true);
+	INIT_LIST_HEAD(&hem_list->btm_bt);
+	hem_list->root_ba = 0;
+}
+
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
+			    int bt_page_order)
+{
+	int i, j;
+
+	INIT_LIST_HEAD(&hem_list->root_bt);
+	INIT_LIST_HEAD(&hem_list->btm_bt);
+	for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
+		for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++)
+			INIT_LIST_HEAD(&hem_list->mid_bt[i][j]);
+
+	hem_list->bt_pg_shift = bt_page_order;
+}
+
+void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
+				 struct hns_roce_hem_list *hem_list,
+				 int offset, int *mtt_cnt, u64 *phy_addr)
+{
+	struct list_head *head = &hem_list->btm_bt;
+	struct roce_hem_item *hem, *temp_hem;
+	void *cpu_base = NULL;
+	u64 phy_base = 0;
+	int nr = 0;
+
+	list_for_each_entry_safe(hem, temp_hem, head, sibling) {
+		if (hem_list_page_is_in_range(hem, offset)) {
+			nr = offset - hem->start;
+			cpu_base = hem->addr + nr * BA_BYTE_LEN;
+			phy_base = hem->dma_addr + nr * BA_BYTE_LEN;
+			nr = hem->end + 1 - offset;
+			break;
+		}
+	}
+
+	if (mtt_cnt)
+		*mtt_cnt = nr;
+
+	if (phy_addr)
+		*phy_addr = phy_base;
+
+	return cpu_base;
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h
index d9d6689..e865fc8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.h
@@ -133,6 +133,20 @@  int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
 			   struct hns_roce_hem_mhop *mhop);
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type);
 
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
+			    int bt_page_order);
+int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
+				   int region_cnt, int unit);
+int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_hem_list *hem_list,
+			      const struct hns_roce_buf_region *regions,
+			      int region_cnt);
+void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_hem_list *hem_list);
+void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
+				 struct hns_roce_hem_list *hem_list,
+				 int offset, int *mtt_cnt, u64 *phy_addr);
+
 static inline void hns_roce_hem_first(struct hns_roce_hem *hem,
 				      struct hns_roce_hem_iter *iter)
 {
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 38ed4ac..7bdc34f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -1496,3 +1496,124 @@  int hns_roce_dealloc_mw(struct ib_mw *ibmw)
 
 	return 0;
 }
+
+void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
+		       int buf_pg_shift)
+{
+	hns_roce_hem_list_init(&mtr->hem_list, bt_pg_shift);
+	mtr->buf_pg_shift = buf_pg_shift;
+}
+
+void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_mtr *mtr)
+{
+	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+}
+EXPORT_SYMBOL_GPL(hns_roce_mtr_cleanup);
+
+static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_mtr *mtr, dma_addr_t *bufs,
+			      struct hns_roce_buf_region *r)
+{
+	int offset;
+	int count;
+	int npage;
+	u64 *mtts;
+	int end;
+	int i;
+
+	offset = r->offset;
+	end = offset + r->count;
+	npage = 0;
+	while (offset < end) {
+		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
+						  offset, &count, NULL);
+		if (!mtts)
+			return -ENOBUFS;
+
+		/* Save page addr, low 12 bits : 0 */
+		for (i = 0; i < count; i++) {
+			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+				mtts[i] = cpu_to_le64(bufs[npage] >>
+							PAGE_ADDR_SHIFT);
+			else
+				mtts[i] = cpu_to_le64(bufs[npage]);
+
+			npage++;
+		}
+		offset += count;
+	}
+
+	/* Memory barrier */
+	mb();
+
+	return 0;
+}
+
+int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			dma_addr_t **bufs, struct hns_roce_buf_region *regions,
+			int region_cnt)
+{
+	struct hns_roce_buf_region *r;
+	int ret;
+	int i;
+
+	ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, regions,
+					region_cnt);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < region_cnt; i++) {
+		r = &regions[i];
+		ret = hns_roce_write_mtr(hr_dev, mtr, bufs[i], r);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"write mtr[%d/%d] err %d,offset=%d.\n",
+				i, region_cnt, ret,  r->offset);
+			goto err_write;
+		}
+	}
+
+	return 0;
+
+err_write:
+	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+
+	return ret;
+}
+
+int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
+{
+	u64 *mtts = mtt_buf;
+	int mtt_count;
+	int total = 0;
+	u64 *addr;
+	int npage;
+	int left;
+
+	if (mtts == NULL || mtt_max < 1)
+		goto done;
+
+	left = mtt_max;
+	while (left > 0) {
+		mtt_count = 0;
+		addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
+						  offset + total,
+						  &mtt_count, NULL);
+		if (!addr || !mtt_count)
+			goto done;
+
+		npage = min(mtt_count, left);
+		memcpy(&mtts[total], addr, BA_BYTE_LEN * npage);
+		left -= npage;
+		total += npage;
+	}
+
+done:
+	if (base_addr)
+		*base_addr = mtr->hem_list.root_ba;
+
+	return total;
+}
+EXPORT_SYMBOL_GPL(hns_roce_mtr_find);