diff mbox series

[rdma-rc,1/1] RDMA/rxe: Fix the failure of ibv_query_device() and ibv_query_device_ex() tests

Message ID 20250302215444.3742072-1-yanjun.zhu@linux.dev (mailing list archive)
State Accepted
Headers show
Series [rdma-rc,1/1] RDMA/rxe: Fix the failure of ibv_query_device() and ibv_query_device_ex() tests | expand

Commit Message

Zhu Yanjun March 2, 2025, 9:54 p.m. UTC
In rdma-core, the following failures appear.

"
$ ./build/bin/run_tests.py -k device
ssssssss....FF........s
======================================================================
FAIL: test_query_device (tests.test_device.DeviceTest.test_query_device)
Test ibv_query_device()
----------------------------------------------------------------------
Traceback (most recent call last):
   File "/home/ubuntu/rdma-core/tests/test_device.py", line 63, in
   test_query_device
     self.verify_device_attr(attr, dev)
   File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
   verify_device_attr
     assert attr.sys_image_guid != 0
            ^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError

======================================================================
FAIL: test_query_device_ex (tests.test_device.DeviceTest.test_query_device_ex)
Test ibv_query_device_ex()
----------------------------------------------------------------------
Traceback (most recent call last):
   File "/home/ubuntu/rdma-core/tests/test_device.py", line 222, in
   test_query_device_ex
     self.verify_device_attr(attr_ex.orig_attr, dev)
   File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
   verify_device_attr
     assert attr.sys_image_guid != 0
            ^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
"

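The root cause is that the net device is used to generate sys_image_guid
before it has been associated with the rxe device: rxe_init() runs before
rxe_register_device(), so the net device lookup fails, the init helpers
return early, and sys_image_guid is left as zero.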

Fixes: 2ac5415022d1 ("RDMA/rxe: Remove the direct link to net_device")
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
 drivers/infiniband/sw/rxe/rxe.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

Comments

Daisuke Matsuda (Fujitsu) March 3, 2025, 4:21 a.m. UTC | #1
On Mon, March 3, 2025 6:55 AM Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
> 
> In rdma-core, the following failures appear.
> 
> "
> $ ./build/bin/run_tests.py -k device
> ssssssss....FF........s
> ======================================================================
> FAIL: test_query_device (tests.test_device.DeviceTest.test_query_device)
> Test ibv_query_device()
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>    File "/home/ubuntu/rdma-core/tests/test_device.py", line 63, in
>    test_query_device
>      self.verify_device_attr(attr, dev)
>    File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
>    verify_device_attr
>      assert attr.sys_image_guid != 0
>             ^^^^^^^^^^^^^^^^^^^^^^^^
> AssertionError
> 
> ======================================================================
> FAIL: test_query_device_ex (tests.test_device.DeviceTest.test_query_device_ex)
> Test ibv_query_device_ex()
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>    File "/home/ubuntu/rdma-core/tests/test_device.py", line 222, in
>    test_query_device_ex
>      self.verify_device_attr(attr_ex.orig_attr, dev)
>    File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
>    verify_device_attr
>      assert attr.sys_image_guid != 0
>             ^^^^^^^^^^^^^^^^^^^^^^^^
> AssertionError
> "
> 
> The root cause is: before a net device is set with rxe, this net device
> is used to generate a sys_image_guid.

I have tested this patch, and the problem I reported last week is now gone.
The fix looks good. Thanks!

Tested-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
Reviewed-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>

> 
> Fixes: 2ac5415022d1 ("RDMA/rxe: Remove the direct link to net_device")
> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
> ---
>  drivers/infiniband/sw/rxe/rxe.c | 25 ++++++-------------------
>  1 file changed, 6 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 1ba4a0c8726a..e27478fe9456 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -38,10 +38,8 @@ void rxe_dealloc(struct ib_device *ib_dev)
>  }
> 
>  /* initialize rxe device parameters */
> -static void rxe_init_device_param(struct rxe_dev *rxe)
> +static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>  {
> -	struct net_device *ndev;
> -
>  	rxe->max_inline_data			= RXE_MAX_INLINE_DATA;
> 
>  	rxe->attr.vendor_id			= RXE_VENDOR_ID;
> @@ -74,15 +72,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>  	rxe->attr.max_pkeys			= RXE_MAX_PKEYS;
>  	rxe->attr.local_ca_ack_delay		= RXE_LOCAL_CA_ACK_DELAY;
> 
> -	ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
> -	if (!ndev)
> -		return;
> -
>  	addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
>  			ndev->dev_addr);
> 
> -	dev_put(ndev);
> -
>  	rxe->max_ucontext			= RXE_MAX_UCONTEXT;
>  }
> 
> @@ -115,18 +107,13 @@ static void rxe_init_port_param(struct rxe_port *port)
>  /* initialize port state, note IB convention that HCA ports are always
>   * numbered from 1
>   */
> -static void rxe_init_ports(struct rxe_dev *rxe)
> +static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev)
>  {
>  	struct rxe_port *port = &rxe->port;
> -	struct net_device *ndev;
> 
>  	rxe_init_port_param(port);
> -	ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
> -	if (!ndev)
> -		return;
>  	addrconf_addr_eui48((unsigned char *)&port->port_guid,
>  			    ndev->dev_addr);
> -	dev_put(ndev);
>  	spin_lock_init(&port->port_lock);
>  }
> 
> @@ -144,12 +131,12 @@ static void rxe_init_pools(struct rxe_dev *rxe)
>  }
> 
>  /* initialize rxe device state */
> -static void rxe_init(struct rxe_dev *rxe)
> +static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev)
>  {
>  	/* init default device parameters */
> -	rxe_init_device_param(rxe);
> +	rxe_init_device_param(rxe, ndev);
> 
> -	rxe_init_ports(rxe);
> +	rxe_init_ports(rxe, ndev);
>  	rxe_init_pools(rxe);
> 
>  	/* init pending mmap list */
> @@ -184,7 +171,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>  int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
>  			struct net_device *ndev)
>  {
> -	rxe_init(rxe);
> +	rxe_init(rxe, ndev);
>  	rxe_set_mtu(rxe, mtu);
> 
>  	return rxe_register_device(rxe, ibdev_name, ndev);
> --
> 2.34.1
>
Zhu Yanjun March 3, 2025, 7:27 a.m. UTC | #2
在 2025/3/3 5:21, Daisuke Matsuda (Fujitsu) 写道:
> On Mon, March 3, 2025 6:55 AM Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>>
>> In rdma-core, the following failures appear.
>>
>> "
>> $ ./build/bin/run_tests.py -k device
>> ssssssss....FF........s
>> ======================================================================
>> FAIL: test_query_device (tests.test_device.DeviceTest.test_query_device)
>> Test ibv_query_device()
>> ----------------------------------------------------------------------
>> Traceback (most recent call last):
>>     File "/home/ubuntu/rdma-core/tests/test_device.py", line 63, in
>>     test_query_device
>>       self.verify_device_attr(attr, dev)
>>     File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
>>     verify_device_attr
>>       assert attr.sys_image_guid != 0
>>              ^^^^^^^^^^^^^^^^^^^^^^^^
>> AssertionError
>>
>> ======================================================================
>> FAIL: test_query_device_ex (tests.test_device.DeviceTest.test_query_device_ex)
>> Test ibv_query_device_ex()
>> ----------------------------------------------------------------------
>> Traceback (most recent call last):
>>     File "/home/ubuntu/rdma-core/tests/test_device.py", line 222, in
>>     test_query_device_ex
>>       self.verify_device_attr(attr_ex.orig_attr, dev)
>>     File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
>>     verify_device_attr
>>       assert attr.sys_image_guid != 0
>>              ^^^^^^^^^^^^^^^^^^^^^^^^
>> AssertionError
>> "
>>
>> The root cause is: before a net device is set with rxe, this net device
>> is used to generate a sys_image_guid.
> 
> I have tested this patch, and the problem I reported last week is now gone.
> The fix looks good. Thanks!

Thanks a lot.

Zhu Yanjun

> 
> Tested-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> Reviewed-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> 
>>
>> Fixes: 2ac5415022d1 ("RDMA/rxe: Remove the direct link to net_device")
>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>> ---
>>   drivers/infiniband/sw/rxe/rxe.c | 25 ++++++-------------------
>>   1 file changed, 6 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index 1ba4a0c8726a..e27478fe9456 100644
>> --- a/drivers/infiniband/sw/rxe/rxe.c
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -38,10 +38,8 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>   }
>>
>>   /* initialize rxe device parameters */
>> -static void rxe_init_device_param(struct rxe_dev *rxe)
>> +static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
>>   {
>> -	struct net_device *ndev;
>> -
>>   	rxe->max_inline_data			= RXE_MAX_INLINE_DATA;
>>
>>   	rxe->attr.vendor_id			= RXE_VENDOR_ID;
>> @@ -74,15 +72,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>>   	rxe->attr.max_pkeys			= RXE_MAX_PKEYS;
>>   	rxe->attr.local_ca_ack_delay		= RXE_LOCAL_CA_ACK_DELAY;
>>
>> -	ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
>> -	if (!ndev)
>> -		return;
>> -
>>   	addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
>>   			ndev->dev_addr);
>>
>> -	dev_put(ndev);
>> -
>>   	rxe->max_ucontext			= RXE_MAX_UCONTEXT;
>>   }
>>
>> @@ -115,18 +107,13 @@ static void rxe_init_port_param(struct rxe_port *port)
>>   /* initialize port state, note IB convention that HCA ports are always
>>    * numbered from 1
>>    */
>> -static void rxe_init_ports(struct rxe_dev *rxe)
>> +static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev)
>>   {
>>   	struct rxe_port *port = &rxe->port;
>> -	struct net_device *ndev;
>>
>>   	rxe_init_port_param(port);
>> -	ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
>> -	if (!ndev)
>> -		return;
>>   	addrconf_addr_eui48((unsigned char *)&port->port_guid,
>>   			    ndev->dev_addr);
>> -	dev_put(ndev);
>>   	spin_lock_init(&port->port_lock);
>>   }
>>
>> @@ -144,12 +131,12 @@ static void rxe_init_pools(struct rxe_dev *rxe)
>>   }
>>
>>   /* initialize rxe device state */
>> -static void rxe_init(struct rxe_dev *rxe)
>> +static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev)
>>   {
>>   	/* init default device parameters */
>> -	rxe_init_device_param(rxe);
>> +	rxe_init_device_param(rxe, ndev);
>>
>> -	rxe_init_ports(rxe);
>> +	rxe_init_ports(rxe, ndev);
>>   	rxe_init_pools(rxe);
>>
>>   	/* init pending mmap list */
>> @@ -184,7 +171,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>   int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
>>   			struct net_device *ndev)
>>   {
>> -	rxe_init(rxe);
>> +	rxe_init(rxe, ndev);
>>   	rxe_set_mtu(rxe, mtu);
>>
>>   	return rxe_register_device(rxe, ibdev_name, ndev);
>> --
>> 2.34.1
>>
>
Leon Romanovsky March 3, 2025, 7:07 p.m. UTC | #3
On Sun, 02 Mar 2025 22:54:44 +0100, Zhu Yanjun wrote:
> In rdma-core, the following failures appear.
> 
> "
> $ ./build/bin/run_tests.py -k device
> ssssssss....FF........s
> ======================================================================
> FAIL: test_query_device (tests.test_device.DeviceTest.test_query_device)
> Test ibv_query_device()
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>    File "/home/ubuntu/rdma-core/tests/test_device.py", line 63, in
>    test_query_device
>      self.verify_device_attr(attr, dev)
>    File "/home/ubuntu/rdma-core/tests/test_device.py", line 200, in
>    verify_device_attr
>      assert attr.sys_image_guid != 0
>             ^^^^^^^^^^^^^^^^^^^^^^^^
> AssertionError
> 
> [...]

Applied, thanks!

[1/1] RDMA/rxe: Fix the failure of ibv_query_device() and ibv_query_device_ex() tests
      https://git.kernel.org/rdma/rdma/c/8ce2eb9dfac874

Best regards,
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 1ba4a0c8726a..e27478fe9456 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -38,10 +38,8 @@  void rxe_dealloc(struct ib_device *ib_dev)
 }
 
 /* initialize rxe device parameters */
-static void rxe_init_device_param(struct rxe_dev *rxe)
+static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
 {
-	struct net_device *ndev;
-
 	rxe->max_inline_data			= RXE_MAX_INLINE_DATA;
 
 	rxe->attr.vendor_id			= RXE_VENDOR_ID;
@@ -74,15 +72,9 @@  static void rxe_init_device_param(struct rxe_dev *rxe)
 	rxe->attr.max_pkeys			= RXE_MAX_PKEYS;
 	rxe->attr.local_ca_ack_delay		= RXE_LOCAL_CA_ACK_DELAY;
 
-	ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
-	if (!ndev)
-		return;
-
 	addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
 			ndev->dev_addr);
 
-	dev_put(ndev);
-
 	rxe->max_ucontext			= RXE_MAX_UCONTEXT;
 }
 
@@ -115,18 +107,13 @@  static void rxe_init_port_param(struct rxe_port *port)
 /* initialize port state, note IB convention that HCA ports are always
  * numbered from 1
  */
-static void rxe_init_ports(struct rxe_dev *rxe)
+static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev)
 {
 	struct rxe_port *port = &rxe->port;
-	struct net_device *ndev;
 
 	rxe_init_port_param(port);
-	ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
-	if (!ndev)
-		return;
 	addrconf_addr_eui48((unsigned char *)&port->port_guid,
 			    ndev->dev_addr);
-	dev_put(ndev);
 	spin_lock_init(&port->port_lock);
 }
 
@@ -144,12 +131,12 @@  static void rxe_init_pools(struct rxe_dev *rxe)
 }
 
 /* initialize rxe device state */
-static void rxe_init(struct rxe_dev *rxe)
+static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev)
 {
 	/* init default device parameters */
-	rxe_init_device_param(rxe);
+	rxe_init_device_param(rxe, ndev);
 
-	rxe_init_ports(rxe);
+	rxe_init_ports(rxe, ndev);
 	rxe_init_pools(rxe);
 
 	/* init pending mmap list */
@@ -184,7 +171,7 @@  void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
 int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
 			struct net_device *ndev)
 {
-	rxe_init(rxe);
+	rxe_init(rxe, ndev);
 	rxe_set_mtu(rxe, mtu);
 
 	return rxe_register_device(rxe, ibdev_name, ndev);