diff mbox series

[rdma-next,1/1] RDMA/mana_ib: Set correct device into ib

Message ID 1719311307-7920-1-git-send-email-kotaranov@linux.microsoft.com (mailing list archive)
State Superseded
Headers show
Series [rdma-next,1/1] RDMA/mana_ib: Set correct device into ib | expand

Commit Message

Konstantin Taranov June 25, 2024, 10:28 a.m. UTC
From: Konstantin Taranov <kotaranov@microsoft.com>

When mc->ports[0] is not slave, use it in the set_netdev.
When mana is used in netvsc, the stored net devices in mana
are slaves and GIDs should be taken from their master devices.
In the baremetal case, the mc->ports devices will not be slaves.

Fixes: 8b184e4f1c32 ("RDMA/mana_ib: Enable RoCE on port 1")
Signed-off-by: Konstantin Taranov <kotaranov@microsoft.com>
---
 drivers/infiniband/hw/mana/device.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

Comments

Leon Romanovsky June 26, 2024, 5:47 a.m. UTC | #1
On Tue, Jun 25, 2024 at 03:28:27AM -0700, Konstantin Taranov wrote:
> From: Konstantin Taranov <kotaranov@microsoft.com>
> 
> When mc->ports[0] is not slave, use it in the set_netdev.
> When mana is used in netvsc, the stored net devices in mana
> are slaves and GIDs should be taken from their master devices.
> In the baremetal case, the mc->ports devices will not be slaves.

I wonder, why do you have "... | IFF_SLAVE" in __netvsc_vf_setup() in a
first place? Isn't IFF_SLAVE is supposed to be set by bond driver?

> 
> Fixes: 8b184e4f1c32 ("RDMA/mana_ib: Enable RoCE on port 1")
> Signed-off-by: Konstantin Taranov <kotaranov@microsoft.com>
> ---
>  drivers/infiniband/hw/mana/device.c | 19 ++++++++++++-------
>  1 file changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
> index b07a8e2e838f..5395306a86e8 100644
> --- a/drivers/infiniband/hw/mana/device.c
> +++ b/drivers/infiniband/hw/mana/device.c
> @@ -11,6 +11,8 @@ MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver");
>  MODULE_LICENSE("GPL");
>  MODULE_IMPORT_NS(NET_MANA);
>  
> +#define mana_ndev_is_slave(dev)   (((dev)->flags & IFF_SLAVE) == IFF_SLAVE)

There is no need in macro for one line of code and there is no need in "==",
as the result will be boolean.

> +
>  static const struct ib_device_ops mana_ib_dev_ops = {
>  	.owner = THIS_MODULE,
>  	.driver_id = RDMA_DRIVER_MANA,
> @@ -56,7 +58,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
>  {
>  	struct mana_adev *madev = container_of(adev, struct mana_adev, adev);
>  	struct gdma_dev *mdev = madev->mdev;
> -	struct net_device *upper_ndev;
> +	struct net_device *ndev;
>  	struct mana_context *mc;
>  	struct mana_ib_dev *dev;
>  	u8 mac_addr[ETH_ALEN];
> @@ -85,16 +87,19 @@ static int mana_ib_probe(struct auxiliary_device *adev,
>  	dev->ib_dev.dev.parent = mdev->gdma_context->dev;
>  
>  	rcu_read_lock(); /* required to get upper dev */
> -	upper_ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
> -	if (!upper_ndev) {
> +	if (mana_ndev_is_slave(mc->ports[0]))
> +		ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
> +	else
> +		ndev = mc->ports[0];
> +	if (!ndev) {
>  		rcu_read_unlock();
>  		ret = -ENODEV;
> -		ibdev_err(&dev->ib_dev, "Failed to get master netdev");
> +		ibdev_err(&dev->ib_dev, "Failed to get netdev for port 1");
>  		goto free_ib_device;
>  	}
> -	ether_addr_copy(mac_addr, upper_ndev->dev_addr);
> -	addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, upper_ndev->dev_addr);
> -	ret = ib_device_set_netdev(&dev->ib_dev, upper_ndev, 1);
> +	ether_addr_copy(mac_addr, ndev->dev_addr);
> +	addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr);
> +	ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1);
>  	rcu_read_unlock();
>  	if (ret) {
>  		ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret);
> -- 
> 2.43.0
> 
>
Konstantin Taranov June 26, 2024, 9:05 a.m. UTC | #2
> > When mc->ports[0] is not slave, use it in the set_netdev.
> > When mana is used in netvsc, the stored net devices in mana are slaves
> > and GIDs should be taken from their master devices.
> > In the baremetal case, the mc->ports devices will not be slaves.
> 
> I wonder, why do you have "... | IFF_SLAVE" in __netvsc_vf_setup() in a first
> place? Isn't IFF_SLAVE is supposed to be set by bond driver?
> 

I guess it is just a valid use of the IFF_SLAVE bit. In the bond case it is also set
as a BOND netdev. The IFF_SLAVE helps to show users that another master
netdev should be used for networking. But I am not an expert in netvsc.

Actually, another alternative solution for mana_ib is always set the slave device,
but in the GID mgmt code we need the following patch. The problem is that it may require 
testing/confirmation from other ib providers as in the worst case some GIDs will not be listed.

diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index d5131b3ba8ab..0f20b4e2d1c2 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -141,6 +141,8 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de
        return BONDING_SLAVE_STATE_NA;
 }

+#define netdev_is_slave(dev)   (((dev)->flags & IFF_SLAVE) == IFF_SLAVE)
+
 #define REQUIRED_BOND_STATES           (BONDING_SLAVE_STATE_ACTIVE |   \
                                         BONDING_SLAVE_STATE_NA)
 static bool
@@ -157,11 +159,14 @@ is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port,
        real_dev = rdma_vlan_dev_real_dev(cookie);
        if (!real_dev)
                real_dev = cookie;
-
+       /*
+        * When rdma netdevice is used in netvsc, the master netdevice should
+        * be considered for GIDs. Therefore, ignore slave rdma netdevices.
+        */
        res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
               (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
                REQUIRED_BOND_STATES)) ||
-              real_dev == rdma_ndev);
+              (real_dev == rdma_ndev && !netdev_is_slave(real_dev)));

        rcu_read_unlock();
        return res;
@@ -211,12 +216,14 @@ is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port,

        /*
         * When rdma netdevice is used in bonding, bonding master netdevice
-        * should be considered for default GIDs. Therefore, ignore slave rdma
-        * netdevices when bonding is considered.
+        * should be considered for default GIDs.
+        * When rdma netdevice is used in netvsc, the master netdevice should
+        * be considered for defauld GIDs. Therefore, ignore slave rdma
+        * netdevices.
         * Additionally when event(cookie) netdevice is bond master device,
         * make sure that it the upper netdevice of rdma netdevice.
         */
-       res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) ||
+       res = ((cookie_ndev == rdma_ndev && !netdev_is_slave(rdma_ndev)) ||
               (netif_is_bond_master(cookie_ndev) &&
                rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)));

> > +#define mana_ndev_is_slave(dev)   (((dev)->flags & IFF_SLAVE) ==
> IFF_SLAVE)
> 
> There is no need in macro for one line of code and there is no need in "==",
> as the result will be boolean.
> 

Sure, can address in v2. I just saw a similar macro in another kernel file.
Leon Romanovsky June 26, 2024, 12:11 p.m. UTC | #3
On Wed, Jun 26, 2024 at 09:05:05AM +0000, Konstantin Taranov wrote:
> > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > When mana is used in netvsc, the stored net devices in mana are slaves
> > > and GIDs should be taken from their master devices.
> > > In the baremetal case, the mc->ports devices will not be slaves.
> > 
> > I wonder, why do you have "... | IFF_SLAVE" in __netvsc_vf_setup() in a first
> > place? Isn't IFF_SLAVE is supposed to be set by bond driver?
> > 
> 
> I guess it is just a valid use of the IFF_SLAVE bit. In the bond case it is also set
> as a BOND netdev. The IFF_SLAVE helps to show users that another master
> netdev should be used for networking. But I am not an expert in netvsc.

The thing is that netvsc is virtual device like many others, but it is
the only one who uses IFF_SLAVE bit. The comment around that bit says
"slave of a load balancer.", which is not the case according to the
Hyper-V documentation.
https://learn.microsoft.com/en-us/windows-hardware/drivers/network/overview-of-hyper-v

You will need to get Ack from netdev maintainers to rely on IFF_SLAVE
bit in the way you are relying on it now.

> 
> Actually, another alternative solution for mana_ib is always set the slave device,
> but in the GID mgmt code we need the following patch. The problem is that it may require 
> testing/confirmation from other ib providers as in the worst case some GIDs will not be listed.

is_eth_active_slave_of_bonding_rcu() is for bonding.

> 
> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
> index d5131b3ba8ab..0f20b4e2d1c2 100644
> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> @@ -141,6 +141,8 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de
>         return BONDING_SLAVE_STATE_NA;
>  }
> 
> +#define netdev_is_slave(dev)   (((dev)->flags & IFF_SLAVE) == IFF_SLAVE)
> +
>  #define REQUIRED_BOND_STATES           (BONDING_SLAVE_STATE_ACTIVE |   \
>                                          BONDING_SLAVE_STATE_NA)
>  static bool
> @@ -157,11 +159,14 @@ is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port,
>         real_dev = rdma_vlan_dev_real_dev(cookie);
>         if (!real_dev)
>                 real_dev = cookie;
> -
> +       /*
> +        * When rdma netdevice is used in netvsc, the master netdevice should
> +        * be considered for GIDs. Therefore, ignore slave rdma netdevices.
> +        */
>         res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
>                (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
>                 REQUIRED_BOND_STATES)) ||
> -              real_dev == rdma_ndev);
> +              (real_dev == rdma_ndev && !netdev_is_slave(real_dev)));
> 
>         rcu_read_unlock();
>         return res;
> @@ -211,12 +216,14 @@ is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port,
> 
>         /*
>          * When rdma netdevice is used in bonding, bonding master netdevice
> -        * should be considered for default GIDs. Therefore, ignore slave rdma
> -        * netdevices when bonding is considered.
> +        * should be considered for default GIDs.
> +        * When rdma netdevice is used in netvsc, the master netdevice should
> +        * be considered for defauld GIDs. Therefore, ignore slave rdma
> +        * netdevices.
>          * Additionally when event(cookie) netdevice is bond master device,
>          * make sure that it the upper netdevice of rdma netdevice.
>          */
> -       res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) ||
> +       res = ((cookie_ndev == rdma_ndev && !netdev_is_slave(rdma_ndev)) ||
>                (netif_is_bond_master(cookie_ndev) &&
>                 rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)));
> 
> > > +#define mana_ndev_is_slave(dev)   (((dev)->flags & IFF_SLAVE) ==
> > IFF_SLAVE)
> > 
> > There is no need in macro for one line of code and there is no need in "==",
> > as the result will be boolean.
> > 
> 
> Sure, can address in v2. I just saw a similar macro in another kernel file.

I grepped too and this is why it caused me to wonder why it is not used
except small number of places.

Thanks

> 
>
Stephen Hemminger June 26, 2024, 3:27 p.m. UTC | #4
On Wed, 26 Jun 2024 15:11:18 +0300
Leon Romanovsky <leon@kernel.org> wrote:

> On Wed, Jun 26, 2024 at 09:05:05AM +0000, Konstantin Taranov wrote:
> > > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > > When mana is used in netvsc, the stored net devices in mana are slaves
> > > > and GIDs should be taken from their master devices.
> > > > In the baremetal case, the mc->ports devices will not be slaves.  
> > > 
> > > I wonder, why do you have "... | IFF_SLAVE" in __netvsc_vf_setup() in a first
> > > place? Isn't IFF_SLAVE is supposed to be set by bond driver?
> > >   
> > 
> > I guess it is just a valid use of the IFF_SLAVE bit. In the bond case it is also set
> > as a BOND netdev. The IFF_SLAVE helps to show users that another master
> > netdev should be used for networking. But I am not an expert in netvsc.  
> 
> The thing is that netvsc is virtual device like many others, but it is
> the only one who uses IFF_SLAVE bit. The comment around that bit says
> "slave of a load balancer.", which is not the case according to the
> Hyper-V documentation.
> https://learn.microsoft.com/en-us/windows-hardware/drivers/network/overview-of-hyper-v
> 
> You will need to get Ack from netdev maintainers to rely on IFF_SLAVE
> bit in the way you are relying on it now.

This is used to tell userspace tools to not interact directly with the device.
For example, it is used when VF is connected to netvsc device.
It prevents things like IPv6 local address, and Network Manager won't modify device.
Leon Romanovsky June 26, 2024, 3:33 p.m. UTC | #5
On Wed, Jun 26, 2024 at 08:27:31AM -0700, Stephen Hemminger wrote:
> On Wed, 26 Jun 2024 15:11:18 +0300
> Leon Romanovsky <leon@kernel.org> wrote:
> 
> > On Wed, Jun 26, 2024 at 09:05:05AM +0000, Konstantin Taranov wrote:
> > > > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > > > When mana is used in netvsc, the stored net devices in mana are slaves
> > > > > and GIDs should be taken from their master devices.
> > > > > In the baremetal case, the mc->ports devices will not be slaves.  
> > > > 
> > > > I wonder, why do you have "... | IFF_SLAVE" in __netvsc_vf_setup() in a first
> > > > place? Isn't IFF_SLAVE is supposed to be set by bond driver?
> > > >   
> > > 
> > > I guess it is just a valid use of the IFF_SLAVE bit. In the bond case it is also set
> > > as a BOND netdev. The IFF_SLAVE helps to show users that another master
> > > netdev should be used for networking. But I am not an expert in netvsc.  
> > 
> > The thing is that netvsc is virtual device like many others, but it is
> > the only one who uses IFF_SLAVE bit. The comment around that bit says
> > "slave of a load balancer.", which is not the case according to the
> > Hyper-V documentation.
> > https://learn.microsoft.com/en-us/windows-hardware/drivers/network/overview-of-hyper-v
> > 
> > You will need to get Ack from netdev maintainers to rely on IFF_SLAVE
> > bit in the way you are relying on it now.
> 
> This is used to tell userspace tools to not interact directly with the device.
> For example, it is used when VF is connected to netvsc device.
> It prevents things like IPv6 local address, and Network Manager won't modify device.

You described how hyper-v uses it, but I'm interested to get acknowledgment
that it is a valid use case for IFF_SLAVE, despite sentence written in the comment.

Thanks
Stephen Hemminger June 26, 2024, 4:10 p.m. UTC | #6
On Wed, 26 Jun 2024 18:33:54 +0300
Leon Romanovsky <leon@kernel.org> wrote:

> On Wed, Jun 26, 2024 at 08:27:31AM -0700, Stephen Hemminger wrote:
> > On Wed, 26 Jun 2024 15:11:18 +0300
> > Leon Romanovsky <leon@kernel.org> wrote:
> >   
> > > On Wed, Jun 26, 2024 at 09:05:05AM +0000, Konstantin Taranov wrote:  
> > > > > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > > > > When mana is used in netvsc, the stored net devices in mana are slaves
> > > > > > and GIDs should be taken from their master devices.
> > > > > > In the baremetal case, the mc->ports devices will not be slaves.    
> > > > > 
> > > > > I wonder, why do you have "... | IFF_SLAVE" in __netvsc_vf_setup() in a first
> > > > > place? Isn't IFF_SLAVE is supposed to be set by bond driver?
> > > > >     
> > > > 
> > > > I guess it is just a valid use of the IFF_SLAVE bit. In the bond case it is also set
> > > > as a BOND netdev. The IFF_SLAVE helps to show users that another master
> > > > netdev should be used for networking. But I am not an expert in netvsc.    
> > > 
> > > The thing is that netvsc is virtual device like many others, but it is
> > > the only one who uses IFF_SLAVE bit. The comment around that bit says
> > > "slave of a load balancer.", which is not the case according to the
> > > Hyper-V documentation.
> > > https://learn.microsoft.com/en-us/windows-hardware/drivers/network/overview-of-hyper-v
> > > 
> > > You will need to get Ack from netdev maintainers to rely on IFF_SLAVE
> > > bit in the way you are relying on it now.  
> > 
> > This is used to tell userspace tools to not interact directly with the device.
> > For example, it is used when VF is connected to netvsc device.
> > It prevents things like IPv6 local address, and Network Manager won't modify device.  
> 
> You described how hyper-v uses it, but I'm interested to get acknowledgment
> that it is a valid use case for IFF_SLAVE, despite sentence written in the comment.

There is no documented semantics around any of the IF flags, only historical precedent used by
bond, team and bridge drivers. Initially Hyper-V VF used bonding but it was impossibly
difficult to make this work across all versions of Linux, so transparent VF support
was added instead. Ideally, the VF device could be hidden from userspace but that
required more kernel modifications than would be accepted.
Konstantin Taranov June 26, 2024, 6:19 p.m. UTC | #7
> > > > On Wed, Jun 26, 2024 at 09:05:05AM +0000, Konstantin Taranov wrote:
> > > > > > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > > > > > When mana is used in netvsc, the stored net devices in mana
> > > > > > > are slaves and GIDs should be taken from their master devices.
> > > > > > > In the baremetal case, the mc->ports devices will not be slaves.
> > > > > >
> > > > > > I wonder, why do you have "... | IFF_SLAVE" in
> > > > > > __netvsc_vf_setup() in a first place? Isn't IFF_SLAVE is supposed to
> be set by bond driver?
> > > > > >
> > > > >
> > > > > I guess it is just a valid use of the IFF_SLAVE bit. In the bond
> > > > > case it is also set as a BOND netdev. The IFF_SLAVE helps to show
> users that another master
> > > > > netdev should be used for networking. But I am not an expert in
> netvsc.
> > > >
> > > > The thing is that netvsc is virtual device like many others, but
> > > > it is the only one who uses IFF_SLAVE bit. The comment around that
> > > > bit says "slave of a load balancer.", which is not the case
> > > > according to the Hyper-V documentation.
> > > >
> > > > You will need to get Ack from netdev maintainers to rely on
> > > > IFF_SLAVE bit in the way you are relying on it now.
> > >
> > > This is used to tell userspace tools to not interact directly with the device.
> > > For example, it is used when VF is connected to netvsc device.
> > > It prevents things like IPv6 local address, and Network Manager won't
> modify device.
> >
> > You described how hyper-v uses it, but I'm interested to get
> > acknowledgment that it is a valid use case for IFF_SLAVE, despite sentence
> written in the comment.
> 
> There is no documented semantics around any of the IF flags, only historical
> precedent used by bond, team and bridge drivers. Initially Hyper-V VF used
> bonding but it was impossibly difficult to make this work across all versions of
> Linux, so transparent VF support was added instead. Ideally, the VF device
> could be hidden from userspace but that required more kernel modifications
> than would be accepted.

Thanks Stephen for the explanation!

I am also CCing Haiyang, who maintains Hyper-V netvsc.
Haiyang Zhang June 26, 2024, 6:29 p.m. UTC | #8
> -----Original Message-----
> From: Konstantin Taranov <kotaranov@microsoft.com>
> Sent: Wednesday, June 26, 2024 2:20 PM
> To: Stephen Hemminger <stephen@networkplumber.org>; Leon Romanovsky
> <leon@kernel.org>; Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Konstantin Taranov <kotaranov@linux.microsoft.com>; Wei Hu
> <weh@microsoft.com>; sharmaajay@microsoft.com; Long Li
> <longli@microsoft.com>; jgg@ziepe.ca; linux-rdma@vger.kernel.org; linux-
> netdev <netdev@vger.kernel.org>
> Subject: Re: [PATCH rdma-next 1/1] RDMA/mana_ib: Set correct device into
> ib
> 
> > > > > On Wed, Jun 26, 2024 at 09:05:05AM +0000, Konstantin Taranov
> wrote:
> > > > > > > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > > > > > > When mana is used in netvsc, the stored net devices in mana
> > > > > > > > are slaves and GIDs should be taken from their master
> devices.
> > > > > > > > In the baremetal case, the mc->ports devices will not be
> slaves.
> > > > > > >
> > > > > > > I wonder, why do you have "... | IFF_SLAVE" in
> > > > > > > __netvsc_vf_setup() in a first place? Isn't IFF_SLAVE is
> supposed to
> > be set by bond driver?
> > > > > > >
> > > > > >
> > > > > > I guess it is just a valid use of the IFF_SLAVE bit. In the
> bond
> > > > > > case it is also set as a BOND netdev. The IFF_SLAVE helps to
> show
> > users that another master
> > > > > > netdev should be used for networking. But I am not an expert in
> > netvsc.
> > > > >
> > > > > The thing is that netvsc is virtual device like many others, but
> > > > > it is the only one who uses IFF_SLAVE bit. The comment around
> that
> > > > > bit says "slave of a load balancer.", which is not the case
> > > > > according to the Hyper-V documentation.
> > > > >
> > > > > You will need to get Ack from netdev maintainers to rely on
> > > > > IFF_SLAVE bit in the way you are relying on it now.
> > > >
> > > > This is used to tell userspace tools to not interact directly with
> the device.
> > > > For example, it is used when VF is connected to netvsc device.
> > > > It prevents things like IPv6 local address, and Network Manager
> won't
> > modify device.
> > >
> > > You described how hyper-v uses it, but I'm interested to get
> > > acknowledgment that it is a valid use case for IFF_SLAVE, despite
> sentence
> > written in the comment.
> >
> > There is no documented semantics around any of the IF flags, only
> historical
> > precedent used by bond, team and bridge drivers. Initially Hyper-V VF
> used
> > bonding but it was impossibly difficult to make this work across all
> versions of
> > Linux, so transparent VF support was added instead. Ideally, the VF
> device
> > could be hidden from userspace but that required more kernel
> modifications
> > than would be accepted.
> 
> Thanks Stephen for the explanation!
> 
> I am also CCing Haiyang, who maintains Hyper-V netvsc.
> 

Yes, netvsc sets the IFF_SLAVE on VF for the bonding.

Acked-by: Haiyang Zhang <haiyangz@microsoft.com>
Leon Romanovsky June 27, 2024, 9:57 a.m. UTC | #9
On Wed, Jun 26, 2024 at 06:29:02PM +0000, Haiyang Zhang wrote:
> 
> 
> > -----Original Message-----
> > From: Konstantin Taranov <kotaranov@microsoft.com>
> > Sent: Wednesday, June 26, 2024 2:20 PM
> > To: Stephen Hemminger <stephen@networkplumber.org>; Leon Romanovsky
> > <leon@kernel.org>; Haiyang Zhang <haiyangz@microsoft.com>
> > Cc: Konstantin Taranov <kotaranov@linux.microsoft.com>; Wei Hu
> > <weh@microsoft.com>; sharmaajay@microsoft.com; Long Li
> > <longli@microsoft.com>; jgg@ziepe.ca; linux-rdma@vger.kernel.org; linux-
> > netdev <netdev@vger.kernel.org>
> > Subject: Re: [PATCH rdma-next 1/1] RDMA/mana_ib: Set correct device into
> > ib
> > 
> > > > > > On Wed, Jun 26, 2024 at 09:05:05AM +0000, Konstantin Taranov
> > wrote:
> > > > > > > > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > > > > > > > When mana is used in netvsc, the stored net devices in mana
> > > > > > > > > are slaves and GIDs should be taken from their master
> > devices.
> > > > > > > > > In the baremetal case, the mc->ports devices will not be
> > slaves.
> > > > > > > >
> > > > > > > > I wonder, why do you have "... | IFF_SLAVE" in
> > > > > > > > __netvsc_vf_setup() in a first place? Isn't IFF_SLAVE is
> > supposed to
> > > be set by bond driver?
> > > > > > > >
> > > > > > >
> > > > > > > I guess it is just a valid use of the IFF_SLAVE bit. In the
> > bond
> > > > > > > case it is also set as a BOND netdev. The IFF_SLAVE helps to
> > show
> > > users that another master
> > > > > > > netdev should be used for networking. But I am not an expert in
> > > netvsc.
> > > > > >
> > > > > > The thing is that netvsc is virtual device like many others, but
> > > > > > it is the only one who uses IFF_SLAVE bit. The comment around
> > that
> > > > > > bit says "slave of a load balancer.", which is not the case
> > > > > > according to the Hyper-V documentation.
> > > > > >
> > > > > > You will need to get Ack from netdev maintainers to rely on
> > > > > > IFF_SLAVE bit in the way you are relying on it now.
> > > > >
> > > > > This is used to tell userspace tools to not interact directly with
> > the device.
> > > > > For example, it is used when VF is connected to netvsc device.
> > > > > It prevents things like IPv6 local address, and Network Manager
> > won't
> > > modify device.
> > > >
> > > > You described how hyper-v uses it, but I'm interested to get
> > > > acknowledgment that it is a valid use case for IFF_SLAVE, despite
> > sentence
> > > written in the comment.
> > >
> > > There is no documented semantics around any of the IF flags, only
> > historical
> > > precedent used by bond, team and bridge drivers. Initially Hyper-V VF
> > used
> > > bonding but it was impossibly difficult to make this work across all
> > versions of
> > > Linux, so transparent VF support was added instead. Ideally, the VF
> > device
> > > could be hidden from userspace but that required more kernel
> > modifications
> > > than would be accepted.
> > 
> > Thanks Stephen for the explanation!
> > 
> > I am also CCing Haiyang, who maintains Hyper-V netvsc.
> > 
> 
> Yes, netvsc sets the IFF_SLAVE on VF for the bonding.
> 
> Acked-by: Haiyang Zhang <haiyangz@microsoft.com>

Konstantin.

I don't want to be first and only one driver that uses this flag
outside of netdev. So please add new function to netdev part of mana
driver to return right ndev.

Something like that:
struct net_device *mana__get_netdev(struct mana_context *mc)
{
	struct net_device *ndev;

	if (mana_ndev_is_slave(mc->ports[0]))
		ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
	else
		ndev = mc->ports[0];

	return ndev;
}

And get Acks from netdev maintainers (Jakub, David, Eric, Paolo).
Konstantin Taranov June 27, 2024, 10:44 a.m. UTC | #10
> > > > > > > > > > When mc->ports[0] is not slave, use it in the set_netdev.
> > > > > > > > > > When mana is used in netvsc, the stored net devices in
> > > > > > > > > > mana are slaves and GIDs should be taken from their
> > > > > > > > > > master
> > > devices.
> > > > > > > > > > In the baremetal case, the mc->ports devices will not
> > > > > > > > > > be
> > > slaves.
> > > > > > > > >
> > > > > > > > > I wonder, why do you have "... | IFF_SLAVE" in
> > > > > > > > > __netvsc_vf_setup() in a first place? Isn't IFF_SLAVE is
> > > supposed to
> > > > be set by bond driver?
> > > > > > > > >
> > > > > > > >
> > > > > > > > I guess it is just a valid use of the IFF_SLAVE bit. In
> > > > > > > > the
> > > bond
> > > > > > > > case it is also set as a BOND netdev. The IFF_SLAVE helps
> > > > > > > > to
> > > show
> > > > users that another master
> > > > > > > > netdev should be used for networking. But I am not an
> > > > > > > > expert in
> > > > netvsc.
> > > > > > >
> > > > > > > The thing is that netvsc is virtual device like many others,
> > > > > > > but it is the only one who uses IFF_SLAVE bit. The comment
> > > > > > > around
> > > that
> > > > > > > bit says "slave of a load balancer.", which is not the case
> > > > > > > according to the Hyper-V documentation.
> > > > > > >
> > > > > > > You will need to get Ack from netdev maintainers to rely on
> > > > > > > IFF_SLAVE bit in the way you are relying on it now.
> > > > > >
> > > > > > This is used to tell userspace tools to not interact directly
> > > > > > with
> > > the device.
> > > > > > For example, it is used when VF is connected to netvsc device.
> > > > > > It prevents things like IPv6 local address, and Network
> > > > > > Manager
> > > won't
> > > > modify device.
> > > > >
> > > > > You described how hyper-v uses it, but I'm interested to get
> > > > > acknowledgment that it is a valid use case for IFF_SLAVE,
> > > > > despite
> > > sentence
> > > > written in the comment.
> > > >
> > > > There is no documented semantics around any of the IF flags, only
> > > historical
> > > > precedent used by bond, team and bridge drivers. Initially Hyper-V
> > > > VF
> > > used
> > > > bonding but it was impossibly difficult to make this work across
> > > > all
> > > versions of
> > > > Linux, so transparent VF support was added instead. Ideally, the
> > > > VF
> > > device
> > > > could be hidden from userspace but that required more kernel
> > > modifications
> > > > than would be accepted.
> > >
> > > Thanks Stephen for the explanation!
> > >
> > > I am also CCing Haiyang, who maintains Hyper-V netvsc.
> > >
> >
> > Yes, netvsc sets the IFF_SLAVE on VF for the bonding.
> >
> > Acked-by: Haiyang Zhang <haiyangz@microsoft.com>
> 
> Konstantin.
> 
> I don't want to be first and only one driver that uses this flag outside of
> netdev. So please add new function to netdev part of mana driver to return
> right ndev.
> 
> Something like that:
> struct net_device *mana__get_netdev(struct mana_context *mc) {
> 	struct net_device *ndev;
> 
> 	if (mana_ndev_is_slave(mc->ports[0]))
> 		ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
> 	else
> 		ndev = mc->ports[0];
> 
> 	return ndev;
> }
> 
> And get Acks from netdev maintainers (Jakub, David, Eric, Paolo).

Ok. Makes sense.
I will just call it more exact:
mana_get_not_slave_netdev_rcu()
Stephen Hemminger June 27, 2024, 3:27 p.m. UTC | #11
On Thu, 27 Jun 2024 10:44:02 +0000
Konstantin Taranov <kotaranov@microsoft.com> wrote:

> > 
> > I don't want to be first and only one driver that uses this flag outside of
> > netdev. So please add new function to netdev part of mana driver to return
> > right ndev.
> > 
> > Something like that:
> > struct net_device *mana__get_netdev(struct mana_context *mc) {
> > 	struct net_device *ndev;
> > 
> > 	if (mana_ndev_is_slave(mc->ports[0]))
> > 		ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
> > 	else
> > 		ndev = mc->ports[0];
> > 
> > 	return ndev;
> > }
> > 
> > And get Acks from netdev maintainers (Jakub, David, Eric, Paolo).  
> 
> Ok. Makes sense.
> I will just call it more exact:
> mana_get_not_slave_netdev_rcu()

Please don't introduce more usages of the term "slave".
Better to stick to VF.
Long Li Nov. 21, 2024, 12:03 a.m. UTC | #12
> >
> > Actually, another alternative solution for mana_ib is always set the
> > slave device, but in the GID mgmt code we need the following patch.
> > The problem is that it may require testing/confirmation from other ib providers
> as in the worst case some GIDs will not be listed.
> 
> is_eth_active_slave_of_bonding_rcu() is for bonding.

Sorry, need to bring this issue up again.

This patch has broken user-space programs (e.g DPDK) that requires to export a kernel device to user-mode.

With this patch, the RDMA driver grabbed a reference from the master device, it's impossible to move the master device to user-mode.

I think the root cause is that the individual driver should not decide on which (master or slave) address should be used for GID. roce_gid_mgmt.c should handle this situation.

I think Konstantin's suggestion makes sense, how about we do this (don't need to define netdev_is_slave(dev)):

--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -161,7 +161,7 @@ is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port,
        res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
               (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
                REQUIRED_BOND_STATES)) ||
-              real_dev == rdma_ndev);
+              (real_dev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)));

        rcu_read_unlock();
        return res;


is_eth_port_of_netdev_filter() should not return true if this netdev is a bonded slave. In this case, only use the address of its bonded master.

Thanks,

Long
Parav Pandit Nov. 25, 2024, 3:56 p.m. UTC | #13
> From: Long Li <longli@microsoft.com>
> Sent: Thursday, November 21, 2024 5:34 AM
> 
> > >
> > > Actually, another alternative solution for mana_ib is always set the
> > > slave device, but in the GID mgmt code we need the following patch.
> > > The problem is that it may require testing/confirmation from other
> > > ib providers
> > as in the worst case some GIDs will not be listed.
> >
> > is_eth_active_slave_of_bonding_rcu() is for bonding.
> 
> Sorry, need to bring this issue up again.
> 
> This patch has broken user-space programs (e.g DPDK) that requires to
> export a kernel device to user-mode.
> 
> With this patch, the RDMA driver grabbed a reference from the master
> device, it's impossible to move the master device to user-mode.
> 
> I think the root cause is that the individual driver should not decide on which
> (master or slave) address should be used for GID. roce_gid_mgmt.c should
> handle this situation.
> 
> I think Konstantin's suggestion makes sense, how about we do this (don't
> need to define netdev_is_slave(dev)):
> 
> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> @@ -161,7 +161,7 @@ is_eth_port_of_netdev_filter(struct ib_device
> *ib_dev, u32 port,
>         res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
>                (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
>                 REQUIRED_BOND_STATES)) ||
> -              real_dev == rdma_ndev);
> +              (real_dev == rdma_ndev &&
> + !netif_is_bond_slave(rdma_ndev)));
> 
>         rcu_read_unlock();
>         return res;
> 
> 
> is_eth_port_of_netdev_filter() should not return true if this netdev is a
> bonded slave. In this case, only use the address of its bonded master.
> 
Right. This change makes sense to me.
I don't have a setup presently to verify it to ensure I didn't miss a corner case.
Leon,
Can you or others please test the regression once with the formal patch?
Leon Romanovsky Nov. 25, 2024, 8:10 p.m. UTC | #14
On Mon, Nov 25, 2024 at 03:56:01PM +0000, Parav Pandit wrote:
> 
> 
> > From: Long Li <longli@microsoft.com>
> > Sent: Thursday, November 21, 2024 5:34 AM
> > 
> > > >
> > > > Actually, another alternative solution for mana_ib is always set the
> > > > slave device, but in the GID mgmt code we need the following patch.
> > > > The problem is that it may require testing/confirmation from other
> > > > ib providers
> > > as in the worst case some GIDs will not be listed.
> > >
> > > is_eth_active_slave_of_bonding_rcu() is for bonding.
> > 
> > Sorry, need to bring this issue up again.
> > 
> > This patch has broken user-space programs (e.g DPDK) that requires to
> > export a kernel device to user-mode.
> > 
> > With this patch, the RDMA driver grabbed a reference from the master
> > device, it's impossible to move the master device to user-mode.
> > 
> > I think the root cause is that the individual driver should not decide on which
> > (master or slave) address should be used for GID. roce_gid_mgmt.c should
> > handle this situation.
> > 
> > I think Konstantin's suggestion makes sense, how about we do this (don't
> > need to define netdev_is_slave(dev)):
> > 
> > --- a/drivers/infiniband/core/roce_gid_mgmt.c
> > +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> > @@ -161,7 +161,7 @@ is_eth_port_of_netdev_filter(struct ib_device
> > *ib_dev, u32 port,
> >         res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
> >                (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
> >                 REQUIRED_BOND_STATES)) ||
> > -              real_dev == rdma_ndev);
> > +              (real_dev == rdma_ndev &&
> > + !netif_is_bond_slave(rdma_ndev)));
> > 
> >         rcu_read_unlock();
> >         return res;
> > 
> > 
> > is_eth_port_of_netdev_filter() should not return true if this netdev is a
> > bonded slave. In this case, only use the address of its bonded master.
> > 
> Right. This change makes sense to me.
> I don't have a setup presently to verify it to ensure I didn't miss a corner case.
> Leon,
> Can you or others please test the regression once with the formal patch?

Sure, once Long will send the patch, I'll make sure that it is tested.

Thanks

>
Long Li Nov. 27, 2024, 7:46 p.m. UTC | #15
> > > I think Konstantin's suggestion makes sense, how about we do this
> > > (don't need to define netdev_is_slave(dev)):
> > >
> > > --- a/drivers/infiniband/core/roce_gid_mgmt.c
> > > +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> > > @@ -161,7 +161,7 @@ is_eth_port_of_netdev_filter(struct ib_device
> > > *ib_dev, u32 port,
> > >         res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
> > >                (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
> > >                 REQUIRED_BOND_STATES)) ||
> > > -              real_dev == rdma_ndev);
> > > +              (real_dev == rdma_ndev &&
> > > + !netif_is_bond_slave(rdma_ndev)));
> > >
> > >         rcu_read_unlock();
> > >         return res;
> > >
> > >
> > > is_eth_port_of_netdev_filter() should not return true if this netdev
> > > is a bonded slave. In this case, only use the address of its bonded master.
> > >
> > Right. This change makes sense to me.
> > I don't have a setup presently to verify it to ensure I didn't miss a corner case.
> > Leon,
> > Can you or others please test the regression once with the formal patch?
> 
> Sure, once Long will send the patch, I'll make sure that it is tested.
> 
> Thanks
> 

I posted patches for discussion.
https://lore.kernel.org/linux-rdma/1732736619-19941-1-git-send-email-longli@linuxonhyperv.com/T/#t

Thank you,
Long
Leon Romanovsky Nov. 28, 2024, 9:39 a.m. UTC | #16
On Wed, Nov 27, 2024 at 07:46:39PM +0000, Long Li wrote:
> 
> > > > I think Konstantin's suggestion makes sense, how about we do this
> > > > (don't need to define netdev_is_slave(dev)):
> > > >
> > > > --- a/drivers/infiniband/core/roce_gid_mgmt.c
> > > > +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> > > > @@ -161,7 +161,7 @@ is_eth_port_of_netdev_filter(struct ib_device
> > > > *ib_dev, u32 port,
> > > >         res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
> > > >                (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
> > > >                 REQUIRED_BOND_STATES)) ||
> > > > -              real_dev == rdma_ndev);
> > > > +              (real_dev == rdma_ndev &&
> > > > + !netif_is_bond_slave(rdma_ndev)));
> > > >
> > > >         rcu_read_unlock();
> > > >         return res;
> > > >
> > > >
> > > > is_eth_port_of_netdev_filter() should not return true if this netdev
> > > > is a bonded slave. In this case, only use the address of its bonded master.
> > > >
> > > Right. This change makes sense to me.
> > > I don't have a setup presently to verify it to ensure I didn't miss a corner case.
> > > Leon,
> > > Can you or others please test the regression once with the formal patch?
> > 
> > Sure, once Long will send the patch, I'll make sure that it is tested.
> > 
> > Thanks
> > 
> 
> I posted patches for discussion.
> https://lore.kernel.org/linux-rdma/1732736619-19941-1-git-send-email-longli@linuxonhyperv.com/T/#t

Please resend these patches as series with cover letter and don't embed
extra patch (the one which is not numbered) into the series.

Thanks

> 
> Thank you,
> Long
>
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index b07a8e2e838f..5395306a86e8 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -11,6 +11,8 @@  MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver");
 MODULE_LICENSE("GPL");
 MODULE_IMPORT_NS(NET_MANA);
 
+#define mana_ndev_is_slave(dev)   (((dev)->flags & IFF_SLAVE) == IFF_SLAVE)
+
 static const struct ib_device_ops mana_ib_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_MANA,
@@ -56,7 +58,7 @@  static int mana_ib_probe(struct auxiliary_device *adev,
 {
 	struct mana_adev *madev = container_of(adev, struct mana_adev, adev);
 	struct gdma_dev *mdev = madev->mdev;
-	struct net_device *upper_ndev;
+	struct net_device *ndev;
 	struct mana_context *mc;
 	struct mana_ib_dev *dev;
 	u8 mac_addr[ETH_ALEN];
@@ -85,16 +87,19 @@  static int mana_ib_probe(struct auxiliary_device *adev,
 	dev->ib_dev.dev.parent = mdev->gdma_context->dev;
 
 	rcu_read_lock(); /* required to get upper dev */
-	upper_ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
-	if (!upper_ndev) {
+	if (mana_ndev_is_slave(mc->ports[0]))
+		ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
+	else
+		ndev = mc->ports[0];
+	if (!ndev) {
 		rcu_read_unlock();
 		ret = -ENODEV;
-		ibdev_err(&dev->ib_dev, "Failed to get master netdev");
+		ibdev_err(&dev->ib_dev, "Failed to get netdev for port 1");
 		goto free_ib_device;
 	}
-	ether_addr_copy(mac_addr, upper_ndev->dev_addr);
-	addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, upper_ndev->dev_addr);
-	ret = ib_device_set_netdev(&dev->ib_dev, upper_ndev, 1);
+	ether_addr_copy(mac_addr, ndev->dev_addr);
+	addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr);
+	ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1);
 	rcu_read_unlock();
 	if (ret) {
 		ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret);