diff mbox series

[V6,mlx5-next,11/16] RDMA/core: Add LAG functionality

Message ID 20200426071717.17088-12-maorg@mellanox.com (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Headers show
Series Add support to get xmit slave | expand

Commit Message

Maor Gottlieb April 26, 2020, 7:17 a.m. UTC
Add support for getting the RoCE LAG xmit slave by building an skb
of the RoCE packet and calling netdev_get_xmit_slave().
If a driver wants the slave to be selected assuming all slaves are
available, it needs to set RDMA_LAG_FLAGS_HASH_ALL_SLAVES in lag_flags.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/Makefile |   2 +-
 drivers/infiniband/core/lag.c    | 141 +++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h          |   1 +
 include/rdma/lag.h               |  23 +++++
 4 files changed, 166 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/lag.c
 create mode 100644 include/rdma/lag.h

Comments

Jason Gunthorpe April 28, 2020, 11:15 p.m. UTC | #1
On Sun, Apr 26, 2020 at 10:17:12AM +0300, Maor Gottlieb wrote:
> +int rdma_lag_get_ah_roce_slave(struct ib_device *device,
> +			       struct rdma_ah_attr *ah_attr,
> +			       struct net_device **xmit_slave)

Please do not use ** and also return int. The function should return
net_device directly and use ERR_PTR() 

> +{
> +	struct net_device *master;
> +	struct net_device *slave;
> +	int err = 0;
> +
> +	*xmit_slave = NULL;
> +	if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE &&
> +	      ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP))
> +		return 0;
> +
> +	rcu_read_lock();
> +	master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr);
> +	if (IS_ERR(master)) {
> +		err = PTR_ERR(master);
> +		goto unlock;
> +	}
> +	dev_hold(master);

What is the point of this dev_hold? This whole thing is under
rcu_read_lock()

> +
> +	if (!netif_is_bond_master(master))
> +		goto put;
> +
> +	slave = rdma_get_xmit_slave_udp(device, master, ah_attr);

IMHO it is probably better to keep with the dev_hold and drop the RCU
while doing rdma_build_skb so that the allocation in here doesn't have
to be atomic. This isn't performance sensitive so the extra atomic for
the dev_hold is better than the unnecessary GFP_ATOMIC allocation

> +	if (!slave) {
> +		ibdev_warn(device, "Failed to get lag xmit slave\n");
> +		err =  -EINVAL;
> +		goto put;
> +	}
> +
> +	dev_hold(slave);

And I think the dev_hold should be in the rdma_get_xmit_slave_udp() as
things called 'get' really ought to return with references.

Jason
Jason Gunthorpe April 28, 2020, 11:30 p.m. UTC | #2
On Tue, Apr 28, 2020 at 08:15:25PM -0300, Jason Gunthorpe wrote:
> On Sun, Apr 26, 2020 at 10:17:12AM +0300, Maor Gottlieb wrote:
> > +int rdma_lag_get_ah_roce_slave(struct ib_device *device,
> > +			       struct rdma_ah_attr *ah_attr,
> > +			       struct net_device **xmit_slave)
> 
> Please do not use ** and also return int. The function should return
> net_device directly and use ERR_PTR() 
> 
> > +{
> > +	struct net_device *master;
> > +	struct net_device *slave;
> > +	int err = 0;
> > +
> > +	*xmit_slave = NULL;
> > +	if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE &&
> > +	      ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP))
> > +		return 0;
> > +
> > +	rcu_read_lock();
> > +	master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr);
> > +	if (IS_ERR(master)) {
> > +		err = PTR_ERR(master);
> > +		goto unlock;
> > +	}
> > +	dev_hold(master);
> 
> What is the point of this dev_hold? This whole thing is under
> rcu_read_lock()
> 
> > +
> > +	if (!netif_is_bond_master(master))
> > +		goto put;
> > +
> > +	slave = rdma_get_xmit_slave_udp(device, master, ah_attr);
> 
> IMHO it is probably better to keep with the dev_hold and drop the RCU
> while doing rdma_build_skb so that the allocation in here doesn't have
> to be atomic. This isn't performance sensitive so the extra atomic for
> the dev_hold is better than the unnecessary GFP_ATOMIC allocation

Though if you do this be mindful that the create_ah call site is
conditionally non-sleeping, the best thing to do would be to make the
GFP_ATOMIC conditional on !RDMA_CREATE_AH_SLEEPABLE - ie pass in a gfp
flags argument.

Jason
Maor Gottlieb April 29, 2020, 9:01 a.m. UTC | #3
On 4/29/2020 2:30 AM, Jason Gunthorpe wrote:
> On Tue, Apr 28, 2020 at 08:15:25PM -0300, Jason Gunthorpe wrote:
>> On Sun, Apr 26, 2020 at 10:17:12AM +0300, Maor Gottlieb wrote:
>>> +int rdma_lag_get_ah_roce_slave(struct ib_device *device,
>>> +			       struct rdma_ah_attr *ah_attr,
>>> +			       struct net_device **xmit_slave)
>> Please do not use ** and also return int. The function should return
>> net_device directly and use ERR_PTR()

How about returning NULL on failure as well (I will add a debug print)?
That way we do not fail the flow if we did not succeed in getting the
slave, and let the lower driver fail it if it would like to.
>>
>>> +{
>>> +	struct net_device *master;
>>> +	struct net_device *slave;
>>> +	int err = 0;
>>> +
>>> +	*xmit_slave = NULL;
>>> +	if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE &&
>>> +	      ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP))
>>> +		return 0;
>>> +
>>> +	rcu_read_lock();
>>> +	master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr);
>>> +	if (IS_ERR(master)) {
>>> +		err = PTR_ERR(master);
>>> +		goto unlock;
>>> +	}
>>> +	dev_hold(master);
>> What is the point of this dev_hold? This whole thing is under
>> rcu_read_lock()
>>
>>> +
>>> +	if (!netif_is_bond_master(master))
>>> +		goto put;
>>> +
>>> +	slave = rdma_get_xmit_slave_udp(device, master, ah_attr);
>> IMHO it is probably better to keep with the dev_hold and drop the RCU
>> while doing rdma_build_skb so that the allocation in here doesn't have
>> to be atomic. This isn't performance sensitive so the extra atomic for
>> the dev_hold is better than the unnecessary GFP_ATOMIC allocation
> Though if you do this be mindful that the create_ah call site is
> conditionally non-sleeping, the best thing to do would be to make the
> GFP_ATOMIC conditional on !RDMA_CREATE_AH_SLEEPABLE - ie pass in a gfp
> flags argument.
>
> Jason

Will go with your suggestion above.
Jason Gunthorpe April 29, 2020, 3:01 p.m. UTC | #4
On Wed, Apr 29, 2020 at 12:01:07PM +0300, Maor Gottlieb wrote:
> 
> On 4/29/2020 2:30 AM, Jason Gunthorpe wrote:
> > On Tue, Apr 28, 2020 at 08:15:25PM -0300, Jason Gunthorpe wrote:
> > > On Sun, Apr 26, 2020 at 10:17:12AM +0300, Maor Gottlieb wrote:
> > > > +int rdma_lag_get_ah_roce_slave(struct ib_device *device,
> > > > +			       struct rdma_ah_attr *ah_attr,
> > > > +			       struct net_device **xmit_slave)
> > > Please do not use ** and also return int. The function should return
> > > net_device directly and use ERR_PTR()
> 
> How about return NULL in failure as well (will add debug print)? Not fail
> the flow if we didn't succeed to get the slave, let the lower driver to do
> it if it would like to.

A NULL return indicating success but 'not found' is fine.

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d1b14887960e..870f0fcd54d5 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@  ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
 				nldev.o restrack.o counters.o ib_core_uverbs.o \
-				trace.o
+				trace.o lag.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c
new file mode 100644
index 000000000000..85050b6e67dc
--- /dev/null
+++ b/drivers/infiniband/core/lag.c
@@ -0,0 +1,141 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/lag.h>
+
+/*
+ * rdma_build_skb - build a headers-only skb describing a RoCE v2 flow
+ * @device: IB device the address handle belongs to (not used here)
+ * @netdev: net device stored in skb->dev and used to size the headroom
+ * @ah_attr: address handle attributes supplying the source/destination GIDs,
+ *	     the flow label and the destination MAC
+ *
+ * The skb holds Ethernet + IPv4/IPv6 + UDP headers and no payload. IPv4 is
+ * chosen when the destination GID is a v4-mapped IPv6 address. Every header
+ * is zeroed before the known fields are filled in, so fields this function
+ * does not set never carry stale heap contents.
+ *
+ * The allocation uses GFP_ATOMIC, so the function does not sleep.
+ *
+ * Return: the new skb, owned by the caller, or NULL if allocation failed.
+ */
+static struct sk_buff *rdma_build_skb(struct ib_device *device,
+				      struct net_device *netdev,
+				      struct rdma_ah_attr *ah_attr)
+{
+	struct ipv6hdr *ip6h;
+	struct sk_buff *skb;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	struct udphdr *uh;
+	u8 smac[ETH_ALEN];
+	bool is_ipv4;
+	int hdr_len;
+
+	is_ipv4 = ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw);
+	hdr_len = ETH_HLEN + sizeof(struct udphdr) + LL_RESERVED_SPACE(netdev);
+	hdr_len += is_ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr);
+
+	skb = alloc_skb(hdr_len, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	skb->dev = netdev;
+	/* Reserve everything, then push the headers innermost first. */
+	skb_reserve(skb, hdr_len);
+	skb_push(skb, sizeof(struct udphdr));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+	memset(uh, 0, sizeof(*uh));
+	uh->source = htons(0xC000);
+	uh->dest = htons(ROCE_V2_UDP_DPORT);
+	uh->len = htons(sizeof(struct udphdr));
+
+	if (is_ipv4) {
+		skb_push(skb, sizeof(struct iphdr));
+		skb_reset_network_header(skb);
+		iph = ip_hdr(skb);
+		memset(iph, 0, sizeof(*iph));
+		iph->frag_off = 0;
+		iph->version = 4;
+		iph->protocol = IPPROTO_UDP;
+		iph->ihl = 0x5;
+		iph->tot_len = htons(sizeof(struct udphdr) + sizeof(struct
+								    iphdr));
+		/* The IPv4 address is the last 4 bytes of a v4-mapped GID. */
+		memcpy(&iph->saddr, ah_attr->grh.sgid_attr->gid.raw + 12,
+		       sizeof(struct in_addr));
+		memcpy(&iph->daddr, ah_attr->grh.dgid.raw + 12,
+		       sizeof(struct in_addr));
+	} else {
+		skb_push(skb, sizeof(struct ipv6hdr));
+		skb_reset_network_header(skb);
+		ip6h = ipv6_hdr(skb);
+		memset(ip6h, 0, sizeof(*ip6h));
+		/*
+		 * flow_lbl[] is a 3-byte big-endian field whose top nibble
+		 * belongs to the traffic class, so a raw memcpy() from the
+		 * host-endian u32 flow label cannot fill it correctly.
+		 * ip6_flow_hdr() writes version, traffic class (0) and the
+		 * 20-bit flow label in one go.
+		 */
+		ip6_flow_hdr(ip6h, 0,
+			     cpu_to_be32(ah_attr->grh.flow_label) &
+			     IPV6_FLOWLABEL_MASK);
+		ip6h->nexthdr = IPPROTO_UDP;
+		memcpy(&ip6h->saddr, ah_attr->grh.sgid_attr->gid.raw,
+		       sizeof(struct in6_addr));
+		memcpy(&ip6h->daddr, ah_attr->grh.dgid.raw,
+		       sizeof(struct in6_addr));
+	}
+
+	skb_push(skb, sizeof(struct ethhdr));
+	skb_reset_mac_header(skb);
+	eth = eth_hdr(skb);
+	skb->protocol = eth->h_proto = htons(is_ipv4 ? ETH_P_IP : ETH_P_IPV6);
+	rdma_read_gid_l2_fields(ah_attr->grh.sgid_attr, NULL, smac);
+	memcpy(eth->h_source, smac, ETH_ALEN);
+	memcpy(eth->h_dest, ah_attr->roce.dmac, ETH_ALEN);
+
+	return skb;
+}
+
+/*
+ * rdma_get_xmit_slave_udp - ask @master which slave would carry a RoCE flow
+ * @device: IB device; its lag_flags select the "all slaves" mode
+ * @master: master net device to query
+ * @ah_attr: address handle attributes describing the flow
+ *
+ * Builds a temporary skb for the flow, passes it to netdev_get_xmit_slave()
+ * and frees it again. No reference is taken on the returned device here.
+ *
+ * Return: the device reported by netdev_get_xmit_slave(), or NULL if the
+ * skb could not be built.
+ */
+static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
+						  struct net_device *master,
+						  struct rdma_ah_attr *ah_attr)
+{
+	bool all_slaves = device->lag_flags & RDMA_LAG_FLAGS_HASH_ALL_SLAVES;
+	struct net_device *xmit_dev = NULL;
+	struct sk_buff *probe;
+
+	probe = rdma_build_skb(device, master, ah_attr);
+	if (probe) {
+		xmit_dev = netdev_get_xmit_slave(master, probe, all_slaves);
+		kfree_skb(probe);
+	}
+
+	return xmit_dev;
+}
+
+/*
+ * rdma_lag_put_ah_roce_slave - drop a reference on an xmit slave
+ * @xmit_slave: net device to release with dev_put(); NULL is ignored
+ */
+void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave)
+{
+	if (!xmit_slave)
+		return;
+
+	dev_put(xmit_slave);
+}
+
+/*
+ * rdma_lag_get_ah_roce_slave - find the LAG slave a RoCE v2 AH transmits on
+ * @device: IB device the address handle is created on
+ * @ah_attr: address handle attributes
+ * @xmit_slave: output; always written, NULL unless a slave was found
+ *
+ * Only address handles of type RDMA_AH_ATTR_TYPE_ROCE whose source GID is of
+ * type IB_GID_TYPE_ROCE_UDP_ENCAP are considered, and only when the GID's net
+ * device is a bond master. In every other case the function succeeds and
+ * leaves *@xmit_slave NULL.
+ *
+ * On success with a non-NULL *@xmit_slave a reference is held on the slave;
+ * release it with rdma_lag_put_ah_roce_slave().
+ *
+ * Return: 0 on success, the error from rdma_read_gid_attr_ndev_rcu() if the
+ * GID's net device cannot be read, or -EINVAL if the bond master did not
+ * report a slave.
+ */
+int rdma_lag_get_ah_roce_slave(struct ib_device *device,
+			       struct rdma_ah_attr *ah_attr,
+			       struct net_device **xmit_slave)
+{
+	struct net_device *master;
+	struct net_device *slave;
+	int err = 0;
+
+	*xmit_slave = NULL;
+	if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE ||
+	    ah_attr->grh.sgid_attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
+		return 0;
+
+	/*
+	 * master and slave are only dereferenced inside this RCU read-side
+	 * section, so no reference on master is needed. Only the slave that
+	 * outlives the section gets a dev_hold().
+	 */
+	rcu_read_lock();
+	master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr);
+	if (IS_ERR(master)) {
+		err = PTR_ERR(master);
+		goto unlock;
+	}
+
+	/* Not a LAG: succeed without a slave. */
+	if (!netif_is_bond_master(master))
+		goto unlock;
+
+	slave = rdma_get_xmit_slave_udp(device, master, ah_attr);
+	if (!slave) {
+		ibdev_warn(device, "Failed to get lag xmit slave\n");
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	dev_hold(slave);
+	*xmit_slave = slave;
+unlock:
+	rcu_read_unlock();
+	return err;
+}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 20ea26810349..e6c18ec0365a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2714,6 +2714,7 @@  struct ib_device {
 	/* Used by iWarp CM */
 	char iw_ifname[IFNAMSIZ];
 	u32 iw_driver_flags;
+	u32 lag_flags;
 };
 
 struct ib_client_nl_info;
diff --git a/include/rdma/lag.h b/include/rdma/lag.h
new file mode 100644
index 000000000000..8f78e2c0fd7c
--- /dev/null
+++ b/include/rdma/lag.h
@@ -0,0 +1,23 @@ 
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_LAG_H_
+#define _RDMA_LAG_H_
+
+#include <net/lag.h>
+
+struct ib_device;
+struct rdma_ah_attr;
+
+/* Flag bits tested against ib_device.lag_flags by the RDMA LAG code. */
+enum rdma_lag_flags {
+	/*
+	 * When set, the xmit slave is looked up with the "all slaves"
+	 * argument of netdev_get_xmit_slave() enabled, i.e. as if every
+	 * slave were available.
+	 */
+	RDMA_LAG_FLAGS_HASH_ALL_SLAVES = 1 << 0
+};
+
+/* Drop the reference on @xmit_slave; a NULL pointer is ignored. */
+void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave);
+/*
+ * Look up the LAG slave a RoCE v2 address handle would transmit on.
+ * Returns 0 or a negative errno. *@xmit_slave is NULL when no slave applies;
+ * otherwise it holds a reference to be released with
+ * rdma_lag_put_ah_roce_slave().
+ */
+int rdma_lag_get_ah_roce_slave(struct ib_device *device,
+			       struct rdma_ah_attr *ah_attr,
+			       struct net_device **xmit_slave);
+
+#endif /* _RDMA_LAG_H_ */