diff mbox series

[for-rc] IB/rxe: Improve loopback marking

Message ID 20190124144434.24910-1-kamalheib1@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series [for-rc] IB/rxe: Improve loopback marking | expand

Commit Message

Kamal Heib Jan. 24, 2019, 2:44 p.m. UTC
Currently a packet is marked for loopback only if the source and
destination addresses match, This is not enough when multiple gids are
present in rxe device's gid table and the traffic is from one gid to
another, this patch fixes this by marking the packet for loopback if the
destination address found in the rxe's netdevice address list.

Fixes: 8700e3e7c485 ("Soft RoCE driver")
Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_net.c | 45 +++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

Comments

Yuval Shaia Jan. 24, 2019, 3:10 p.m. UTC | #1
On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> Currently a packet is marked for loopback only if the source and
> destination addresses match, This is not enough when multiple gids are

s/match,/equals.

> present in rxe device's gid table and the traffic is from one gid to
> another, this patch fixes this by marking the packet for loopback if the
> destination address found in the rxe's netdevice address list.

s/another, this patch fixes this/ another. Fix it

> 
> Fixes: 8700e3e7c485 ("Soft RoCE driver")
> Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_net.c | 45 +++++++++++++++++++++++++++--
>  1 file changed, 43 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index 8fd03ae20efc..6ee984b7793b 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -368,6 +368,25 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
>  	ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
>  }
>  
> +static inline bool same_rxe4(struct net_device *ndev, struct in_addr *daddr)
> +{
> +	struct in_device *in_dev;
> +	bool same_rxe = false;
> +
> +	in_dev = in_dev_get(ndev);
> +	if (!in_dev)
> +		return false;
> +
> +	for_ifa(in_dev)
> +		if (!memcmp(&ifa->ifa_address, daddr, sizeof(*daddr))) {
> +			same_rxe = true;
> +			break;
> +		}
> +	endfor_ifa(in_dev);
> +	in_dev_put(in_dev);
> +	return same_rxe;
> +}
> +
>  static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
>  		    struct rxe_av *av)
>  {
> @@ -384,7 +403,7 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
>  		return -EHOSTUNREACH;
>  	}
>  
> -	if (!memcmp(saddr, daddr, sizeof(*daddr)))
> +	if (same_rxe4(skb->dev, daddr))
>  		pkt->mask |= RXE_LOOPBACK_MASK;
>  
>  	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
> @@ -397,6 +416,28 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
>  	return 0;
>  }
>  
> +static inline bool same_rxe6(struct net_device *ndev, struct in6_addr *daddr)
> +{
> +	struct inet6_dev *in6_dev;
> +	struct inet6_ifaddr *ifp;
> +	bool same_rxe = false;
> +
> +	in6_dev = in6_dev_get(ndev);
> +	if (!in6_dev)
> +		return false;
> +
> +	read_lock_bh(&in6_dev->lock);

To me at first glance it is unclear why lock is not needed when traveling
the v4 list. Suggesting to comment on that.

> +	list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
> +		if (!memcmp(&ifp->addr, daddr, sizeof(*daddr))) {
> +			same_rxe = true;
> +			break;
> +		}
> +	}
> +	read_unlock_bh(&in6_dev->lock);
> +	in6_dev_put(in6_dev);
> +	return same_rxe;
> +}
> +
>  static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
>  		    struct rxe_av *av)
>  {
> @@ -411,7 +452,7 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
>  		return -EHOSTUNREACH;
>  	}
>  
> -	if (!memcmp(saddr, daddr, sizeof(*daddr)))
> +	if (same_rxe6(skb->dev, daddr))
>  		pkt->mask |= RXE_LOOPBACK_MASK;

Thanks Kamal!

Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>

Will give it a short test today.

>  
>  	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
> -- 
> 2.20.1
>
Yuval Shaia Jan. 24, 2019, 3:41 p.m. UTC | #2
On Thu, Jan 24, 2019 at 05:10:18PM +0200, Yuval Shaia wrote:
> On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> > Currently a packet is marked for loopback only if the source and
> > destination addresses match, This is not enough when multiple gids are
> 
> s/match,/equals.
> 
> > present in rxe device's gid table and the traffic is from one gid to
> > another, this patch fixes this by marking the packet for loopback if the
> > destination address found in the rxe's netdevice address list.
> 
> s/another, this patch fixes this/ another. Fix it
> 
> > 
> > Fixes: 8700e3e7c485 ("Soft RoCE driver")
> > Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> > Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
> > ---
> >  drivers/infiniband/sw/rxe/rxe_net.c | 45 +++++++++++++++++++++++++++--
> >  1 file changed, 43 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> > index 8fd03ae20efc..6ee984b7793b 100644
> > --- a/drivers/infiniband/sw/rxe/rxe_net.c
> > +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> > @@ -368,6 +368,25 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
> >  	ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
> >  }
> >  
> > +static inline bool same_rxe4(struct net_device *ndev, struct in_addr *daddr)
> > +{
> > +	struct in_device *in_dev;
> > +	bool same_rxe = false;
> > +
> > +	in_dev = in_dev_get(ndev);
> > +	if (!in_dev)
> > +		return false;
> > +
> > +	for_ifa(in_dev)
> > +		if (!memcmp(&ifa->ifa_address, daddr, sizeof(*daddr))) {
> > +			same_rxe = true;
> > +			break;
> > +		}
> > +	endfor_ifa(in_dev);
> > +	in_dev_put(in_dev);
> > +	return same_rxe;
> > +}
> > +
> >  static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
> >  		    struct rxe_av *av)
> >  {
> > @@ -384,7 +403,7 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
> >  		return -EHOSTUNREACH;
> >  	}
> >  
> > -	if (!memcmp(saddr, daddr, sizeof(*daddr)))
> > +	if (same_rxe4(skb->dev, daddr))
> >  		pkt->mask |= RXE_LOOPBACK_MASK;
> >  
> >  	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
> > @@ -397,6 +416,28 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
> >  	return 0;
> >  }
> >  
> > +static inline bool same_rxe6(struct net_device *ndev, struct in6_addr *daddr)
> > +{
> > +	struct inet6_dev *in6_dev;
> > +	struct inet6_ifaddr *ifp;
> > +	bool same_rxe = false;
> > +
> > +	in6_dev = in6_dev_get(ndev);
> > +	if (!in6_dev)
> > +		return false;
> > +
> > +	read_lock_bh(&in6_dev->lock);
> 
> To me at first glance it is unclear why lock is not needed when traveling
> the v4 list. Suggesting to comment on that.
> 
> > +	list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
> > +		if (!memcmp(&ifp->addr, daddr, sizeof(*daddr))) {
> > +			same_rxe = true;
> > +			break;
> > +		}
> > +	}
> > +	read_unlock_bh(&in6_dev->lock);
> > +	in6_dev_put(in6_dev);
> > +	return same_rxe;
> > +}
> > +
> >  static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
> >  		    struct rxe_av *av)
> >  {
> > @@ -411,7 +452,7 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
> >  		return -EHOSTUNREACH;
> >  	}
> >  
> > -	if (!memcmp(saddr, daddr, sizeof(*daddr)))
> > +	if (same_rxe6(skb->dev, daddr))
> >  		pkt->mask |= RXE_LOOPBACK_MASK;
> 
> Thanks Kamal!
> 
> Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> 
> Will give it a short test today.

Tested both RC and UD as well as with qemu guest with pvrdma device.

Tested-by: Yuval Shaia <yuval.shaia@oracle.com>

(btw, nice to finally move to 5.0 -:))

> 
> >  
> >  	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
> > -- 
> > 2.20.1
> >
Jason Gunthorpe Jan. 24, 2019, 6:30 p.m. UTC | #3
On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> Currently a packet is marked for loopback only if the source and
> destination addresses match, This is not enough when multiple gids are
> present in rxe device's gid table and the traffic is from one gid to
> another, this patch fixes this by marking the packet for loopback if the
> destination address found in the rxe's netdevice address list.

Shouldn't loop back detection be based on the DMAC/SMAC being equal
and nothing else?

Jason
Yuval Shaia Jan. 24, 2019, 6:50 p.m. UTC | #4
On Thu, Jan 24, 2019 at 11:30:48AM -0700, Jason Gunthorpe wrote:
> On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> > Currently a packet is marked for loopback only if the source and
> > destination addresses match, This is not enough when multiple gids are
> > present in rxe device's gid table and the traffic is from one gid to
> > another, this patch fixes this by marking the packet for loopback if the
> > destination address found in the rxe's netdevice address list.
> 
> Shouldn't loop back detection be based on the DMAC/SMAC being equal
> and nothing else?
> 
> Jason

No because even if DMAC != SMAC (and in our case dgid != sgid) we can still
can end up on the same rxe device.

As the commit message trying to explain, take for example one rxe device
with two gids one send to the second, this should be processed as loopback,
right?

Yuval
Jason Gunthorpe Jan. 24, 2019, 6:59 p.m. UTC | #5
On Thu, Jan 24, 2019 at 08:50:47PM +0200, Yuval Shaia wrote:
> On Thu, Jan 24, 2019 at 11:30:48AM -0700, Jason Gunthorpe wrote:
> > On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> > > Currently a packet is marked for loopback only if the source and
> > > destination addresses match, This is not enough when multiple gids are
> > > present in rxe device's gid table and the traffic is from one gid to
> > > another, this patch fixes this by marking the packet for loopback if the
> > > destination address found in the rxe's netdevice address list.
> > 
> > Shouldn't loop back detection be based on the DMAC/SMAC being equal
> > and nothing else?
> > 
> > Jason
> 
> No because even if DMAC != SMAC (and in our case dgid != sgid) we can still
> can end up on the same rxe device.
> 
> As the commit message trying to explain, take for example one rxe device
> with two gids one send to the second, this should be processed as loopback,
> right?

rxe sits on top of a netdev - so it has *EXACTLY ONE* mac
address. Anything else is a bug.

Jason
Yuval Shaia Jan. 24, 2019, 7:02 p.m. UTC | #6
On Thu, Jan 24, 2019 at 11:59:38AM -0700, Jason Gunthorpe wrote:
> On Thu, Jan 24, 2019 at 08:50:47PM +0200, Yuval Shaia wrote:
> > On Thu, Jan 24, 2019 at 11:30:48AM -0700, Jason Gunthorpe wrote:
> > > On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> > > > Currently a packet is marked for loopback only if the source and
> > > > destination addresses match, This is not enough when multiple gids are
> > > > present in rxe device's gid table and the traffic is from one gid to
> > > > another, this patch fixes this by marking the packet for loopback if the
> > > > destination address found in the rxe's netdevice address list.
> > > 
> > > Shouldn't loop back detection be based on the DMAC/SMAC being equal
> > > and nothing else?
> > > 
> > > Jason
> > 
> > No because even if DMAC != SMAC (and in our case dgid != sgid) we can still
> > can end up on the same rxe device.
> > 
> > As the commit message trying to explain, take for example one rxe device
> > with two gids one send to the second, this should be processed as loopback,
> > right?
> 
> rxe sits on top of a netdev - so it has *EXACTLY ONE* mac
> address. Anything else is a bug.
> 
> Jason

ok, i think see your point.
So you are saying that instead of traveling the routing table we just need
to check if dmac is the same as smac, correct?

Yuval
Jason Gunthorpe Jan. 24, 2019, 7:10 p.m. UTC | #7
On Thu, Jan 24, 2019 at 09:02:56PM +0200, Yuval Shaia wrote:
> On Thu, Jan 24, 2019 at 11:59:38AM -0700, Jason Gunthorpe wrote:
> > On Thu, Jan 24, 2019 at 08:50:47PM +0200, Yuval Shaia wrote:
> > > On Thu, Jan 24, 2019 at 11:30:48AM -0700, Jason Gunthorpe wrote:
> > > > On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> > > > > Currently a packet is marked for loopback only if the source and
> > > > > destination addresses match, This is not enough when multiple gids are
> > > > > present in rxe device's gid table and the traffic is from one gid to
> > > > > another, this patch fixes this by marking the packet for loopback if the
> > > > > destination address found in the rxe's netdevice address list.
> > > > 
> > > > Shouldn't loop back detection be based on the DMAC/SMAC being equal
> > > > and nothing else?
> > > > 
> > > > Jason
> > > 
> > > No because even if DMAC != SMAC (and in our case dgid != sgid) we can still
> > > can end up on the same rxe device.
> > > 
> > > As the commit message trying to explain, take for example one rxe device
> > > with two gids one send to the second, this should be processed as loopback,
> > > right?
> > 
> > rxe sits on top of a netdev - so it has *EXACTLY ONE* mac
> > address. Anything else is a bug.
> > 
> > Jason
> 
> ok, i think see your point.
> So you are saying that instead of traveling the routing table we just need
> to check if dmac is the same as smac, correct?

dmac should be set to the 'exactly one' mac, so yes.

Jsaon
Kamal Heib Jan. 27, 2019, 7:43 a.m. UTC | #8
On 1/24/19 9:10 PM, Jason Gunthorpe wrote:
> On Thu, Jan 24, 2019 at 09:02:56PM +0200, Yuval Shaia wrote:
>> On Thu, Jan 24, 2019 at 11:59:38AM -0700, Jason Gunthorpe wrote:
>>> On Thu, Jan 24, 2019 at 08:50:47PM +0200, Yuval Shaia wrote:
>>>> On Thu, Jan 24, 2019 at 11:30:48AM -0700, Jason Gunthorpe wrote:
>>>>> On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
>>>>>> Currently a packet is marked for loopback only if the source and
>>>>>> destination addresses match, This is not enough when multiple gids are
>>>>>> present in rxe device's gid table and the traffic is from one gid to
>>>>>> another, this patch fixes this by marking the packet for loopback if the
>>>>>> destination address found in the rxe's netdevice address list.
>>>>> Shouldn't loop back detection be based on the DMAC/SMAC being equal
>>>>> and nothing else?
>>>>>
>>>>> Jason
>>>> No because even if DMAC != SMAC (and in our case dgid != sgid) we can still
>>>> can end up on the same rxe device.
>>>>
>>>> As the commit message trying to explain, take for example one rxe device
>>>> with two gids one send to the second, this should be processed as loopback,
>>>> right?
>>> rxe sits on top of a netdev - so it has *EXACTLY ONE* mac
>>> address. Anything else is a bug.
>>>
>>> Jason
>> ok, i think see your point.
>> So you are saying that instead of traveling the routing table we just need
>> to check if dmac is the same as smac, correct?
> dmac should be set to the 'exactly one' mac, so yes.
>
> Jsaon

Thanks Yuval and Jason.
I'll prepare a new fix that will check if smac == dmac.

Thanks,
Kamal
Yuval Shaia Jan. 27, 2019, 7:50 a.m. UTC | #9
On Sun, Jan 27, 2019 at 09:43:45AM +0200, Kamal Heib wrote:
> 
> 
> On 1/24/19 9:10 PM, Jason Gunthorpe wrote:
> > On Thu, Jan 24, 2019 at 09:02:56PM +0200, Yuval Shaia wrote:
> >> On Thu, Jan 24, 2019 at 11:59:38AM -0700, Jason Gunthorpe wrote:
> >>> On Thu, Jan 24, 2019 at 08:50:47PM +0200, Yuval Shaia wrote:
> >>>> On Thu, Jan 24, 2019 at 11:30:48AM -0700, Jason Gunthorpe wrote:
> >>>>> On Thu, Jan 24, 2019 at 04:44:34PM +0200, Kamal Heib wrote:
> >>>>>> Currently a packet is marked for loopback only if the source and
> >>>>>> destination addresses match, This is not enough when multiple gids are
> >>>>>> present in rxe device's gid table and the traffic is from one gid to
> >>>>>> another, this patch fixes this by marking the packet for loopback if the
> >>>>>> destination address found in the rxe's netdevice address list.
> >>>>> Shouldn't loop back detection be based on the DMAC/SMAC being equal
> >>>>> and nothing else?
> >>>>>
> >>>>> Jason
> >>>> No because even if DMAC != SMAC (and in our case dgid != sgid) we can still
> >>>> can end up on the same rxe device.
> >>>>
> >>>> As the commit message trying to explain, take for example one rxe device
> >>>> with two gids one send to the second, this should be processed as loopback,
> >>>> right?
> >>> rxe sits on top of a netdev - so it has *EXACTLY ONE* mac
> >>> address. Anything else is a bug.
> >>>
> >>> Jason
> >> ok, i think see your point.
> >> So you are saying that instead of traveling the routing table we just need
> >> to check if dmac is the same as smac, correct?
> > dmac should be set to the 'exactly one' mac, so yes.
> >
> > Jsaon
> 
> Thanks Yuval and Jason.
> I'll prepare a new fix that will check if smac == dmac.
> 
> Thanks,
> Kamal

Thanks,
I was under the (probably wrong) assumption that dmac is yet unknown at
this stage.

Yuval
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 8fd03ae20efc..6ee984b7793b 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -368,6 +368,25 @@  static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
 	ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
 }
 
+static inline bool same_rxe4(struct net_device *ndev, struct in_addr *daddr)
+{
+	struct in_device *in_dev;
+	bool same_rxe = false;
+
+	in_dev = in_dev_get(ndev);
+	if (!in_dev)
+		return false;
+
+	for_ifa(in_dev)
+		if (!memcmp(&ifa->ifa_address, daddr, sizeof(*daddr))) {
+			same_rxe = true;
+			break;
+		}
+	endfor_ifa(in_dev);
+	in_dev_put(in_dev);
+	return same_rxe;
+}
+
 static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		    struct rxe_av *av)
 {
@@ -384,7 +403,7 @@  static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
+	if (same_rxe4(skb->dev, daddr))
 		pkt->mask |= RXE_LOOPBACK_MASK;
 
 	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
@@ -397,6 +416,28 @@  static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 	return 0;
 }
 
+static inline bool same_rxe6(struct net_device *ndev, struct in6_addr *daddr)
+{
+	struct inet6_dev *in6_dev;
+	struct inet6_ifaddr *ifp;
+	bool same_rxe = false;
+
+	in6_dev = in6_dev_get(ndev);
+	if (!in6_dev)
+		return false;
+
+	read_lock_bh(&in6_dev->lock);
+	list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+		if (!memcmp(&ifp->addr, daddr, sizeof(*daddr))) {
+			same_rxe = true;
+			break;
+		}
+	}
+	read_unlock_bh(&in6_dev->lock);
+	in6_dev_put(in6_dev);
+	return same_rxe;
+}
+
 static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		    struct rxe_av *av)
 {
@@ -411,7 +452,7 @@  static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
+	if (same_rxe6(skb->dev, daddr))
 		pkt->mask |= RXE_LOOPBACK_MASK;
 
 	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),