diff mbox series

[for-next,13/16] IB/{hfi1, ipoib, rdma}: Broadcast ping sent packets which exceeded mtu size

Message ID 20200210131944.87776.64386.stgit@awfm-01.aw.intel.com (mailing list archive)
State Superseded
Headers show
Series New hfi1 feature: Accelerated IP | expand

Commit Message

Dennis Dalessandro Feb. 10, 2020, 1:19 p.m. UTC
From: Gary Leshner <Gary.S.Leshner@intel.com>

When in connected mode ipoib sent broadcast pings which exceeded the mtu
size for broadcast addresses.

Add an mtu attribute to the rdma_netdev structure which ipoib sets to its
mcast mtu size.

The RDMA netdev uses this value to determine if the skb length is too long
for the mtu specified and if it is, drops the packet and logs an error
about the errant packet.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.alessandro@intel.com>
Signed-off-by: Gary Leshner <Gary.S.Leshner@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |    2 ++
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |    1 +
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c      |    3 +++
 3 files changed, 6 insertions(+)

Comments

Jason Gunthorpe Feb. 19, 2020, 12:42 a.m. UTC | #1
On Mon, Feb 10, 2020 at 08:19:44AM -0500, Dennis Dalessandro wrote:
> From: Gary Leshner <Gary.S.Leshner@intel.com>
> 
> When in connected mode ipoib sent broadcast pings which exceeded the mtu
> size for broadcast addresses.
>
> Add an mtu attribute to the rdma_netdev structure which ipoib sets to its
> mcast mtu size.
> 
> The RDMA netdev uses this value to determine if the skb length is too long
> for the mtu specified and if it is, drops the packet and logs an error
> about the errant packet.

I'm confused by this comment, connected mode is not able to use
rdma_netdev, for various technical reason, I thought?

Is this somehow running a rdma_netdev concurrently with connected
mode? How?

Jason
Erez Shitrit Feb. 19, 2020, 1:41 p.m. UTC | #2
On Mon, Feb 10, 2020 at 3:19 PM Dennis Dalessandro
<dennis.dalessandro@intel.com> wrote:
>
> From: Gary Leshner <Gary.S.Leshner@intel.com>
>
> When in connected mode ipoib sent broadcast pings which exceeded the mtu
> size for broadcast addresses.

But this broadcast done via the UD QP and not via the connected mode,
please explain

>
> Add an mtu attribute to the rdma_netdev structure which ipoib sets to its
> mcast mtu size.
>
> The RDMA netdev uses this value to determine if the skb length is too long
> for the mtu specified and if it is, drops the packet and logs an error
> about the errant packet.
>
> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
> Reviewed-by: Dennis Dalessandro <dennis.alessandro@intel.com>
> Signed-off-by: Gary Leshner <Gary.S.Leshner@intel.com>
> Signed-off-by: Kaike Wan <kaike.wan@intel.com>
> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_main.c      |    2 ++
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |    1 +
>  drivers/infiniband/ulp/ipoib/ipoib_vlan.c      |    3 +++
>  3 files changed, 6 insertions(+)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 5c1cf68..ddb896f 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -1906,6 +1906,7 @@ static int ipoib_ndo_init(struct net_device *ndev)
>  {
>         struct ipoib_dev_priv *priv = ipoib_priv(ndev);
>         int rc;
> +       struct rdma_netdev *rn = netdev_priv(ndev);
>
>         if (priv->parent) {
>                 ipoib_child_init(ndev);
> @@ -1918,6 +1919,7 @@ static int ipoib_ndo_init(struct net_device *ndev)
>         /* MTU will be reset when mcast join happens */
>         ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
>         priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
> +       rn->mtu = priv->mcast_mtu;

If this is something specific for your lower driver (opa_vnic etc.)
you don't need to do that here, you can use the rn->clnt_priv member
in order to get the mcast_mtu

>         ndev->max_mtu = IPOIB_CM_MTU;
>
>         ndev->neigh_priv_len = sizeof(struct ipoib_neigh);
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index 7166ee9b..3d5f6b8 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -246,6 +246,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
>                 if (priv->mcast_mtu == priv->admin_mtu)
>                         priv->admin_mtu = IPOIB_UD_MTU(mtu);
>                 priv->mcast_mtu = IPOIB_UD_MTU(mtu);
> +               rn->mtu = priv->mcast_mtu;
>
>                 priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
>                 spin_unlock_irq(&priv->lock);
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
> index 8ac8e18..3086560 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
> @@ -97,6 +97,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
>  {
>         struct net_device *ndev = priv->dev;
>         int result;
> +       struct rdma_netdev *rn = netdev_priv(ndev);
>
>         ASSERT_RTNL();
>
> @@ -117,6 +118,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
>                 goto out_early;
>         }
>
> +       rn->mtu = priv->mcast_mtu;
> +
>         priv->parent = ppriv->dev;
>         priv->pkey = pkey;
>         priv->child_type = type;
>
Dennis Dalessandro Feb. 21, 2020, 7:40 p.m. UTC | #3
On 2/18/2020 7:42 PM, Jason Gunthorpe wrote:
> On Mon, Feb 10, 2020 at 08:19:44AM -0500, Dennis Dalessandro wrote:
>> From: Gary Leshner <Gary.S.Leshner@intel.com>
>>
>> When in connected mode ipoib sent broadcast pings which exceeded the mtu
>> size for broadcast addresses.
>>
>> Add an mtu attribute to the rdma_netdev structure which ipoib sets to its
>> mcast mtu size.
>>
>> The RDMA netdev uses this value to determine if the skb length is too long
>> for the mtu specified and if it is, drops the packet and logs an error
>> about the errant packet.
> 
> I'm confused by this comment, connected mode is not able to use
> rdma_netdev, for various technical reason, I thought?
> 
> Is this somehow running a rdma_netdev concurrently with connected
> mode? How?

No, not concurrently. When ipoib is in connected mode, a broadcast 
request, something like:

ping -s 2017 -i 0.001 -c 10 -M do -I ib0 -b 192.168.0.255

will be sent down from user space to ipoib. At an mcast_mtu of 2048, the 
max payload size is 2016 (2048 - 28 - 4). If AIP is not being used then 
the datagram send function (ipoib_send()) does a check and drops the packet.

However when AIP is enabled ipoib_send is of course not used and we land 
in rn->send function. Which needs to do the same check.

Alternatively, the mcast_mtu check could be added to ipoib so that this 
checking is performed before rn->send() is called. In that case, this 
patch will not be needed and the new rdma_netdev (or clnt_priv) field 
that shadows mcast_mtu would also not be needed.

Also for the datagram mode case, the ping command will correctly check 
the payload size and drop the request in the user
space so that ipoib will not receive the broadcast request at all:

   ping -s 2017 -i 0.001 -c 1 -M do -I ib0 -b 192.168.0.255
   WARNING: pinging broadcast address
   PING 192.168.0.255 (192.168.0.255) from 192.168.0.1 ib0: 2017(2045) 
bytes of data.
   ping: local error: Message too long, mtu=2044

-Denny
Dennis Dalessandro Feb. 21, 2020, 7:40 p.m. UTC | #4
On 2/19/2020 8:41 AM, Erez Shitrit wrote:
> On Mon, Feb 10, 2020 at 3:19 PM Dennis Dalessandro
> <dennis.dalessandro@intel.com> wrote:
>>
>> From: Gary Leshner <Gary.S.Leshner@intel.com>
>>
>> When in connected mode ipoib sent broadcast pings which exceeded the mtu
>> size for broadcast addresses.
> 
> But this broadcast done via the UD QP and not via the connected mode,
> please explain

It still lands in the ipoib_send() function though which does the length 
check for non-AIP. With AIP we still need to do a similar check.

>> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
>> @@ -1906,6 +1906,7 @@ static int ipoib_ndo_init(struct net_device *ndev)
>>   {
>>          struct ipoib_dev_priv *priv = ipoib_priv(ndev);
>>          int rc;
>> +       struct rdma_netdev *rn = netdev_priv(ndev);
>>
>>          if (priv->parent) {
>>                  ipoib_child_init(ndev);
>> @@ -1918,6 +1919,7 @@ static int ipoib_ndo_init(struct net_device *ndev)
>>          /* MTU will be reset when mcast join happens */
>>          ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
>>          priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
>> +       rn->mtu = priv->mcast_mtu;
> 
> If this is something specific for your lower driver (opa_vnic etc.)
> you don't need to do that here, you can use the rn->clnt_priv member
> in order to get the mcast_mtu

That's probably doable, sure. However this seems like something we don't 
want to hide away as a private member. The Mcast MTU is a pretty central 
concept and some other lower driver may conceivably make sue of it. Is 
there a reason we don't want to add to the rn?

The other complication is struct ipoib_dev_priv is not available to 
hfi1, it is a ULP concept and hfi1 should probably not be made aware?

-Denny
Jason Gunthorpe Feb. 21, 2020, 11:32 p.m. UTC | #5
On Fri, Feb 21, 2020 at 02:40:28PM -0500, Dennis Dalessandro wrote:
> On 2/18/2020 7:42 PM, Jason Gunthorpe wrote:
> > On Mon, Feb 10, 2020 at 08:19:44AM -0500, Dennis Dalessandro wrote:
> > > From: Gary Leshner <Gary.S.Leshner@intel.com>
> > > 
> > > When in connected mode ipoib sent broadcast pings which exceeded the mtu
> > > size for broadcast addresses.
> > > 
> > > Add an mtu attribute to the rdma_netdev structure which ipoib sets to its
> > > mcast mtu size.
> > > 
> > > The RDMA netdev uses this value to determine if the skb length is too long
> > > for the mtu specified and if it is, drops the packet and logs an error
> > > about the errant packet.
> > 
> > I'm confused by this comment, connected mode is not able to use
> > rdma_netdev, for various technical reason, I thought?
> > 
> > Is this somehow running a rdma_netdev concurrently with connected
> > mode? How?
> 
> No, not concurrently. When ipoib is in connected mode, a broadcast request,
> something like:
> 
> ping -s 2017 -i 0.001 -c 10 -M do -I ib0 -b 192.168.0.255
> 
> will be sent down from user space to ipoib. At an mcast_mtu of 2048, the max
> payload size is 2016 (2048 - 28 - 4). If AIP is not being used then the
> datagram send function (ipoib_send()) does a check and drops the packet.
> 
> However when AIP is enabled ipoib_send is of course not used and we land in
> rn->send function. Which needs to do the same check.

You just contradicted yourself: the first sentence was 'not
concurrently' and here you say we have connected mode turned on and
yet a packet is delivered to AIP, so what do you mean?

What I mean is if you can do connected mode you don't have a
rdma_netdev and you can't do AIP.

How are things in connected mode and a rdma_netdev is available?

Jason
Dennis Dalessandro March 20, 2020, 1:53 p.m. UTC | #6
On 2/21/2020 6:32 PM, Jason Gunthorpe wrote:
> On Fri, Feb 21, 2020 at 02:40:28PM -0500, Dennis Dalessandro wrote:
>> On 2/18/2020 7:42 PM, Jason Gunthorpe wrote:
>>> On Mon, Feb 10, 2020 at 08:19:44AM -0500, Dennis Dalessandro wrote:
>>>> From: Gary Leshner <Gary.S.Leshner@intel.com>
>>>>
>>>> When in connected mode ipoib sent broadcast pings which exceeded the mtu
>>>> size for broadcast addresses.
>>>>
>>>> Add an mtu attribute to the rdma_netdev structure which ipoib sets to its
>>>> mcast mtu size.
>>>>
>>>> The RDMA netdev uses this value to determine if the skb length is too long
>>>> for the mtu specified and if it is, drops the packet and logs an error
>>>> about the errant packet.
>>>
>>> I'm confused by this comment, connected mode is not able to use
>>> rdma_netdev, for various technical reason, I thought?
>>>
>>> Is this somehow running a rdma_netdev concurrently with connected
>>> mode? How?
>>
>> No, not concurrently. When ipoib is in connected mode, a broadcast request,
>> something like:
>>
>> ping -s 2017 -i 0.001 -c 10 -M do -I ib0 -b 192.168.0.255
>>
>> will be sent down from user space to ipoib. At an mcast_mtu of 2048, the max
>> payload size is 2016 (2048 - 28 - 4). If AIP is not being used then the
>> datagram send function (ipoib_send()) does a check and drops the packet.
>>
>> However when AIP is enabled ipoib_send is of course not used and we land in
>> rn->send function. Which needs to do the same check.
> 
> You just contradicted yourself: the first sentence was 'not
> concurrently' and here you say we have connected mode turned on and
> yet a packet is delivered to AIP, so what do you mean?

AIP provides a rdma_netdev (rn) that overloads the rn inside ipoib. When 
the broadcast skb is passed down from the user space, even in connected 
mode, the skb will be forwarded to the rn to send out.

> What I mean is if you can do connected mode you don't have a
> rdma_netdev and you can't do AIP.

The rdma_netdev is always present, regardless of the ipoib mode.

> How are things in connected mode and a rdma_netdev is available?

So we don't only overload the rn for datagram, we do it for connected as 
well.

The rdma_netdev is set up when ipoib first finds the port, not when the 
mode is switched through sysfs. Therefore, it has to be there always, 
even in connected mode.

In hfi1_ipoib_setup_rn() (the setup function for rdma_netdev), we set:
     rn->send = hfi1_ipoib_send

We also keeps the default netdev_ops and overload it with our netdev_ops 
to set up /tear down resources during netdev init/uninit/open/close:

     Priv->netdev_ops = netdev->netdev_ops;
     Netdev->netdev_ops = &hfi1_ipoib_netdev_ops;

-Denny
diff mbox series

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 5c1cf68..ddb896f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1906,6 +1906,7 @@  static int ipoib_ndo_init(struct net_device *ndev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
 	int rc;
+	struct rdma_netdev *rn = netdev_priv(ndev);
 
 	if (priv->parent) {
 		ipoib_child_init(ndev);
@@ -1918,6 +1919,7 @@  static int ipoib_ndo_init(struct net_device *ndev)
 	/* MTU will be reset when mcast join happens */
 	ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
 	priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
+	rn->mtu = priv->mcast_mtu;
 	ndev->max_mtu = IPOIB_CM_MTU;
 
 	ndev->neigh_priv_len = sizeof(struct ipoib_neigh);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 7166ee9b..3d5f6b8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -246,6 +246,7 @@  static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 		if (priv->mcast_mtu == priv->admin_mtu)
 			priv->admin_mtu = IPOIB_UD_MTU(mtu);
 		priv->mcast_mtu = IPOIB_UD_MTU(mtu);
+		rn->mtu = priv->mcast_mtu;
 
 		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
 		spin_unlock_irq(&priv->lock);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 8ac8e18..3086560 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -97,6 +97,7 @@  int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
 {
 	struct net_device *ndev = priv->dev;
 	int result;
+	struct rdma_netdev *rn = netdev_priv(ndev);
 
 	ASSERT_RTNL();
 
@@ -117,6 +118,8 @@  int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
 		goto out_early;
 	}
 
+	rn->mtu = priv->mcast_mtu;
+
 	priv->parent = ppriv->dev;
 	priv->pkey = pkey;
 	priv->child_type = type;