diff mbox

Expire sendonly joins (was Re: [PATCH rdma-rc 0/2] Add mechanism for ipoib neigh state change notifications)

Message ID alpine.DEB.2.20.1509241158420.21198@east.gentwo.org (mailing list archive)
State Accepted
Headers show

Commit Message

Christoph Lameter (Ampere) Sept. 24, 2015, 5 p.m. UTC
Ok here is the fixed up and tested V2 of the patch. Can this go in with
Doug's  patch?



Subject: ipoib: Expire sendonly multicast joins on neighbor expiration V2

Add mcast_leave functionality to __ipoib_reap_neighbor.

Signed-off-by: Christoph Lameter <cl@linux.com>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Or Gerlitz Sept. 25, 2015, 3:32 p.m. UTC | #1
On Thu, Sep 24, 2015 at 8:00 PM, Christoph Lameter <cl@linux.com> wrote:
> Ok here is the fixed up and tested V2 of the patch. Can this go in with
> Doug's  patch?


Repeating myself... do you find some over complexity in Erez's
implementation? What's the rationale for not using his patch and yes
using yours? Erez and Co were very busy with some internal deadlines
and he's now OOO (it's a high Holiday season now) - will be able to
review your patch once he's back (Oct 6, I believe). It seems that the
patch does the job, but there are locking/contexts and such to
consider here, so I can't just ack it, have you passed it through
testing?

Or.


> Subject: ipoib: Expire sendonly multicast joins on neighbor expiration V2
>
> Add mcast_leave functionality to __ipoib_reap_neighbor.
>
> Signed-off-by: Christoph Lameter <cl@linux.com>
>
> Index: linux/drivers/infiniband/ulp/ipoib/ipoib_main.c
> ===================================================================
> --- linux.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c        2015-09-23 09:51:19.259274231 -0500
> +++ linux/drivers/infiniband/ulp/ipoib/ipoib_main.c     2015-09-23 09:59:59.803289023 -0500
> @@ -1149,6 +1149,9 @@
>         unsigned long dt;
>         unsigned long flags;
>         int i;
> +       LIST_HEAD(remove_list);
> +       struct ipoib_mcast *mcast, *tmcast;
> +       struct net_device *dev = priv->dev;
>
>         if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
>                 return;
> @@ -1176,6 +1179,19 @@
>                                                           lockdep_is_held(&priv->lock))) != NULL) {
>                         /* was the neigh idle for two GC periods */
>                         if (time_after(neigh_obsolete, neigh->alive)) {
> +                               u8 *mgid = neigh->daddr + 4;
> +
> +                               /* Is this multicast ? */
> +                               if (*mgid == 0xff) {
> +                                       mcast = __ipoib_mcast_find(dev, mgid);
> +
> +                                       if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
> +                                               list_del(&mcast->list);
> +                                               rb_erase(&mcast->rb_node, &priv->multicast_tree);
> +                                               list_add_tail(&mcast->list, &remove_list);
> +                                       }
> +                               }
> +
>                                 rcu_assign_pointer(*np,
>                                                    rcu_dereference_protected(neigh->hnext,
>                                                                              lockdep_is_held(&priv->lock)));
> @@ -1191,6 +1207,8 @@
>
>  out_unlock:
>         spin_unlock_irqrestore(&priv->lock, flags);
> +       list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
> +               ipoib_mcast_leave(dev, mcast);
>  }
>
>  static void ipoib_reap_neigh(struct work_struct *work)
> Index: linux/drivers/infiniband/ulp/ipoib/ipoib.h
> ===================================================================
> --- linux.orig/drivers/infiniband/ulp/ipoib/ipoib.h     2015-09-23 09:51:19.259274231 -0500
> +++ linux/drivers/infiniband/ulp/ipoib/ipoib.h  2015-09-23 09:51:19.255274231 -0500
> @@ -548,6 +548,8 @@
>
>  int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
>                        union ib_gid *mgid, int set_qkey);
> +int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast);
> +struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid);
>
>  int ipoib_init_qp(struct net_device *dev);
>  int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
> Index: linux/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> ===================================================================
> --- linux.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c   2015-09-23 09:51:19.259274231 -0500
> +++ linux/drivers/infiniband/ulp/ipoib/ipoib_multicast.c        2015-09-23 09:51:19.255274231 -0500
> @@ -158,7 +158,7 @@
>         return mcast;
>  }
>
> -static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
> +struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
>  {
>         struct ipoib_dev_priv *priv = netdev_priv(dev);
>         struct rb_node *n = priv->multicast_tree.rb_node;
> @@ -705,7 +705,7 @@
>         return 0;
>  }
>
> -static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
> +int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
>  {
>         struct ipoib_dev_priv *priv = netdev_priv(dev);
>         int ret = 0;
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) Sept. 25, 2015, 4:41 p.m. UTC | #2
On Fri, 25 Sep 2015, Or Gerlitz wrote:

> On Thu, Sep 24, 2015 at 8:00 PM, Christoph Lameter <cl@linux.com> wrote:
> > Ok here is the fixed up and tested V2 of the patch. Can this go in with
> > Doug's  patch?
>
>
> Repeating myself... do you find some over complexity in Erez's
> implementation? what's the rational for not using his patch and yes
> using yours? Erez and Co were very busy with some internal deadlines
> and he's now OOO (it's a high Holiday season now) - will be able to
> review your patch once he's back (Oct 6, I believe). It seems that the
> patch does the job, but there are locking/contexts and such to
> consider here, so I can't just ack it, have you passed it through
> testing?

Yes the patch introduces a new callback and creates workqueues that
recheck conditions etc etc.

Makes it difficult to review and potentially creates new race conditions.
I'd rather have a straightforward solution.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) Sept. 25, 2015, 4:55 p.m. UTC | #3
And yes this went through testing here and we want to run this as part of
our prod kernels.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 26, 2015, 5:35 p.m. UTC | #4
On Fri, Sep 25, 2015 at 7:41 PM, Christoph Lameter <cl@linux.com> wrote:

> Yes the patch introduces a new callback and creates workqueues that

It's possible that this was done for a reason, so

> recheck conditions etc etc.
> Makes it difficult to review and potentially creates new race conditions.

maybe, and maybe avoid other race conditions

> I'd rather have a straightforward solution.

> And yes this went through testing here and we want to run this as
> part of our prod kernels.

sounds good, so taking into account that Erez is away till Oct 6th, we
can probably pick your patch and later, if Erez proves us that there's
deep problem there, revert it and take his.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) Sept. 27, 2015, 4:39 p.m. UTC | #5
On Sat, 26 Sep 2015, Or Gerlitz wrote:

> It's possible that this was done for a reason, so

> sounds good, so taking into account that Erez is away till Oct 6th, we
> can probably pick your patch and later, if Erez proves us that there's
> deep problem there, revert it and take his.

Ok but if Erez does not have the time to participate in code development
and follow up on the patch as issues arise then I would rather rework the
code so that it is easily understandable and I will continue to follow up
on the issues with the code as they develop. This seems to be much more
important to my company than Mellanox.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 27, 2015, 5:32 p.m. UTC | #6
On 09/27/2015 12:39 PM, Christoph Lameter wrote:
> 
> On Sat, 26 Sep 2015, Or Gerlitz wrote:
> 
>> It's possible that this was done for a reason, so
> 
>> sounds good, so taking into account that Erez is away till Oct 6th, we
>> can probably pick your patch and later, if Erez proves us that there's
>> deep problem there, revert it and take his.
> 
> Ok but if Erez does not have the time to participate in code development
> and follow up on the patch as issues arise then I would rather rework the
> code so that it is easily understandable and I will continue to follow up
> on the issues with the code as they develop. This seems to be much more
> important to my company than Mellanox.
> 

Currently I'm testing your patch with a couple other patches.  I dropped
the patch of mine that added a module option, and added two different
patches.  However, I'm still waffling on this patch somewhat.  In the
discussions that Jason and I had, I pretty much decided that I would
like to see all send-only multicast sends be sent immediately with no
backlog queue.  That means that if we had to start a send-only join, or
if we started one and it hasn't completed yet, we would send the packet
immediately via the broadcast group versus queueing.  Doing so might
trip this new code up.

Right now, we start a join, we queue the packet on the mcast struct, and
in join_finish we create an ah, but that's it.  We then restart the send
by calling dev_queue_xmit on the skb we put in the backlog queue, which
takes us back around to mcast_send, where we now have both a mcast and a
mcast->ah, so *then* we alloc a new neigh entry, attach this mcast to
it, and send using it.

If I change mcast_send so that we start a join, but immediately send the
packet in the broadcast group, then I would have to change the
join_finish routine to alloc a neigh that has the right daddr so it can
be found in the future, without the benefit of the daddr passed into the
function mcast_send so missing the ipoib header and instead only having
the raw mgid in the mcmember struct.  But, we would have to have that
neigh struct so that the timeout would work in the case where we had a
packet or two that triggered a join but were all sent prior to the join
completing and so we never got a neigh alloc via mcast_send for this
mcast group.
Christoph Lameter (Ampere) Sept. 28, 2015, 2:28 a.m. UTC | #7
On Sun, 27 Sep 2015, Doug Ledford wrote:

> Currently I'm testing your patch with a couple other patches.  I dropped
> the patch of mine that added a module option, and added two different
> patches.  However, I'm still waffling on this patch somewhat.  In the
> discussions that Jason and I had, I pretty much decided that I would
> like to see all send-only multicast sends be sent immediately with no
> backlog queue.  That means that if we had to start a send-only join, or
> if we started one and it hasn't completed yet, we would send the packet
> immediately via the broadcast group versus queueing.  Doing so might
> trip this new code up.

If we send immediately then we would need to check on each packet if the
multicast creation has been completed?

Also broadcast could cause an unnecessary reception event on the NICs of
machines that have no interest in this traffic. We would like to keep
irrelevant traffic off the fabric as much as possible. And a reception
event that requires traffic to be thrown out will cause jitter in the
processing of inbound traffic that we also would like to avoid.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 28, 2015, 6:37 a.m. UTC | #8
On Sun, Sep 27, 2015 at 7:39 PM, Christoph Lameter <cl@linux.com> wrote:
> Ok but if Erez does not have the time to participate in code development
> and follow up on the patch as issues arise then I would rather rework the
> code so that it is easily understandable and I will continue to follow up
> on the issues with the code as they develop.

As I mentioned to you earlier on this thread, currently it's not a
matter of having time to participate but rather happily going through
the Jewish new year holidays, this time Sukkot with many people being
off for the whole of it till Oct 6.

Personally, up to few weeks ago, I was under the misimpression that
not only IPoIB joins as full member also on the sendonly flow, but
also that such group can be actually opened under that flow, and it
turns out they don't. Later you said that your production environment
was running a very old non upstream stack that had a knob to somehow
make it work and as of that didn't realize that something goes wrong
for years w.r.t a gateway functionality with upstream/inbox code, so
we all screwed up here over a time period which is a few orders of
magnitude longer than a holiday duration.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) Sept. 28, 2015, 11:17 a.m. UTC | #9
On Mon, 28 Sep 2015, Or Gerlitz wrote:

> Personally, up to few weeks ago, I was under the misimpression that
> not only IPoIB joins as full member also on the sendonly flow, but
> also that such group can be actually opened under that flow, and it
> turns out they don't. Later you said that your production environment
> was running a very old non upstream stack that had a knob to somehow

I said we run OFED 1.5.X on older systems. That is not custom.

> make it work and as of that didn't realize that something goes wrong
> for years w.r.t a gateway functionality with upstream/inbox code, so
> we all screwed up here over a time period with is few orders of
> magnitude longer than a holiday duration.

We have been migrating to a RH7 native stack over the last months and
in a mixed environment the systems running OFED will create the MC groups
so the issue was hidden. We have talked about this migration a couple of
times even face to face. ???



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 28, 2015, 3:36 p.m. UTC | #10
On 09/27/2015 10:28 PM, Christoph Lameter wrote:
> On Sun, 27 Sep 2015, Doug Ledford wrote:
> 
>> Currently I'm testing your patch with a couple other patches.  I dropped
>> the patch of mine that added a module option, and added two different
>> patches.  However, I'm still waffling on this patch somewhat.  In the
>> discussions that Jason and I had, I pretty much decided that I would
>> like to see all send-only multicast sends be sent immediately with no
>> backlog queue.  That means that if we had to start a send-only join, or
>> if we started one and it hasn't completed yet, we would send the packet
>> immediately via the broadcast group versus queueing.  Doing so might
>> trip this new code up.
> 
> If we send immediately then we would need to check on each packet if the
> multicast creation has been completed?

We do that already anyway.  Calling find_mcast and then checking
if(!mcast || !mcast-ah) is exactly that check.

> Also broadcast could cause a unecessary reception event on the NICs of
> machines that have no interest in this traffic.

This is true.  However, I'm trying to balance between several competing
issues.  You also stated the revamped multicast code was adding latency
and dropped packets into the problem space.  Sending over the broadcast
would help with latency.  However, I have an alternative idea for that...

> We would like to keep
> irrelevant traffic off the fabric as much as possible. An a reception
> event that requires traffic to be thrown out will cause jitter in the
> processing of inbound traffic that we also would like to avoid.

That may not be optimal for your app, but we also need to try and
maintain proper emulation of typical IP/Ethernet behavior since this is
IPoIB after all.  That's why the app isn't required to join the group
before sending, and also why it should be able to expect that we will
fall back to sending via broadcast if needed.

However, the following algorithm might be suitable here:

On first packet:
  create mcast group
  queue packet to group
  schedule join

On subsequent packets:
  find mcast group
  check mcast state
    if already joined, send immediately
    if joining, queue packet to mcast queue
    if join is deferred, send via bcast

On join completion:
  successful join
    set mcast->ah
    send all queued packets via mcast
    if no queued packets, alloc neigh for default ipv4 ethertype
  on failed join
    mcast->ah remains NULL
    send all queued packets via bcast
    mcast->delay_until is set to future time (used to know join is deferred)
    schedule deferred join attempt
Christoph Lameter (Ampere) Sept. 28, 2015, 3:51 p.m. UTC | #11
On Mon, 28 Sep 2015, Doug Ledford wrote:

> > We would like to keep
> > irrelevant traffic off the fabric as much as possible. An a reception
> > event that requires traffic to be thrown out will cause jitter in the
> > processing of inbound traffic that we also would like to avoid.
>
> That may not be optimal for your app, but we also need to try and
> maintain proper emulation of typical IP/Ethernet behavior since this is
> IPoIB after all.  That's why the app isn't required to join the group
> before sending, and also why it should be able to expect that we will
> fall back to sending via broadcast if needed.

Ok this needs to work with the existing ethernet gateways and verified to
work with them.

> However, the following algorithm might be suitable here:
>
> On first packet:
>   create mcast group
>   queue packet to group
>   schedule join
>
> On subsequent packets:
>   find mcast group
>   check mcast state
>     if already joined, send immediately
>     if joining, queue packet to mcast queue
>     if join is deferred, send via bcast

Hmmm... If the multicast group does not exist in the SM then we could only
bcast to all routers instead? No host in the fabric could then be
listening the only listeners possible are outside the fabric.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 28, 2015, 4:59 p.m. UTC | #12
On 09/28/2015 11:51 AM, Christoph Lameter wrote:
> On Mon, 28 Sep 2015, Doug Ledford wrote:
> 
>>> We would like to keep
>>> irrelevant traffic off the fabric as much as possible. An a reception
>>> event that requires traffic to be thrown out will cause jitter in the
>>> processing of inbound traffic that we also would like to avoid.
>>
>> That may not be optimal for your app, but we also need to try and
>> maintain proper emulation of typical IP/Ethernet behavior since this is
>> IPoIB after all.  That's why the app isn't required to join the group
>> before sending, and also why it should be able to expect that we will
>> fall back to sending via broadcast if needed.
> 
> Ok this needs to work with the existing ethernet gateways and verified to
> work with them.
> 
>> However, the following algorithm might be suitable here:
>>
>> On first packet:
>>   create mcast group
>>   queue packet to group
>>   schedule join
>>
>> On subsequent packets:
>>   find mcast group
>>   check mcast state
>>     if already joined, send immediately
>>     if joining, queue packet to mcast queue
>>     if join is deferred, send via bcast
> 
> Hmmm... If the multicast group does not exist in the SM then we could only
> bcast to all routers instead? 

No, I was referring to using this on top of your patch and my other two
patches, which change the ipoib driver to create sendonly groups and
then expire them when the neighbor expires.

> No host in the fabric could then be
> listening the only listeners possible are outside the fabric.
>
Christoph Lameter (Ampere) Sept. 28, 2015, 5:05 p.m. UTC | #13
On Mon, 28 Sep 2015, Doug Ledford wrote:

> No, I was referring to using this on top of your patch and my other two
> patches, which change the ipoib driver to create sendonly groups and
> then expire them when the neighbor expires.

Ok under which conditions could the joining be deferred and packets be
sent to broadcast?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Sept. 28, 2015, 5:10 p.m. UTC | #14
On Mon, Sep 28, 2015 at 11:36:11AM -0400, Doug Ledford wrote:

> > Also broadcast could cause a unecessary reception event on the NICs of
> > machines that have no interest in this traffic.
> 
> This is true.  However, I'm trying to balance between several competing
> issues.  You also stated the revamped multicast code was adding latency
> and dropped packets into the problem space.  Sending over the broadcast
> would help with latency.  However, I have an alternative idea for that...

I think your original idea of broadcast immediately and deferred
optimal mlid lookup is the best *functionally* for every case - only
when you enter the very edge world of caring about timing does it make
any difference.

Christoph's needs would probably be better served by giving some API
to control the mlid cache (ie the neighbour table is already 99% of
the way there). This would let some userspace component pre-load and
fix all relevant data and undesired cache activity simply can't add
jitter.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Lameter (Ampere) Sept. 28, 2015, 5:19 p.m. UTC | #15
On Mon, 28 Sep 2015, Jason Gunthorpe wrote:

> I think your original idea of broadcast immediately and deferred
> optimal mlid lookup is the best *functionally* for every case - only
> when you enter the very edge world of caring about timing does it make
> any difference.

Infiniband is about the edge.

> Christoph's needs would probably be better served by giving some API
> to control the mlid cache (ie the neightbour table is already 99% of
> the way there). This would let some userspace component pre-load and
> fix all relevant data and undesired cache activity simply can't add
> jitter.

Ok so on boot up we preload 3000 multicast groups into the neighbor table?
What impact on IB performance would having such a number of mlid's in the
cache have?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe Sept. 28, 2015, 5:36 p.m. UTC | #16
On Mon, Sep 28, 2015 at 12:19:04PM -0500, Christoph Lameter wrote:
 
> > Christoph's needs would probably be better served by giving some API
> > to control the mlid cache (ie the neightbour table is already 99% of
> > the way there). This would let some userspace component pre-load and
> > fix all relevant data and undesired cache activity simply can't add
> > jitter.
> 
> Ok so on boot up we preload 3000 multicast groups into the neighbor table?
> What impact on IB performance would having such a number of mlid's in the
> cache have?

What is the cost of a neighbour lookup? Isn't it hashed? So not very
much if the hash function/table size work well with the distribution
of IPs.

The win is any send to any of the 3000 groups is consistent in all
cases, no outlier that is slower due to the SA turn around.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 29, 2015, 5:47 p.m. UTC | #17
On 09/28/2015 01:10 PM, Jason Gunthorpe wrote:
> On Mon, Sep 28, 2015 at 11:36:11AM -0400, Doug Ledford wrote:
> 
>>> Also broadcast could cause a unecessary reception event on the NICs of
>>> machines that have no interest in this traffic.
>>
>> This is true.  However, I'm trying to balance between several competing
>> issues.  You also stated the revamped multicast code was adding latency
>> and dropped packets into the problem space.  Sending over the broadcast
>> would help with latency.  However, I have an alternative idea for that...
> 
> I think your original idea of broadcast immediately and deferred
> optimal mlid lookup is the best *functionally* for every case - only
> when you enter the very edge world of caring about timing does it make
> any difference.
> 
> Christoph's needs would probably be better served by giving some API
> to control the mlid cache (ie the neightbour table is already 99% of
> the way there). This would let some userspace component pre-load and
> fix all relevant data and undesired cache activity simply can't add
> jitter.

So, I've taken Christoph's patch, added two of my own (just changed the
comment and the #if statement so that we create groups on send-only
joins, and upped the max send-only backlog queue).  We'll leave it at
that for 4.3 and try to address it more fully in 4.4.
diff mbox

Patch

Index: linux/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- linux.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2015-09-23 09:51:19.259274231 -0500
+++ linux/drivers/infiniband/ulp/ipoib/ipoib_main.c	2015-09-23 09:59:59.803289023 -0500
@@ -1149,6 +1149,9 @@ 
 	unsigned long dt;
 	unsigned long flags;
 	int i;
+	LIST_HEAD(remove_list);
+	struct ipoib_mcast *mcast, *tmcast;
+	struct net_device *dev = priv->dev;

 	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
 		return;
@@ -1176,6 +1179,19 @@ 
 							  lockdep_is_held(&priv->lock))) != NULL) {
 			/* was the neigh idle for two GC periods */
 			if (time_after(neigh_obsolete, neigh->alive)) {
+				u8 *mgid = neigh->daddr + 4;
+
+				/* Is this multicast ? */
+				if (*mgid == 0xff) {
+					mcast = __ipoib_mcast_find(dev, mgid);
+
+					if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+						list_del(&mcast->list);
+						rb_erase(&mcast->rb_node, &priv->multicast_tree);
+						list_add_tail(&mcast->list, &remove_list);
+					}
+				}
+
 				rcu_assign_pointer(*np,
 						   rcu_dereference_protected(neigh->hnext,
 									     lockdep_is_held(&priv->lock)));
@@ -1191,6 +1207,8 @@ 

 out_unlock:
 	spin_unlock_irqrestore(&priv->lock, flags);
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+		ipoib_mcast_leave(dev, mcast);
 }

 static void ipoib_reap_neigh(struct work_struct *work)
Index: linux/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2015-09-23 09:51:19.259274231 -0500
+++ linux/drivers/infiniband/ulp/ipoib/ipoib.h	2015-09-23 09:51:19.255274231 -0500
@@ -548,6 +548,8 @@ 

 int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
 		       union ib_gid *mgid, int set_qkey);
+int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast);
+struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid);

 int ipoib_init_qp(struct net_device *dev);
 int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
Index: linux/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- linux.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2015-09-23 09:51:19.259274231 -0500
+++ linux/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2015-09-23 09:51:19.255274231 -0500
@@ -158,7 +158,7 @@ 
 	return mcast;
 }

-static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
+struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct rb_node *n = priv->multicast_tree.rb_node;
@@ -705,7 +705,7 @@ 
 	return 0;
 }

-static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int ret = 0;