diff mbox

[v5,1/1] IB/ipoib: fix for rare multicast join race condition

Message ID 20160211213051.29589.31877.stgit@phlsvlogin03.ph.intel.com (mailing list archive)
State Accepted
Headers show

Commit Message

Estrin, Alex Feb. 11, 2016, 9:30 p.m. UTC
A narrow window for race condition still exist between
multicast join thread and *dev_flush workers.
A kernel crash caused by prolong erratic link state changes
was observed (most likely a faulty cabling):

[167275.656270] BUG: unable to handle kernel NULL pointer dereference at
0000000000000020
[167275.665973] IP: [<ffffffffa05f8f2e>] ipoib_mcast_join+0xae/0x1d0 [ib_ipoib]
[167275.674443] PGD 0
[167275.677373] Oops: 0000 [#1] SMP
...
[167275.977530] Call Trace:
[167275.982225]  [<ffffffffa05f92f0>] ? ipoib_mcast_free+0x200/0x200 [ib_ipoib]
[167275.992024]  [<ffffffffa05fa1b7>] ipoib_mcast_join_task+0x2a7/0x490
[ib_ipoib]
[167276.002149]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
[167276.010754]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
[167276.019088]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
[167276.027737]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
Here was a hit spot:
ipoib_mcast_join() {
..............
      rec.qkey      = priv->broadcast->mcmember.qkey;
                                       ^^^^^^^
.....
 }
Proposed patch should prevent multicast join task to continue
if link state change is detected.

Signed-off-by: Alex Estrin <alex.estrin@intel.com>

Changes from v4:
- as suggested by Doug Ledford, optimized spinlock usage,
i.e. ipoib_mcast_join() is called with lock held.
Changes from v3:
- sync with priv->lock before flag check.
Chages from v2:
- Move check for OPER_UP flag state to mcast_join() to
ensure no event worker is in progress.
- minor style fixes.
Changes from v1:
- No need to lock again if error detected.
---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   24 +++++++++++++++++-------
 1 files changed, 17 insertions(+), 7 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Doug Ledford Feb. 12, 2016, 7:53 p.m. UTC | #1
On 02/11/2016 04:30 PM, Alex Estrin wrote:
> A narrow window for race condition still exist between
> multicast join thread and *dev_flush workers.
> A kernel crash caused by prolong erratic link state changes
> was observed (most likely a faulty cabling):
> 
> [167275.656270] BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000020
> [167275.665973] IP: [<ffffffffa05f8f2e>] ipoib_mcast_join+0xae/0x1d0 [ib_ipoib]
> [167275.674443] PGD 0
> [167275.677373] Oops: 0000 [#1] SMP
> ...
> [167275.977530] Call Trace:
> [167275.982225]  [<ffffffffa05f92f0>] ? ipoib_mcast_free+0x200/0x200 [ib_ipoib]
> [167275.992024]  [<ffffffffa05fa1b7>] ipoib_mcast_join_task+0x2a7/0x490
> [ib_ipoib]
> [167276.002149]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
> [167276.010754]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
> [167276.019088]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
> [167276.027737]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
> Here was a hit spot:
> ipoib_mcast_join() {
> ..............
>       rec.qkey      = priv->broadcast->mcmember.qkey;
>                                        ^^^^^^^
> .....
>  }
> Proposed patch should prevent multicast join task to continue
> if link state change is detected.
> 
> Signed-off-by: Alex Estrin <alex.estrin@intel.com>

Thanks, applied.

> 
> Changes from v4:
> - as suggested by Doug Ledford, optimized spinlock usage,
> i.e. ipoib_mcast_join() is called with lock held.
> Changes from v3:
> - sync with priv->lock before flag check.
> Chages from v2:
> - Move check for OPER_UP flag state to mcast_join() to
> ensure no event worker is in progress.
> - minor style fixes.
> Changes from v1:
> - No need to lock again if error detected.
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   24 +++++++++++++++++-------
>  1 files changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index 050dfa1..2588931 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -456,7 +456,10 @@ out_locked:
>  	return status;
>  }
>  
> -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
> +/*
> + * Caller must hold 'priv->lock'
> + */
> +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	struct ib_sa_multicast *multicast;
> @@ -466,6 +469,10 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  	ib_sa_comp_mask comp_mask;
>  	int ret = 0;
>  
> +	if (!priv->broadcast ||
> +	    !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
> +		return -EINVAL;
> +
>  	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
>  
>  	rec.mgid     = mcast->mcmember.mgid;
> @@ -525,20 +532,23 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  			rec.join_state = 4;
>  #endif
>  	}
> +	spin_unlock_irq(&priv->lock);
>  
>  	multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
>  					 &rec, comp_mask, GFP_KERNEL,
>  					 ipoib_mcast_join_complete, mcast);
> +	spin_lock_irq(&priv->lock);
>  	if (IS_ERR(multicast)) {
>  		ret = PTR_ERR(multicast);
>  		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
> -		spin_lock_irq(&priv->lock);
>  		/* Requeue this join task with a backoff delay */
>  		__ipoib_mcast_schedule_join_thread(priv, mcast, 1);
>  		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
>  		spin_unlock_irq(&priv->lock);
>  		complete(&mcast->done);
> +		spin_lock_irq(&priv->lock);
>  	}
> +	return 0;
>  }
>  
>  void ipoib_mcast_join_task(struct work_struct *work)
> @@ -620,9 +630,10 @@ void ipoib_mcast_join_task(struct work_struct *work)
>  				/* Found the next unjoined group */
>  				init_completion(&mcast->done);
>  				set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
> -				spin_unlock_irq(&priv->lock);
> -				ipoib_mcast_join(dev, mcast);
> -				spin_lock_irq(&priv->lock);
> +				if (ipoib_mcast_join(dev, mcast)) {
> +					spin_unlock_irq(&priv->lock);
> +					return;
> +				}
>  			} else if (!delay_until ||
>  				 time_before(mcast->delay_until, delay_until))
>  				delay_until = mcast->delay_until;
> @@ -641,10 +652,9 @@ out:
>  	if (mcast) {
>  		init_completion(&mcast->done);
>  		set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
> +		ipoib_mcast_join(dev, mcast);
>  	}
>  	spin_unlock_irq(&priv->lock);
> -	if (mcast)
> -		ipoib_mcast_join(dev, mcast);
>  }
>  
>  int ipoib_mcast_start_thread(struct net_device *dev)
>
Or Gerlitz Feb. 12, 2016, 9:47 p.m. UTC | #2
On Fri, Feb 12, 2016 at 10:45 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Thu, Feb 11, 2016 at 10:30 PM, Alex Estrin <alex.estrin@intel.com> wrote:
>> Proposed patch should prevent multicast join task to continue
>> if link state change is detected.
>>
>> Signed-off-by: Alex Estrin <alex.estrin@intel.com>
>>
>> Changes from v4:
>> - as suggested by Doug Ledford, optimized spinlock usage,
>> i.e. ipoib_mcast_join() is called with lock held.
>> Changes from v3:
>> - sync with priv->lock before flag check.
>> Chages from v2:
>> - Move check for OPER_UP flag state to mcast_join() to
>> ensure no event worker is in progress.
>> - minor style fixes.
>> Changes from v1:
>> - No need to lock again if error detected.
>> ---
>>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   24
>> +++++++++++++++++-------
>>  1 files changed, 17 insertions(+), 7 deletions(-)
>>
>
>

Alex, for next times... please put the Vx --> Vy history either in the cover
letter or in a case where this is single patch (as was here) after the ---
line, else we'll have this listing present in the upstream kernel git for
the rest of the human/linux history, since when the maintainer uses git am,
it goes in.
>
Doug, I see now in your github that this indeed went in, could you made a
small git rebase rewording and remove this review history?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Feb. 12, 2016, 9:48 p.m. UTC | #3
On 02/12/2016 04:47 PM, Or Gerlitz wrote:
> On Fri, Feb 12, 2016 at 10:45 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Thu, Feb 11, 2016 at 10:30 PM, Alex Estrin <alex.estrin@intel.com> wrote:
>>> Proposed patch should prevent multicast join task to continue
>>> if link state change is detected.
>>>
>>> Signed-off-by: Alex Estrin <alex.estrin@intel.com>
>>>
>>> Changes from v4:
>>> - as suggested by Doug Ledford, optimized spinlock usage,
>>> i.e. ipoib_mcast_join() is called with lock held.
>>> Changes from v3:
>>> - sync with priv->lock before flag check.
>>> Chages from v2:
>>> - Move check for OPER_UP flag state to mcast_join() to
>>> ensure no event worker is in progress.
>>> - minor style fixes.
>>> Changes from v1:
>>> - No need to lock again if error detected.
>>> ---
>>>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   24
>>> +++++++++++++++++-------
>>>  1 files changed, 17 insertions(+), 7 deletions(-)
>>>
>>
>>
> 
> Alex, for next times... please put the Vx --> Vy history either in the cover
> letter or in a case where this is single patch (as was here) after the ---
> line, else we'll have this listing present in the upstream kernel git for
> the rest of the human/linux history, since when the maintainer uses git am,
> it goes in.
>>
> Doug, I see now in your github that this indeed went in, could you made a
> small git rebase rewording and remove this review history?
> 

It's not an important enough issue to warrant rebasing a tree that's
already hit my k.o repo where I don't rebase except under dire need.
Or Gerlitz Feb. 12, 2016, 9:57 p.m. UTC | #4
On Fri, Feb 12, 2016 at 10:48 PM, Doug Ledford <dledford@redhat.com> wrote:
> On 02/12/2016 04:47 PM, Or Gerlitz wrote:
>> On Fri, Feb 12, 2016 at 10:45 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Thu, Feb 11, 2016 at 10:30 PM, Alex Estrin <alex.estrin@intel.com> wrote:
>>>> Proposed patch should prevent multicast join task to continue
>>>> if link state change is detected.
>>>>
>>>> Signed-off-by: Alex Estrin <alex.estrin@intel.com>
>>>>
>>>> Changes from v4:
>>>> - as suggested by Doug Ledford, optimized spinlock usage,
>>>> i.e. ipoib_mcast_join() is called with lock held.
>>>> Changes from v3:
>>>> - sync with priv->lock before flag check.
>>>> Chages from v2:
>>>> - Move check for OPER_UP flag state to mcast_join() to
>>>> ensure no event worker is in progress.
>>>> - minor style fixes.
>>>> Changes from v1:
>>>> - No need to lock again if error detected.

>> Doug, I see now in your github that this indeed went in, could you made a
>> small git rebase rewording and remove this review history?

> It's not an important enough issue to warrant rebasing a tree that's
> already hit my k.o repo where I don't rebase except under dire need.

I disagree re the importance. The patch planted on your k.o tree just
minutes/hours ago... there
should be some borders to how low we're willing to get our subsystem
standards. The S.O.B area of
the patch is really badly written, the author S.O.B, next maybe 20
lines of irrelevant history followed
by the maintainer S.O.B -- this is very far from how properly signed
kernel commits need to look like.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Estrin, Alex Feb. 12, 2016, 11:19 p.m. UTC | #5
> >

> 

> Alex, for next times... please put the Vx --> Vy history either in the cover

> letter or in a case where this is single patch (as was here) after the ---

> line, else we'll have this listing present in the upstream kernel git for

> the rest of the human/linux history, since when the maintainer uses git am,

> it goes in.


Yes, it is my fault.
I am aware about '---', but somehow I've missed it anyway.

> Doug, I see now in your github that this indeed went in, could you made a

> small git rebase rewording and remove this review history?
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 050dfa1..2588931 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -456,7 +456,10 @@  out_locked:
 	return status;
 }
 
-static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
+/*
+ * Caller must hold 'priv->lock'
+ */
+static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_sa_multicast *multicast;
@@ -466,6 +469,10 @@  static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 	ib_sa_comp_mask comp_mask;
 	int ret = 0;
 
+	if (!priv->broadcast ||
+	    !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+		return -EINVAL;
+
 	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
 
 	rec.mgid     = mcast->mcmember.mgid;
@@ -525,20 +532,23 @@  static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 			rec.join_state = 4;
 #endif
 	}
+	spin_unlock_irq(&priv->lock);
 
 	multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
 					 &rec, comp_mask, GFP_KERNEL,
 					 ipoib_mcast_join_complete, mcast);
+	spin_lock_irq(&priv->lock);
 	if (IS_ERR(multicast)) {
 		ret = PTR_ERR(multicast);
 		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
-		spin_lock_irq(&priv->lock);
 		/* Requeue this join task with a backoff delay */
 		__ipoib_mcast_schedule_join_thread(priv, mcast, 1);
 		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
 		spin_unlock_irq(&priv->lock);
 		complete(&mcast->done);
+		spin_lock_irq(&priv->lock);
 	}
+	return 0;
 }
 
 void ipoib_mcast_join_task(struct work_struct *work)
@@ -620,9 +630,10 @@  void ipoib_mcast_join_task(struct work_struct *work)
 				/* Found the next unjoined group */
 				init_completion(&mcast->done);
 				set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-				spin_unlock_irq(&priv->lock);
-				ipoib_mcast_join(dev, mcast);
-				spin_lock_irq(&priv->lock);
+				if (ipoib_mcast_join(dev, mcast)) {
+					spin_unlock_irq(&priv->lock);
+					return;
+				}
 			} else if (!delay_until ||
 				 time_before(mcast->delay_until, delay_until))
 				delay_until = mcast->delay_until;
@@ -641,10 +652,9 @@  out:
 	if (mcast) {
 		init_completion(&mcast->done);
 		set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+		ipoib_mcast_join(dev, mcast);
 	}
 	spin_unlock_irq(&priv->lock);
-	if (mcast)
-		ipoib_mcast_join(dev, mcast);
 }
 
 int ipoib_mcast_start_thread(struct net_device *dev)