
[v2] md: do not _put wrong device in md_seq_next

Message ID 20230912130124.666-1-mariusz.tkaczyk@linux.intel.com (mailing list archive)
State Superseded, archived
Series [v2] md: do not _put wrong device in md_seq_next

Commit Message

Mariusz Tkaczyk Sept. 12, 2023, 1:01 p.m. UTC
While working on the changes proposed by Kuai [1], I determined that
mddev->active is continuously decremented for an array marked by
MD_CLOSING. This brought me to md_seq_next(), changed by [2]. I found
the regression there: if mddev_get() fails, we update the mddev pointer
and as a result we _put the failed device.

I isolated the change in md_seq_next() and tested it - the issue is no
longer reproducible, but I don't see the root cause in this scenario.
The bug is obvious, so I proceeded with the fix. I will submit the
MD_CLOSING patches separately.

Put the device which was _get by the previous md_seq_next() call.
Add a guard against improper mddev_put() usage: it shouldn't be called
if mddev->active is less than 1.

I didn't convert atomic_t to refcount_t because refcount warns when 0
is reached, which is likely to happen for mddev->active.

[1] https://lore.kernel.org/linux-raid/028a21df-4397-80aa-c2a5-7c754560f595@gmail.com/T/#m6a534677d9654a4236623661c10646d45419ee1b
[2] https://bugzilla.kernel.org/show_bug.cgi?id=217798

Fixes: 12a6caf27324 ("md: only delete entries from all_mddevs when the disk is freed")
Cc: Yu Kuai <yukuai3@huawei.com>
Cc: AceLan Kao <acelan@gmail.com>
Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
---
 drivers/md/md.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)
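
To make the regression concrete, below is a condensed sketch of the
pre-fix md_seq_next() iteration (reconstructed from the removed lines in
the diff that follows; the end-of-list branch is elided):

	spin_lock(&all_mddevs_lock);
	if (v == (void *)1) {
		tmp = all_mddevs.next;
	} else {
		to_put = mddev;		/* we hold a reference on v */
		tmp = mddev->all_mddevs.next;
	}

	for (;;) {
		if (tmp == &all_mddevs)
			break;		/* end of list (handling elided) */
		next_mddev = list_entry(tmp, struct mddev, all_mddevs);
		if (mddev_get(next_mddev))
			break;
		mddev = next_mddev;	/* BUG: mddev no longer points to v */
		tmp = mddev->all_mddevs.next;
	}
	spin_unlock(&all_mddevs_lock);

	if (to_put)
		mddev_put(mddev);	/* if a _get failed above, this puts the
					 * wrong device and leaks the reference
					 * held on v */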

Comments

Yu Kuai Sept. 12, 2023, 1:25 p.m. UTC | #1
Hi,

On 2023/09/12 21:01, Mariusz Tkaczyk wrote:
> While working on the changes proposed by Kuai [1], I determined that
> mddev->active is continuously decremented for an array marked by
> MD_CLOSING. This brought me to md_seq_next(), changed by [2]. I found
> the regression there: if mddev_get() fails, we update the mddev pointer
> and as a result we _put the failed device.

This mddev is decremented while another mddev is increased; that's
why AceLan said that a single array can't reproduce the problem.

And because mddev->active is leaked, del_gendisk() will never be
called for the mddev while closing the array; that's why the user will
always see this array, causing an infinite open -> stop array -> close
loop for systemd-shutdown.
> 
> I isolated the change in md_seq_next() and tested it - the issue is no
> longer reproducible, but I don't see the root cause in this scenario.
> The bug is obvious, so I proceeded with the fix. I will submit the
> MD_CLOSING patches separately.
> 
> Put the device which was _get by the previous md_seq_next() call.
> Add a guard against improper mddev_put() usage: it shouldn't be called
> if mddev->active is less than 1.
> 
> I didn't convert atomic_t to refcount_t because refcount warns when 0
> is reached, which is likely to happen for mddev->active.
> 

LGTM,
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
> [1] https://lore.kernel.org/linux-raid/028a21df-4397-80aa-c2a5-7c754560f595@gmail.com/T/#m6a534677d9654a4236623661c10646d45419ee1b
> [2] https://bugzilla.kernel.org/show_bug.cgi?id=217798
> 
> Fixes: 12a6caf27324 ("md: only delete entries from all_mddevs when the disk is freed")
> Cc: Yu Kuai <yukuai3@huawei.com>
> Cc: AceLan Kao <acelan@gmail.com>
> Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
> ---
>   drivers/md/md.c | 20 +++++++++++---------
>   1 file changed, 11 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0fe7ab6e8ab9..bb654ff62765 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -618,6 +618,12 @@ static void mddev_delayed_delete(struct work_struct *ws);
>   
>   void mddev_put(struct mddev *mddev)
>   {
> +	/* Guard for ambiguous put. */
> +	if (unlikely(atomic_read(&mddev->active) < 1)) {
> +		pr_warn("%s: active refcount is lower than 1\n", mdname(mddev));
> +		return;
> +	}
> +
>   	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
>   		return;
>   	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
> @@ -8227,19 +8233,16 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>   {
>   	struct list_head *tmp;
>   	struct mddev *next_mddev, *mddev = v;
> -	struct mddev *to_put = NULL;
>   
>   	++*pos;
> -	if (v == (void*)2)
> +	if (v == (void *)2)
>   		return NULL;
>   
>   	spin_lock(&all_mddevs_lock);
> -	if (v == (void*)1) {
> +	if (v == (void *)1)
>   		tmp = all_mddevs.next;
> -	} else {
> -		to_put = mddev;
> +	else
>   		tmp = mddev->all_mddevs.next;
> -	}
>   
>   	for (;;) {
>   		if (tmp == &all_mddevs) {
> @@ -8250,12 +8253,11 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>   		next_mddev = list_entry(tmp, struct mddev, all_mddevs);
>   		if (mddev_get(next_mddev))
>   			break;
> -		mddev = next_mddev;
> -		tmp = mddev->all_mddevs.next;
> +		tmp = next_mddev->all_mddevs.next;
>   	}
>   	spin_unlock(&all_mddevs_lock);
>   
> -	if (to_put)
> +	if (v != (void *)1)
>   		mddev_put(mddev);
>   	return next_mddev;
>   
>
Mariusz Tkaczyk Sept. 12, 2023, 1:40 p.m. UTC | #2
On Tue, 12 Sep 2023 21:25:24 +0800
Yu Kuai <yukuai1@huaweicloud.com> wrote:

> Hi,
> 
> On 2023/09/12 21:01, Mariusz Tkaczyk wrote:
> > While working on the changes proposed by Kuai [1], I determined that
> > mddev->active is continuously decremented for an array marked by
> > MD_CLOSING. This brought me to md_seq_next(), changed by [2]. I found
> > the regression there: if mddev_get() fails, we update the mddev
> > pointer and as a result we _put the failed device.
> 
> This mddev is decremented while another mddev is increased; that's
> why AceLan said that a single array can't reproduce the problem.
> 
> And because mddev->active is leaked, del_gendisk() will never be
> called for the mddev while closing the array; that's why the user will
> always see this array, causing an infinite open -> stop array -> close
> loop for systemd-shutdown.

Ohh, I see the scenario now...
The first array can be successfully stopped. We mark MD_DELETED and
proceed with scheduling the wq, but in the middle of that md_seq_next()
increases active for another array and decrements active for the one
with MD_DELETED. For this next array we are unable to reach
active == 0 anymore.
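
A minimal sketch of that interleaving, with hypothetical array names
(md0 is the stopped array, md1 the one the iterator moves to):

	/*
	 * stop md0           MD_DELETED set, md0->active drops to 0,
	 *                    the delayed-delete work is scheduled
	 * md_seq_next()      to_put = mddev = v (the device we hold)
	 *   mddev_get(md0)   fails (MD_DELETED), mddev = md0  <- clobbered
	 *   mddev_get(md1)   succeeds, md1->active++
	 *   mddev_put(mddev) decrements md0 again instead of the device
	 *                    we hold, whose reference now leaks
	 *
	 * The device with the leaked reference can never reach
	 * active == 0, so del_gendisk() is never called for it.
	 */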

Song, let me know if you need a description like that in the commit message.

Thanks!
Mariusz
Song Liu Sept. 12, 2023, 8:24 p.m. UTC | #3
Hi Mariusz,

Thanks for the fix!

On Tue, Sep 12, 2023 at 6:40 AM Mariusz Tkaczyk
<mariusz.tkaczyk@linux.intel.com> wrote:
>
> On Tue, 12 Sep 2023 21:25:24 +0800
> Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> > Hi,
> >
> > On 2023/09/12 21:01, Mariusz Tkaczyk wrote:
> > > While working on the changes proposed by Kuai [1], I determined
> > > that mddev->active is continuously decremented for an array marked
> > > by MD_CLOSING. This brought me to md_seq_next(), changed by [2]. I
> > > found the regression there: if mddev_get() fails, we update the
> > > mddev pointer and as a result we _put the failed device.
> >
> > This mddev is decremented while another mddev is increased; that's
> > why AceLan said that a single array can't reproduce the problem.
> >
> > And because mddev->active is leaked, del_gendisk() will never be
> > called for the mddev while closing the array; that's why the user
> > will always see this array, causing an infinite open -> stop array
> > -> close loop for systemd-shutdown.
>
> Ohh, I see the scenario now...
> The first array can be successfully stopped. We mark MD_DELETED and
> proceed with scheduling the wq, but in the middle of that md_seq_next()
> increases active for another array and decrements active for the one
> with MD_DELETED. For this next array we are unable to reach
> active == 0 anymore.
>
> Song, let me know if you need a description like that in the commit
> message.

Yes, please revise the commit log to include this information.

Also, please add Reported-by: and/or Closes: tags.

Thanks again!
Song
Song Liu Sept. 12, 2023, 10:49 p.m. UTC | #4
On Tue, Sep 12, 2023 at 6:02 AM Mariusz Tkaczyk
<mariusz.tkaczyk@linux.intel.com> wrote:
>
> While working on the changes proposed by Kuai [1], I determined that
> mddev->active is continuously decremented for an array marked by
> MD_CLOSING. This brought me to md_seq_next(), changed by [2]. I found
> the regression there: if mddev_get() fails, we update the mddev pointer
> and as a result we _put the failed device.
>
> I isolated the change in md_seq_next() and tested it - the issue is no
> longer reproducible, but I don't see the root cause in this scenario.
> The bug is obvious, so I proceeded with the fix. I will submit the
> MD_CLOSING patches separately.
>
> Put the device which was _get by the previous md_seq_next() call.
> Add a guard against improper mddev_put() usage: it shouldn't be called
> if mddev->active is less than 1.
>
> I didn't convert atomic_t to refcount_t because refcount warns when 0
> is reached, which is likely to happen for mddev->active.
>
> [1] https://lore.kernel.org/linux-raid/028a21df-4397-80aa-c2a5-7c754560f595@gmail.com/T/#m6a534677d9654a4236623661c10646d45419ee1b
> [2] https://bugzilla.kernel.org/show_bug.cgi?id=217798
>
> Fixes: 12a6caf27324 ("md: only delete entries from all_mddevs when the disk is freed")
> Cc: Yu Kuai <yukuai3@huawei.com>
> Cc: AceLan Kao <acelan@gmail.com>
> Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
> ---
>  drivers/md/md.c | 20 +++++++++++---------
>  1 file changed, 11 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0fe7ab6e8ab9..bb654ff62765 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -618,6 +618,12 @@ static void mddev_delayed_delete(struct work_struct *ws);
>
>  void mddev_put(struct mddev *mddev)
>  {
> +       /* Guard for ambiguous put. */
> +       if (unlikely(atomic_read(&mddev->active) < 1)) {
> +               pr_warn("%s: active refcount is lower than 1\n", mdname(mddev));
> +               return;
> +       }
> +

Could you please explain why we need this guard? We should probably fix
the caller that causes this.

>         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
>                 return;
>         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
> @@ -8227,19 +8233,16 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>  {
>         struct list_head *tmp;
>         struct mddev *next_mddev, *mddev = v;
> -       struct mddev *to_put = NULL;

IIUC, all we need is the following:

diff --git i/drivers/md/md.c w/drivers/md/md.c
index 73758b754127..a104a025084d 100644
--- i/drivers/md/md.c
+++ w/drivers/md/md.c
@@ -8265,7 +8265,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
        spin_unlock(&all_mddevs_lock);

        if (to_put)
-               mddev_put(mddev);
+               mddev_put(to_put);
        return next_mddev;

 }

Is this sufficient? If so, how about we ship this first and refactor
the code later in a separate patch?

Thanks,
Song
Mariusz Tkaczyk Sept. 13, 2023, 8:26 a.m. UTC | #5
On Tue, 12 Sep 2023 15:49:12 -0700
Song Liu <song@kernel.org> wrote:

> On Tue, Sep 12, 2023 at 6:02 AM Mariusz Tkaczyk
> <mariusz.tkaczyk@linux.intel.com> wrote:
> >
> > While working on the changes proposed by Kuai [1], I determined that
> > mddev->active is continuously decremented for an array marked by
> > MD_CLOSING. This brought me to md_seq_next(), changed by [2]. I found
> > the regression there: if mddev_get() fails, we update the mddev
> > pointer and as a result we _put the failed device.
> >
> > I isolated the change in md_seq_next() and tested it - the issue is
> > no longer reproducible, but I don't see the root cause in this
> > scenario. The bug is obvious, so I proceeded with the fix. I will
> > submit the MD_CLOSING patches separately.
> >
> > Put the device which was _get by the previous md_seq_next() call.
> > Add a guard against improper mddev_put() usage: it shouldn't be
> > called if mddev->active is less than 1.
> >
> > I didn't convert atomic_t to refcount_t because refcount warns when
> > 0 is reached, which is likely to happen for mddev->active.
> >
> > [1] https://lore.kernel.org/linux-raid/028a21df-4397-80aa-c2a5-7c754560f595@gmail.com/T/#m6a534677d9654a4236623661c10646d45419ee1b
> > [2] https://bugzilla.kernel.org/show_bug.cgi?id=217798
> >
> > Fixes: 12a6caf27324 ("md: only delete entries from all_mddevs when the disk is freed")
> > Cc: Yu Kuai <yukuai3@huawei.com>
> > Cc: AceLan Kao <acelan@gmail.com>
> > Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
> > ---
> >  drivers/md/md.c | 20 +++++++++++---------
> >  1 file changed, 11 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 0fe7ab6e8ab9..bb654ff62765 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -618,6 +618,12 @@ static void mddev_delayed_delete(struct work_struct *ws);
> >
> >  void mddev_put(struct mddev *mddev)
> >  {
> > +       /* Guard for ambiguous put. */
> > +       if (unlikely(atomic_read(&mddev->active) < 1)) {
> > +               pr_warn("%s: active refcount is lower than 1\n", mdname(mddev));
> > +               return;
> > +       }
> > +  
> 
> Could you please explain why we need this guard? We should probably fix
> the caller that causes this.

We have an issue, so I added a warning to catch similar mistakes. Please
note that for refcount_t such a warning is implemented.

If you don't find it useful I can remove it.

> 
> >         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
> >                 return;
> >         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
> > @@ -8227,19 +8233,16 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > {
> >         struct list_head *tmp;
> >         struct mddev *next_mddev, *mddev = v;
> > -       struct mddev *to_put = NULL;  
> 
> IIUC, all we need is the following:
> 
> diff --git i/drivers/md/md.c w/drivers/md/md.c
> index 73758b754127..a104a025084d 100644
> --- i/drivers/md/md.c
> +++ w/drivers/md/md.c
> @@ -8265,7 +8265,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>         spin_unlock(&all_mddevs_lock);
> 
>         if (to_put)
> -               mddev_put(mddev);
> +               mddev_put(to_put);
>         return next_mddev;
> 
>  }
> 
> Is this sufficient? If so, how about we ship this first and refactor
> the code later in a separate patch?

That is correct, it should be sufficient. If you don't want it in one
patch I will drop the refactor, because Kuai has re-implemented it
already.

Anyway, the changes I made are simple and tested; I don't see a risk in
them. Your proposal will make merge conflicts less likely to appear. If
that matters, I can simplify the patch.

Please let me know what you think, then I will send v3.

Thanks,
Mariusz
Song Liu Sept. 13, 2023, 4:22 p.m. UTC | #6
On Wed, Sep 13, 2023 at 1:26 AM Mariusz Tkaczyk
<mariusz.tkaczyk@linux.intel.com> wrote:
>
> On Tue, 12 Sep 2023 15:49:12 -0700
> Song Liu <song@kernel.org> wrote:
>
> > On Tue, Sep 12, 2023 at 6:02 AM Mariusz Tkaczyk
> > <mariusz.tkaczyk@linux.intel.com> wrote:
> > >
> > > While working on the changes proposed by Kuai [1], I determined
> > > that mddev->active is continuously decremented for an array marked
> > > by MD_CLOSING. This brought me to md_seq_next(), changed by [2]. I
> > > found the regression there: if mddev_get() fails, we update the
> > > mddev pointer and as a result we _put the failed device.
> > >
> > > I isolated the change in md_seq_next() and tested it - the issue
> > > is no longer reproducible, but I don't see the root cause in this
> > > scenario. The bug is obvious, so I proceeded with the fix. I will
> > > submit the MD_CLOSING patches separately.
> > >
> > > Put the device which was _get by the previous md_seq_next() call.
> > > Add a guard against improper mddev_put() usage: it shouldn't be
> > > called if mddev->active is less than 1.
> > >
> > > I didn't convert atomic_t to refcount_t because refcount warns
> > > when 0 is reached, which is likely to happen for mddev->active.
> > >
> > > [1] https://lore.kernel.org/linux-raid/028a21df-4397-80aa-c2a5-7c754560f595@gmail.com/T/#m6a534677d9654a4236623661c10646d45419ee1b
> > > [2] https://bugzilla.kernel.org/show_bug.cgi?id=217798
> > >
> > > Fixes: 12a6caf27324 ("md: only delete entries from all_mddevs when the disk is freed")
> > > Cc: Yu Kuai <yukuai3@huawei.com>
> > > Cc: AceLan Kao <acelan@gmail.com>
> > > Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
> > > ---
> > >  drivers/md/md.c | 20 +++++++++++---------
> > >  1 file changed, 11 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > > index 0fe7ab6e8ab9..bb654ff62765 100644
> > > --- a/drivers/md/md.c
> > > +++ b/drivers/md/md.c
> > > @@ -618,6 +618,12 @@ static void mddev_delayed_delete(struct work_struct *ws);
> > >
> > >  void mddev_put(struct mddev *mddev)
> > >  {
> > > +       /* Guard for ambiguous put. */
> > > +       if (unlikely(atomic_read(&mddev->active) < 1)) {
> > > +               pr_warn("%s: active refcount is lower than 1\n", mdname(mddev));
> > > +               return;
> > > +       }
> > > +
> >
> > Could you please explain why we need this guard? We should probably fix
> > the caller that causes this.
>
> We have an issue, so I added a warning to catch similar mistakes. Please
> note that for refcount_t such a warning is implemented.
>
> If you don't find it useful I can remove it.

Let's add it (or use refcount_t) in a separate patch.
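
For reference, a rough sketch of what the refcount_t variant of that
separate patch could look like (an assumption, not code from this
series; refcount_t saturates and WARNs on underflow, which would replace
the open-coded "active < 1" guard):

	/* Sketch only: assumes mddev->active is converted to refcount_t. */
	void mddev_put(struct mddev *mddev)
	{
		/* returns true, with all_mddevs_lock held, only when the
		 * count legitimately drops to zero */
		if (!refcount_dec_and_lock(&mddev->active, &all_mddevs_lock))
			return;
		/* ... existing teardown, then spin_unlock(&all_mddevs_lock) ... */
	}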

> >
> > >         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
> > >                 return;
> > >         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
> > > @@ -8227,19 +8233,16 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > > {
> > >         struct list_head *tmp;
> > >         struct mddev *next_mddev, *mddev = v;
> > > -       struct mddev *to_put = NULL;
> >
> > IIUC, all we need is the following:
> >
> > diff --git i/drivers/md/md.c w/drivers/md/md.c
> > index 73758b754127..a104a025084d 100644
> > --- i/drivers/md/md.c
> > +++ w/drivers/md/md.c
> > @@ -8265,7 +8265,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> >         spin_unlock(&all_mddevs_lock);
> >
> >         if (to_put)
> > -               mddev_put(mddev);
> > +               mddev_put(to_put);
> >         return next_mddev;
> >
> >  }
> >
> > Is this sufficient? If so, how about we ship this first and refactor
> > the code later in a separate patch?
>
> That is correct, it should be sufficient. If you don't want it in one
> patch I will drop the refactor, because Kuai has re-implemented it
> already.
>
> Anyway, the changes I made are simple and tested; I don't see a risk in
> them. Your proposal will make merge conflicts less likely to appear. If
> that matters, I can simplify the patch.

Let's keep the fix as clean as possible. Please test the one-line fix and
resubmit the patch.

Thanks,
Song

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0fe7ab6e8ab9..bb654ff62765 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -618,6 +618,12 @@  static void mddev_delayed_delete(struct work_struct *ws);
 
 void mddev_put(struct mddev *mddev)
 {
+	/* Guard for ambiguous put. */
+	if (unlikely(atomic_read(&mddev->active) < 1)) {
+		pr_warn("%s: active refcount is lower than 1\n", mdname(mddev));
+		return;
+	}
+
 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -8227,19 +8233,16 @@  static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct list_head *tmp;
 	struct mddev *next_mddev, *mddev = v;
-	struct mddev *to_put = NULL;
 
 	++*pos;
-	if (v == (void*)2)
+	if (v == (void *)2)
 		return NULL;
 
 	spin_lock(&all_mddevs_lock);
-	if (v == (void*)1) {
+	if (v == (void *)1)
 		tmp = all_mddevs.next;
-	} else {
-		to_put = mddev;
+	else
 		tmp = mddev->all_mddevs.next;
-	}
 
 	for (;;) {
 		if (tmp == &all_mddevs) {
@@ -8250,12 +8253,11 @@  static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		next_mddev = list_entry(tmp, struct mddev, all_mddevs);
 		if (mddev_get(next_mddev))
 			break;
-		mddev = next_mddev;
-		tmp = mddev->all_mddevs.next;
+		tmp = next_mddev->all_mddevs.next;
 	}
 	spin_unlock(&all_mddevs_lock);
 
-	if (to_put)
+	if (v != (void *)1)
 		mddev_put(mddev);
 	return next_mddev;
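
Under the interleaving sketched earlier, the fixed loop is correct
(illustration only):

	/*
	 * A failed mddev_get() now only advances the cursor
	 * (tmp = next_mddev->all_mddevs.next) and never reassigns mddev,
	 * so mddev still points to v when the loop exits.  The final
	 * mddev_put(mddev) releases the reference taken by the previous
	 * md_seq_next() call, and the MD_DELETED device's refcount is
	 * left untouched.
	 */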