diff mbox series

[v2,08/11] md: add atomic mode switching in RAID 1/10

Message ID 47c035c3e741418b80eb6b73d96e7e92@kioxia.com (mailing list archive)
State Changes Requested, archived
Headers show
Series [v2,01/11] md: add infra for active_aligned_reads changes | expand

Commit Message

tada keisuke April 18, 2024, 5:44 a.m. UTC
This patch depends on patch 07.

All rdevs running in RAID 1/10 switch nr_pending to atomic mode.
The value of nr_pending is read in a normal operation (choose_best_rdev()).
Therefore, nr_pending must always be consistent.

Signed-off-by: Keisuke TADA <keisuke1.tada@kioxia.com>
Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@kioxia.com>
---
 drivers/md/md.h     | 14 ++++++++++++++
 drivers/md/raid1.c  |  7 +++++++
 drivers/md/raid10.c |  4 ++++
 3 files changed, 25 insertions(+)

Comments

Paul E Luse April 16, 2024, 2:38 p.m. UTC | #1
On Thu, 18 Apr 2024 14:39:53 +0800
Yu Kuai <yukuai1@huaweicloud.com> wrote:

> Hi,
> 
> 在 2024/04/18 13:44, tada keisuke 写道:
> > This patch depends on patch 07.
> > 
> > All rdevs running in RAID 1/10 switch nr_pending to atomic mode.
> > The value of nr_pending is read in a normal operation
> > (choose_best_rdev()). Therefore, nr_pending must always be
> > consistent.
> > 
> > Signed-off-by: Keisuke TADA <keisuke1.tada@kioxia.com>
> > Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@kioxia.com>
> > ---
> >   drivers/md/md.h     | 14 ++++++++++++++
> >   drivers/md/raid1.c  |  7 +++++++
> >   drivers/md/raid10.c |  4 ++++
> >   3 files changed, 25 insertions(+)
> > 
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index ab09e312c9bb..57b09b567ffa 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -236,6 +236,20 @@ static inline unsigned long
> > nr_pending_read(struct md_rdev *rdev) return
> > atomic_long_read(&rdev->nr_pending.data->count); }
> >   
> > +static inline bool nr_pending_is_percpu_mode(struct md_rdev *rdev)
> > +{
> > +	unsigned long __percpu *percpu_count;
> > +
> > +	return __ref_is_percpu(&rdev->nr_pending, &percpu_count);
> > +}
> > +
> > +static inline bool nr_pending_is_atomic_mode(struct md_rdev *rdev)
> > +{
> > +	unsigned long __percpu *percpu_count;
> > +
> > +	return !__ref_is_percpu(&rdev->nr_pending, &percpu_count);
> > +}
> > +
> >   static inline int is_badblock(struct md_rdev *rdev, sector_t s,
> > int sectors, sector_t *first_bad, int *bad_sectors)
> >   {
> > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > index 12318fb15a88..c38ae13aadab 100644
> > --- a/drivers/md/raid1.c
> > +++ b/drivers/md/raid1.c
> > @@ -784,6 +784,7 @@ static int choose_best_rdev(struct r1conf
> > *conf, struct r1bio *r1_bio) if (ctl.readable_disks++ == 1)
> >   			set_bit(R1BIO_FailFast, &r1_bio->state);
> >   
> > +		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
> >   		pending = nr_pending_read(rdev);
> >   		dist = abs(r1_bio->sector -
> > conf->mirrors[disk].head_position); 
> > @@ -1930,6 +1931,7 @@ static int raid1_add_disk(struct mddev
> > *mddev, struct md_rdev *rdev) if (err)
> >   				return err;
> >   
> > +
> > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > raid1_add_conf(conf, rdev, mirror, false); /* As all devices are
> > equivalent, we don't need a full recovery
> >   			 * if this was recently any drive of the
> > array @@ -1949,6 +1951,7 @@ static int raid1_add_disk(struct mddev
> > *mddev, struct md_rdev *rdev) set_bit(Replacement, &rdev->flags);
> >   		raid1_add_conf(conf, rdev, repl_slot, true);
> >   		err = 0;
> > +
> > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);  
> 
> I don't understand what's the point here, 'nr_pending' will be used
> when the rdev issuing IO, and it's always used as atomic mode, there
> is no difference.
> 
> Consider that 'nr_pending' must be read from IO fast path, use it as
> atomic is something we must accept. Unless someone comes up with a
> plan to avoid reading 'inflight' counter from fast path like generic
> block layer, it's not ok to me to switch to percpu_ref for now.
> 
> +CC Paul
> 
> HI, Paul, perhaps you RR mode doesn't need such 'inflight' counter
> anymore?
> 

I too am struggling to see the benefit but am curious enough that I
will run some tests on my own as I have fresh baseline RAID1 large sweep
performance data ready right now.

So my WIP round robin patch won't need nr_pedning for SSDs but I think
it will for HDD plus I need a new atomic counter for round robin for
SSDs anyway but I see no perfomrnace impact so far from adding it.

-Paul

> Thanks,
> Kuai
> 
> >   		conf->fullsync = 1;
> >   	}
> >   
> > @@ -3208,6 +3211,7 @@ static void raid1_free(struct mddev *mddev,
> > void *priv); static int raid1_run(struct mddev *mddev)
> >   {
> >   	struct r1conf *conf;
> > +	struct md_rdev *rdev;
> >   	int i;
> >   	int ret;
> >   
> > @@ -3269,6 +3273,9 @@ static int raid1_run(struct mddev *mddev)
> >   	/*
> >   	 * Ok, everything is just fine now
> >   	 */
> > +	rdev_for_each(rdev, mddev) {
> > +
> > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > +	}
> >   	rcu_assign_pointer(mddev->thread, conf->thread);
> >   	rcu_assign_pointer(conf->thread, NULL);
> >   	mddev->private = conf;
> > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > index b91dd6c0be5a..66896a1076e1 100644
> > --- a/drivers/md/raid10.c
> > +++ b/drivers/md/raid10.c
> > @@ -808,6 +808,7 @@ static struct md_rdev *read_balance(struct
> > r10conf *conf, 
> >   		nonrot = bdev_nonrot(rdev->bdev);
> >   		has_nonrot_disk |= nonrot;
> > +		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
> >   		pending = nr_pending_read(rdev);
> >   		if (min_pending > pending && nonrot) {
> >   			min_pending = pending;
> > @@ -2113,6 +2114,7 @@ static int raid10_add_disk(struct mddev
> > *mddev, struct md_rdev *rdev) p->recovery_disabled =
> > mddev->recovery_disabled - 1; rdev->raid_disk = mirror;
> >   		err = 0;
> > +
> > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending); if
> > (rdev->saved_raid_disk != mirror) conf->fullsync = 1;
> >   		WRITE_ONCE(p->rdev, rdev);
> > @@ -2127,6 +2129,7 @@ static int raid10_add_disk(struct mddev
> > *mddev, struct md_rdev *rdev) err = mddev_stack_new_rdev(mddev,
> > rdev); if (err)
> >   			return err;
> > +
> > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending); conf->fullsync
> > = 1; WRITE_ONCE(p->replacement, rdev);
> >   	}
> > @@ -4028,6 +4031,7 @@ static int raid10_run(struct mddev *mddev)
> >   	rdev_for_each(rdev, mddev) {
> >   		long long diff;
> >   
> > +
> > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending); disk_idx =
> > rdev->raid_disk; if (disk_idx < 0)
> >   			continue;
> >   
> 
>
Paul E Luse April 16, 2024, 10:41 p.m. UTC | #2
On Tue, 16 Apr 2024 07:38:06 -0700
Paul E Luse <paul.e.luse@linux.intel.com> wrote:

> On Thu, 18 Apr 2024 14:39:53 +0800
> Yu Kuai <yukuai1@huaweicloud.com> wrote:
> 
> > Hi,
> > 
> > 在 2024/04/18 13:44, tada keisuke 写道:  
> > > This patch depends on patch 07.
> > > 
> > > All rdevs running in RAID 1/10 switch nr_pending to atomic mode.
> > > The value of nr_pending is read in a normal operation
> > > (choose_best_rdev()). Therefore, nr_pending must always be
> > > consistent.
> > > 
> > > Signed-off-by: Keisuke TADA <keisuke1.tada@kioxia.com>
> > > Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@kioxia.com>
> > > ---
> > >   drivers/md/md.h     | 14 ++++++++++++++
> > >   drivers/md/raid1.c  |  7 +++++++
> > >   drivers/md/raid10.c |  4 ++++
> > >   3 files changed, 25 insertions(+)
> > > 
> > > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > > index ab09e312c9bb..57b09b567ffa 100644
> > > --- a/drivers/md/md.h
> > > +++ b/drivers/md/md.h
> > > @@ -236,6 +236,20 @@ static inline unsigned long
> > > nr_pending_read(struct md_rdev *rdev) return
> > > atomic_long_read(&rdev->nr_pending.data->count); }
> > >   
> > > +static inline bool nr_pending_is_percpu_mode(struct md_rdev
> > > *rdev) +{
> > > +	unsigned long __percpu *percpu_count;
> > > +
> > > +	return __ref_is_percpu(&rdev->nr_pending, &percpu_count);
> > > +}
> > > +
> > > +static inline bool nr_pending_is_atomic_mode(struct md_rdev
> > > *rdev) +{
> > > +	unsigned long __percpu *percpu_count;
> > > +
> > > +	return !__ref_is_percpu(&rdev->nr_pending,
> > > &percpu_count); +}
> > > +
> > >   static inline int is_badblock(struct md_rdev *rdev, sector_t s,
> > > int sectors, sector_t *first_bad, int *bad_sectors)
> > >   {
> > > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > > index 12318fb15a88..c38ae13aadab 100644
> > > --- a/drivers/md/raid1.c
> > > +++ b/drivers/md/raid1.c
> > > @@ -784,6 +784,7 @@ static int choose_best_rdev(struct r1conf
> > > *conf, struct r1bio *r1_bio) if (ctl.readable_disks++ == 1)
> > >   			set_bit(R1BIO_FailFast, &r1_bio->state);
> > >   
> > > +		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
> > >   		pending = nr_pending_read(rdev);
> > >   		dist = abs(r1_bio->sector -
> > > conf->mirrors[disk].head_position); 
> > > @@ -1930,6 +1931,7 @@ static int raid1_add_disk(struct mddev
> > > *mddev, struct md_rdev *rdev) if (err)
> > >   				return err;
> > >   
> > > +
> > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > raid1_add_conf(conf, rdev, mirror, false); /* As all devices are
> > > equivalent, we don't need a full recovery
> > >   			 * if this was recently any drive of the
> > > array @@ -1949,6 +1951,7 @@ static int raid1_add_disk(struct mddev
> > > *mddev, struct md_rdev *rdev) set_bit(Replacement, &rdev->flags);
> > >   		raid1_add_conf(conf, rdev, repl_slot, true);
> > >   		err = 0;
> > > +
> > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);    
> > 
> > I don't understand what's the point here, 'nr_pending' will be used
> > when the rdev issuing IO, and it's always used as atomic mode, there
> > is no difference.
> > 
> > Consider that 'nr_pending' must be read from IO fast path, use it as
> > atomic is something we must accept. Unless someone comes up with a
> > plan to avoid reading 'inflight' counter from fast path like generic
> > block layer, it's not ok to me to switch to percpu_ref for now.
> > 
> > +CC Paul
> > 
> > HI, Paul, perhaps you RR mode doesn't need such 'inflight' counter
> > anymore?
> >   
> 
> I too am struggling to see the benefit but am curious enough that I
> will run some tests on my own as I have fresh baseline RAID1 large
> sweep performance data ready right now.
> 
> So my WIP round robin patch won't need nr_pedning for SSDs but I think
> it will for HDD plus I need a new atomic counter for round robin for
> SSDs anyway but I see no perfomrnace impact so far from adding it.
> 
> -Paul
> 

I can run more if others are interested (RAID5 or HDD for example) but
at least for RAID1 there's really no difference.  This was a quick run,
just 40 sec each, 16 jobs and the rest ofthe fio params are on the
charts. 2 disk RAID1. THe baseline is 6.8.0 from the md repo.

Using my favorite drives, of course, KIOXIA KCMYDVUG3T20 :)

Here's the results: https://photos.app.goo.gl/Avyw64eXCqWFWrs78

NOTE:  There are few small randoms where it appears to help but I
assumed that was because these are small randoms with very short run
times.  SO I reran the 4K mixed rw randoms with 5 minute run time and
that chart is at the very bottom and shows that over longer duration
its a wash, there's no clear winner.  I'm sure an even longer run would
show more consistently close results.

> > Thanks,
> > Kuai
> >   
> > >   		conf->fullsync = 1;
> > >   	}
> > >   
> > > @@ -3208,6 +3211,7 @@ static void raid1_free(struct mddev *mddev,
> > > void *priv); static int raid1_run(struct mddev *mddev)
> > >   {
> > >   	struct r1conf *conf;
> > > +	struct md_rdev *rdev;
> > >   	int i;
> > >   	int ret;
> > >   
> > > @@ -3269,6 +3273,9 @@ static int raid1_run(struct mddev *mddev)
> > >   	/*
> > >   	 * Ok, everything is just fine now
> > >   	 */
> > > +	rdev_for_each(rdev, mddev) {
> > > +
> > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > +	}
> > >   	rcu_assign_pointer(mddev->thread, conf->thread);
> > >   	rcu_assign_pointer(conf->thread, NULL);
> > >   	mddev->private = conf;
> > > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > > index b91dd6c0be5a..66896a1076e1 100644
> > > --- a/drivers/md/raid10.c
> > > +++ b/drivers/md/raid10.c
> > > @@ -808,6 +808,7 @@ static struct md_rdev *read_balance(struct
> > > r10conf *conf, 
> > >   		nonrot = bdev_nonrot(rdev->bdev);
> > >   		has_nonrot_disk |= nonrot;
> > > +		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
> > >   		pending = nr_pending_read(rdev);
> > >   		if (min_pending > pending && nonrot) {
> > >   			min_pending = pending;
> > > @@ -2113,6 +2114,7 @@ static int raid10_add_disk(struct mddev
> > > *mddev, struct md_rdev *rdev) p->recovery_disabled =
> > > mddev->recovery_disabled - 1; rdev->raid_disk = mirror;
> > >   		err = 0;
> > > +
> > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending); if
> > > (rdev->saved_raid_disk != mirror) conf->fullsync = 1;
> > >   		WRITE_ONCE(p->rdev, rdev);
> > > @@ -2127,6 +2129,7 @@ static int raid10_add_disk(struct mddev
> > > *mddev, struct md_rdev *rdev) err = mddev_stack_new_rdev(mddev,
> > > rdev); if (err)
> > >   			return err;
> > > +
> > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > conf->fullsync = 1; WRITE_ONCE(p->replacement, rdev);
> > >   	}
> > > @@ -4028,6 +4031,7 @@ static int raid10_run(struct mddev *mddev)
> > >   	rdev_for_each(rdev, mddev) {
> > >   		long long diff;
> > >   
> > > +
> > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending); disk_idx =
> > > rdev->raid_disk; if (disk_idx < 0)
> > >   			continue;
> > >     
> > 
> >   
>
Paul E Luse April 25, 2024, 9:12 a.m. UTC | #3
On Fri, 26 Apr 2024 08:01:55 +0000
tada keisuke <keisuke1.tada@kioxia.com> wrote:

> > > > Hi,
> > > >
> > > > 在 2024/04/18 13:44, tada keisuke 写道:
> > > > > This patch depends on patch 07.
> > > > >
> > > > > All rdevs running in RAID 1/10 switch nr_pending to atomic
> > > > > mode. The value of nr_pending is read in a normal operation
> > > > > (choose_best_rdev()). Therefore, nr_pending must always be
> > > > > consistent.
> > > > >
> > > > > Signed-off-by: Keisuke TADA <keisuke1.tada@kioxia.com>
> > > > > Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@kioxia.com>
> > > > > ---
> > > > >   drivers/md/md.h     | 14 ++++++++++++++
> > > > >   drivers/md/raid1.c  |  7 +++++++
> > > > >   drivers/md/raid10.c |  4 ++++
> > > > >   3 files changed, 25 insertions(+)
> > > > >
> > > > > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > > > > index ab09e312c9bb..57b09b567ffa 100644
> > > > > --- a/drivers/md/md.h
> > > > > +++ b/drivers/md/md.h
> > > > > @@ -236,6 +236,20 @@ static inline unsigned long
> > > > > nr_pending_read(struct md_rdev *rdev) return
> > > > > atomic_long_read(&rdev->nr_pending.data->count); }
> > > > >
> > > > > +static inline bool nr_pending_is_percpu_mode(struct md_rdev
> > > > > *rdev) +{
> > > > > +	unsigned long __percpu *percpu_count;
> > > > > +
> > > > > +	return __ref_is_percpu(&rdev->nr_pending,
> > > > > &percpu_count); +}
> > > > > +
> > > > > +static inline bool nr_pending_is_atomic_mode(struct md_rdev
> > > > > *rdev) +{
> > > > > +	unsigned long __percpu *percpu_count;
> > > > > +
> > > > > +	return !__ref_is_percpu(&rdev->nr_pending,
> > > > > &percpu_count); +}
> > > > > +
> > > > >   static inline int is_badblock(struct md_rdev *rdev,
> > > > > sector_t s, int sectors, sector_t *first_bad, int
> > > > > *bad_sectors) {
> > > > > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > > > > index 12318fb15a88..c38ae13aadab 100644
> > > > > --- a/drivers/md/raid1.c
> > > > > +++ b/drivers/md/raid1.c
> > > > > @@ -784,6 +784,7 @@ static int choose_best_rdev(struct r1conf
> > > > > *conf, struct r1bio *r1_bio) if (ctl.readable_disks++ == 1)
> > > > >   			set_bit(R1BIO_FailFast,
> > > > > &r1_bio->state);
> > > > >
> > > > > +
> > > > > WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev)); pending =
> > > > > nr_pending_read(rdev); dist = abs(r1_bio->sector -
> > > > > conf->mirrors[disk].head_position);
> > > > > @@ -1930,6 +1931,7 @@ static int raid1_add_disk(struct mddev
> > > > > *mddev, struct md_rdev *rdev) if (err)
> > > > >   				return err;
> > > > >
> > > > > +
> > > > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > > > raid1_add_conf(conf, rdev, mirror, false); /* As all devices
> > > > > are equivalent, we don't need a full recovery
> > > > >   			 * if this was recently any drive
> > > > > of the array @@ -1949,6 +1951,7 @@ static int
> > > > > raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> > > > > set_bit(Replacement, &rdev->flags); raid1_add_conf(conf,
> > > > > rdev, repl_slot, true); err = 0;
> > > > > +
> > > > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > >
> > > > I don't understand what's the point here, 'nr_pending' will be
> > > > used when the rdev issuing IO, and it's always used as atomic
> > > > mode, there is no difference.
> > > >
> > > > Consider that 'nr_pending' must be read from IO fast path, use
> > > > it as atomic is something we must accept. Unless someone comes
> > > > up with a plan to avoid reading 'inflight' counter from fast
> > > > path like generic block layer, it's not ok to me to switch to
> > > > percpu_ref for now.
> 
> The main purpose of this patchset is to improve RAID5 performance.
> In the current RAID 1/10 design, the value of nr_pending is
> intentionally always in atomic mode because it must be read in IO
> fast path. Unless the design of reading the value of nr_pending has
> changed, I believe that this patchset is a reasonable design and
> RAID1 performance is about the same as atomic_t before this patchset
> was applied. Paul's results also show that.
> 
> Best Regards,
> Keisuke

I only tested RAID1 and do believe that simpler is better so would
prefer not to change the RAID1 code.  I can run some RAID5 tests on
this as well unless you have some wide sweeping results? Would love to
see more RAID5 performance improvments.  Shushu has another RAID5 perf
patch out there that I think has some very good potential, it would be
good if you could take a look at that one.

-Paul

> 
> > > > +CC Paul
> > > >
> > > > HI, Paul, perhaps you RR mode doesn't need such 'inflight'
> > > > counter anymore?
> > > >
> > >
> > > I too am struggling to see the benefit but am curious enough that
> > > I will run some tests on my own as I have fresh baseline RAID1
> > > large sweep performance data ready right now.
> > >
> > > So my WIP round robin patch won't need nr_pedning for SSDs but I
> > > think it will for HDD plus I need a new atomic counter for round
> > > robin for SSDs anyway but I see no perfomrnace impact so far from
> > > adding it.
> > >
> > > -Paul
> > >
> > 
> > I can run more if others are interested (RAID5 or HDD for example)
> > but at least for RAID1 there's really no difference.  This was a
> > quick run, just 40 sec each, 16 jobs and the rest ofthe fio params
> > are on the charts. 2 disk RAID1. THe baseline is 6.8.0 from the md
> > repo. Using my favorite drives, of course, KIOXIA KCMYDVUG3T20 :)
> > 
> > Here's the results: https://photos.app.goo.gl/Avyw64eXCqWFWrs78
> > 
> > NOTE:  There are few small randoms where it appears to help but I
> > assumed that was because these are small randoms with very short run
> > times.  SO I reran the 4K mixed rw randoms with 5 minute run time
> > and that chart is at the very bottom and shows that over longer
> > duration its a wash, there's no clear winner.  I'm sure an even
> > longer run would show more consistently close results.
> 
>
tada keisuke April 26, 2024, 8:01 a.m. UTC | #4
> > > Hi,
> > >
> > > 在 2024/04/18 13:44, tada keisuke 写道:
> > > > This patch depends on patch 07.
> > > >
> > > > All rdevs running in RAID 1/10 switch nr_pending to atomic mode.
> > > > The value of nr_pending is read in a normal operation
> > > > (choose_best_rdev()). Therefore, nr_pending must always be
> > > > consistent.
> > > >
> > > > Signed-off-by: Keisuke TADA <keisuke1.tada@kioxia.com>
> > > > Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@kioxia.com>
> > > > ---
> > > >   drivers/md/md.h     | 14 ++++++++++++++
> > > >   drivers/md/raid1.c  |  7 +++++++
> > > >   drivers/md/raid10.c |  4 ++++
> > > >   3 files changed, 25 insertions(+)
> > > >
> > > > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > > > index ab09e312c9bb..57b09b567ffa 100644
> > > > --- a/drivers/md/md.h
> > > > +++ b/drivers/md/md.h
> > > > @@ -236,6 +236,20 @@ static inline unsigned long
> > > > nr_pending_read(struct md_rdev *rdev) return
> > > > atomic_long_read(&rdev->nr_pending.data->count); }
> > > >
> > > > +static inline bool nr_pending_is_percpu_mode(struct md_rdev
> > > > *rdev) +{
> > > > +	unsigned long __percpu *percpu_count;
> > > > +
> > > > +	return __ref_is_percpu(&rdev->nr_pending, &percpu_count);
> > > > +}
> > > > +
> > > > +static inline bool nr_pending_is_atomic_mode(struct md_rdev
> > > > *rdev) +{
> > > > +	unsigned long __percpu *percpu_count;
> > > > +
> > > > +	return !__ref_is_percpu(&rdev->nr_pending,
> > > > &percpu_count); +}
> > > > +
> > > >   static inline int is_badblock(struct md_rdev *rdev, sector_t s,
> > > > int sectors, sector_t *first_bad, int *bad_sectors)
> > > >   {
> > > > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > > > index 12318fb15a88..c38ae13aadab 100644
> > > > --- a/drivers/md/raid1.c
> > > > +++ b/drivers/md/raid1.c
> > > > @@ -784,6 +784,7 @@ static int choose_best_rdev(struct r1conf
> > > > *conf, struct r1bio *r1_bio) if (ctl.readable_disks++ == 1)
> > > >   			set_bit(R1BIO_FailFast, &r1_bio->state);
> > > >
> > > > +		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
> > > >   		pending = nr_pending_read(rdev);
> > > >   		dist = abs(r1_bio->sector -
> > > > conf->mirrors[disk].head_position);
> > > > @@ -1930,6 +1931,7 @@ static int raid1_add_disk(struct mddev
> > > > *mddev, struct md_rdev *rdev) if (err)
> > > >   				return err;
> > > >
> > > > +
> > > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > > raid1_add_conf(conf, rdev, mirror, false); /* As all devices are
> > > > equivalent, we don't need a full recovery
> > > >   			 * if this was recently any drive of the
> > > > array @@ -1949,6 +1951,7 @@ static int raid1_add_disk(struct mddev
> > > > *mddev, struct md_rdev *rdev) set_bit(Replacement, &rdev->flags);
> > > >   		raid1_add_conf(conf, rdev, repl_slot, true);
> > > >   		err = 0;
> > > > +
> > > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > >
> > > I don't understand what's the point here, 'nr_pending' will be used
> > > when the rdev issuing IO, and it's always used as atomic mode, there
> > > is no difference.
> > >
> > > Consider that 'nr_pending' must be read from IO fast path, use it as
> > > atomic is something we must accept. Unless someone comes up with a
> > > plan to avoid reading 'inflight' counter from fast path like generic
> > > block layer, it's not ok to me to switch to percpu_ref for now.

The main purpose of this patchset is to improve RAID5 performance.
In the current RAID 1/10 design, the value of nr_pending is intentionally always in atomic mode because it must be read in IO fast path.
Unless the design of reading the value of nr_pending has changed, I believe that this patchset is a reasonable design and RAID1 performance is about the same as atomic_t before this patchset was applied.
Paul's results also show that.

Best Regards,
Keisuke

> > > +CC Paul
> > >
> > > HI, Paul, perhaps you RR mode doesn't need such 'inflight' counter
> > > anymore?
> > >
> >
> > I too am struggling to see the benefit but am curious enough that I
> > will run some tests on my own as I have fresh baseline RAID1 large
> > sweep performance data ready right now.
> >
> > So my WIP round robin patch won't need nr_pedning for SSDs but I think
> > it will for HDD plus I need a new atomic counter for round robin for
> > SSDs anyway but I see no perfomrnace impact so far from adding it.
> >
> > -Paul
> >
> 
> I can run more if others are interested (RAID5 or HDD for example) but
> at least for RAID1 there's really no difference.  This was a quick run,
> just 40 sec each, 16 jobs and the rest ofthe fio params are on the
> charts. 2 disk RAID1. THe baseline is 6.8.0 from the md repo.
> Using my favorite drives, of course, KIOXIA KCMYDVUG3T20 :)
> 
> Here's the results: https://photos.app.goo.gl/Avyw64eXCqWFWrs78
> 
> NOTE:  There are few small randoms where it appears to help but I
> assumed that was because these are small randoms with very short run
> times.  SO I reran the 4K mixed rw randoms with 5 minute run time and
> that chart is at the very bottom and shows that over longer duration
> its a wash, there's no clear winner.  I'm sure an even longer run would
> show more consistently close results.
tada keisuke May 9, 2024, 6:46 a.m. UTC | #5
> > > > > Hi,
> > > > >
> > > > > 在 2024/04/18 13:44, tada keisuke 写道:
> > > > > > This patch depends on patch 07.
> > > > > >
> > > > > > All rdevs running in RAID 1/10 switch nr_pending to atomic
> > > > > > mode. The value of nr_pending is read in a normal operation
> > > > > > (choose_best_rdev()). Therefore, nr_pending must always be
> > > > > > consistent.
> > > > > >
> > > > > > Signed-off-by: Keisuke TADA <keisuke1.tada@kioxia.com>
> > > > > > Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@kioxia.com>
> > > > > > ---
> > > > > >   drivers/md/md.h     | 14 ++++++++++++++
> > > > > >   drivers/md/raid1.c  |  7 +++++++
> > > > > >   drivers/md/raid10.c |  4 ++++
> > > > > >   3 files changed, 25 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > > > > > index ab09e312c9bb..57b09b567ffa 100644
> > > > > > --- a/drivers/md/md.h
> > > > > > +++ b/drivers/md/md.h
> > > > > > @@ -236,6 +236,20 @@ static inline unsigned long
> > > > > > nr_pending_read(struct md_rdev *rdev) return
> > > > > > atomic_long_read(&rdev->nr_pending.data->count); }
> > > > > >
> > > > > > +static inline bool nr_pending_is_percpu_mode(struct md_rdev
> > > > > > *rdev) +{
> > > > > > +	unsigned long __percpu *percpu_count;
> > > > > > +
> > > > > > +	return __ref_is_percpu(&rdev->nr_pending,
> > > > > > &percpu_count); +}
> > > > > > +
> > > > > > +static inline bool nr_pending_is_atomic_mode(struct md_rdev
> > > > > > *rdev) +{
> > > > > > +	unsigned long __percpu *percpu_count;
> > > > > > +
> > > > > > +	return !__ref_is_percpu(&rdev->nr_pending,
> > > > > > &percpu_count); +}
> > > > > > +
> > > > > >   static inline int is_badblock(struct md_rdev *rdev,
> > > > > > sector_t s, int sectors, sector_t *first_bad, int
> > > > > > *bad_sectors) {
> > > > > > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > > > > > index 12318fb15a88..c38ae13aadab 100644
> > > > > > --- a/drivers/md/raid1.c
> > > > > > +++ b/drivers/md/raid1.c
> > > > > > @@ -784,6 +784,7 @@ static int choose_best_rdev(struct r1conf
> > > > > > *conf, struct r1bio *r1_bio) if (ctl.readable_disks++ == 1)
> > > > > >   			set_bit(R1BIO_FailFast,
> > > > > > &r1_bio->state);
> > > > > >
> > > > > > +
> > > > > > WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev)); pending =
> > > > > > nr_pending_read(rdev); dist = abs(r1_bio->sector -
> > > > > > conf->mirrors[disk].head_position);
> > > > > > @@ -1930,6 +1931,7 @@ static int raid1_add_disk(struct mddev
> > > > > > *mddev, struct md_rdev *rdev) if (err)
> > > > > >   				return err;
> > > > > >
> > > > > > +
> > > > > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > > > > raid1_add_conf(conf, rdev, mirror, false); /* As all devices
> > > > > > are equivalent, we don't need a full recovery
> > > > > >   			 * if this was recently any drive
> > > > > > of the array @@ -1949,6 +1951,7 @@ static int
> > > > > > raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> > > > > > set_bit(Replacement, &rdev->flags); raid1_add_conf(conf,
> > > > > > rdev, repl_slot, true); err = 0;
> > > > > > +
> > > > > > percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > > > >
> > > > > I don't understand what's the point here, 'nr_pending' will be
> > > > > used when the rdev issuing IO, and it's always used as atomic
> > > > > mode, there is no difference.
> > > > >
> > > > > Consider that 'nr_pending' must be read from IO fast path, use
> > > > > it as atomic is something we must accept. Unless someone comes
> > > > > up with a plan to avoid reading 'inflight' counter from fast
> > > > > path like generic block layer, it's not ok to me to switch to
> > > > > percpu_ref for now.
> >
> > The main purpose of this patchset is to improve RAID5 performance.
> > In the current RAID 1/10 design, the value of nr_pending is
> > intentionally always in atomic mode because it must be read in IO
> > fast path. Unless the design of reading the value of nr_pending has
> > changed, I believe that this patchset is a reasonable design and
> > RAID1 performance is about the same as atomic_t before this patchset
> > was applied. Paul's results also show that.
> >
> > Best Regards,
> > Keisuke
> 
> I only tested RAID1 and do believe that simpler is better so would
> prefer not to change the RAID1 code.  I can run some RAID5 tests on
> this as well unless you have some wide sweeping results? Would love to
> see more RAID5 performance improvments.  Shushu has another RAID5 perf
> patch out there that I think has some very good potential, it would be
> good if you could take a look at that one.
> 
> -Paul

We are planning to measure the performance of RAID5 using SSDs.

Keisuke
diff mbox series

Patch

diff --git a/drivers/md/md.h b/drivers/md/md.h
index ab09e312c9bb..57b09b567ffa 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -236,6 +236,20 @@  static inline unsigned long nr_pending_read(struct md_rdev *rdev)
 	return atomic_long_read(&rdev->nr_pending.data->count);
 }
 
+static inline bool nr_pending_is_percpu_mode(struct md_rdev *rdev)
+{
+	unsigned long __percpu *percpu_count;
+
+	return __ref_is_percpu(&rdev->nr_pending, &percpu_count);
+}
+
+static inline bool nr_pending_is_atomic_mode(struct md_rdev *rdev)
+{
+	unsigned long __percpu *percpu_count;
+
+	return !__ref_is_percpu(&rdev->nr_pending, &percpu_count);
+}
+
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
 			      sector_t *first_bad, int *bad_sectors)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 12318fb15a88..c38ae13aadab 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -784,6 +784,7 @@  static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
 		if (ctl.readable_disks++ == 1)
 			set_bit(R1BIO_FailFast, &r1_bio->state);
 
+		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
 		pending = nr_pending_read(rdev);
 		dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
 
@@ -1930,6 +1931,7 @@  static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			if (err)
 				return err;
 
+			percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
 			raid1_add_conf(conf, rdev, mirror, false);
 			/* As all devices are equivalent, we don't need a full recovery
 			 * if this was recently any drive of the array
@@ -1949,6 +1951,7 @@  static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		set_bit(Replacement, &rdev->flags);
 		raid1_add_conf(conf, rdev, repl_slot, true);
 		err = 0;
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
 		conf->fullsync = 1;
 	}
 
@@ -3208,6 +3211,7 @@  static void raid1_free(struct mddev *mddev, void *priv);
 static int raid1_run(struct mddev *mddev)
 {
 	struct r1conf *conf;
+	struct md_rdev *rdev;
 	int i;
 	int ret;
 
@@ -3269,6 +3273,9 @@  static int raid1_run(struct mddev *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
+	rdev_for_each(rdev, mddev) {
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
+	}
 	rcu_assign_pointer(mddev->thread, conf->thread);
 	rcu_assign_pointer(conf->thread, NULL);
 	mddev->private = conf;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b91dd6c0be5a..66896a1076e1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -808,6 +808,7 @@  static struct md_rdev *read_balance(struct r10conf *conf,
 
 		nonrot = bdev_nonrot(rdev->bdev);
 		has_nonrot_disk |= nonrot;
+		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
 		pending = nr_pending_read(rdev);
 		if (min_pending > pending && nonrot) {
 			min_pending = pending;
@@ -2113,6 +2114,7 @@  static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		p->recovery_disabled = mddev->recovery_disabled - 1;
 		rdev->raid_disk = mirror;
 		err = 0;
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
 		if (rdev->saved_raid_disk != mirror)
 			conf->fullsync = 1;
 		WRITE_ONCE(p->rdev, rdev);
@@ -2127,6 +2129,7 @@  static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		err = mddev_stack_new_rdev(mddev, rdev);
 		if (err)
 			return err;
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
 		conf->fullsync = 1;
 		WRITE_ONCE(p->replacement, rdev);
 	}
@@ -4028,6 +4031,7 @@  static int raid10_run(struct mddev *mddev)
 	rdev_for_each(rdev, mddev) {
 		long long diff;
 
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0)
 			continue;