@@ -8588,6 +8588,14 @@ void md_cluster_stop(struct mddev *mddev)
module_put(md_cluster_mod);
}
+/* ask the personality whether normal io is being starved by sync io */
+static bool is_mddev_io_starve(struct mddev *mddev)
+{
+ if (mddev->pers->io_starve)
+ return mddev->pers->io_starve(mddev);
+ return false;
+}
+
static int is_mddev_idle(struct mddev *mddev, int init)
{
struct md_rdev *rdev;
@@ -9219,6 +9227,8 @@ void md_do_sync(struct md_thread *thread)
wait_event(mddev->recovery_wait,
!atomic_read(&mddev->recovery_active));
}
+ if (is_mddev_io_starve(mddev))
+ msleep(500);
}
}
pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
@@ -704,6 +704,7 @@ struct md_personality
struct list_head list;
struct module *owner;
bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
+ bool (*io_starve)(struct mddev *mddev); /* is normal io starved by sync io? */
/*
* start up works that do NOT require md_thread. tasks that
* requires md_thread should go into start()
@@ -852,6 +852,9 @@ struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
if (flags & R5_GAS_NOBLOCK)
break;
+ if (flags & R5_GAS_IO)
+ set_bit(R5_IO_STARVE, &conf->cache_state); /* normal io blocked on stripe */
+
set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
r5l_wake_reclaim(conf->log, 0);
@@ -5961,8 +5964,10 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
flags |= R5_GAS_PREVIOUS;
if (bi->bi_opf & REQ_RAHEAD)
flags |= R5_GAS_NOBLOCK;
+ flags |= R5_GAS_IO; /* normal io path, may signal starvation */
sh = raid5_get_active_stripe(conf, ctx, new_sector, flags);
if (unlikely(!sh)) {
+ set_bit(R5_IO_STARVE, &conf->cache_state);
/* cannot get stripe, just give-up */
bi->bi_status = BLK_STS_IOERR;
return STRIPE_FAIL;
@@ -6066,6 +6071,15 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
return r_sector + sectors_per_chunk - chunk_offset;
}
+static bool raid5_io_starve(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ /*
+ * Report and clear the starvation flag set on the io path.
+ */
+ return test_and_clear_bit(R5_IO_STARVE, &conf->cache_state);
+}
+
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
@@ -8958,6 +8972,7 @@ static struct md_personality raid6_personality =
.level = 6,
.owner = THIS_MODULE,
.make_request = raid5_make_request,
+ .io_starve = raid5_io_starve,
.run = raid5_run,
.start = raid5_start,
.free = raid5_free,
@@ -8984,6 +8999,7 @@ static struct md_personality raid5_personality =
.level = 5,
.owner = THIS_MODULE,
.make_request = raid5_make_request,
+ .io_starve = raid5_io_starve,
.run = raid5_run,
.start = raid5_start,
.free = raid5_free,
@@ -9011,6 +9027,7 @@ static struct md_personality raid4_personality =
.level = 4,
.owner = THIS_MODULE,
.make_request = raid5_make_request,
+ .io_starve = raid5_io_starve,
.run = raid5_run,
.start = raid5_start,
.free = raid5_free,
@@ -537,6 +537,7 @@ enum r5_cache_state {
* released. This avoids flooding
* the cache.
*/
+ R5_IO_STARVE, /* normal io cannot get a stripe */
R5C_LOG_TIGHT, /* log device space tight, need to
* prioritize stripes at last_checkpoint
*/
@@ -813,6 +814,7 @@ struct stripe_request_ctx;
#define R5_GAS_NOBLOCK (1 << 1)
/* do not block waiting for quiesce to be released */
#define R5_GAS_NOQUIESCE (1 << 2)
+#define R5_GAS_IO (1 << 3) /* request comes from the normal io path */
struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
struct stripe_request_ctx *ctx, sector_t sector,
unsigned int flags);
This problem has been discussed several times [1][2], and it can affect
mkfs time as well. The steps to reproduce it are:

1. Create a raid5 with 10 devices (each device is 5.5TB).
2. Run mkfs.ext4 -O 'meta_bg,uninit_bg,flex_bg,metadata_csum,^resize_inode'.
3. Run the same command from step 2 again.

Step 3 prints the message '/dev/md0 contains a ext4 file system' and
then waits; the next message 'Proceed anyway? (y,N)' only appears after
a long time (about 12 minutes). With some debug logs added, there are
many small 4KB read ios during this window, and they are not
sequential. At the same time, the sync speed doesn't decrease.

is_mddev_idle() is what throttles the sync speed when there is upper
layer io. It calculates the difference between the iostat sectors
number and the disk sync number and uses that difference as a baseline.
It recalculates the difference on each call, and if
newdiff - olddiff > 64 it concludes that the upper layer is submitting
bios to the array and that it is time to slow down the sync speed.

This heuristic can't work for random ios on raid5. Those ios can't get
a stripe because all stripes are held by sync requests, so they are
never submitted to the member disks and the iostat sectors number never
increases. As a result, is_mddev_idle() can't detect them.

This patch resolves the issue by adding a personality method that
reports whether normal io is starved: md slows down the sync speed when
it can't handle ios from the upper layer.

The patch introduces a sync performance decline when there is competing
io. Patch [1] improves the sync performance, so the decline is not
serious, and the io performance increases at the same time. These are
some fio (libaio) test results:

without patch:
read      bs=4k direct=1 numjobs=1 runtime=120:            bw=1845KiB/s, sync speed: about 180MB/s
read      bs=4k direct=1 iodepth=32 numjobs=1 runtime=120: bw=9.78MiB/s, sync speed: about 170MB/s
write     bs=4k direct=1 iodepth=32 numjobs=1 runtime=120: bw=1203KiB/s, sync speed: about 170MB/s
randread  bs=4k direct=1 numjobs=1 runtime=120:            bw=17.5KiB/s, sync speed: about 185MB/s
randwrite bs=4k direct=1 numjobs=1 runtime=120:            bw=9312B/s,   sync speed: about 150MB/s

with patch:
read      bs=4k direct=1 numjobs=1 runtime=120:            bw=19.1MiB/s, sync speed: about 100MB/s
read      bs=4k direct=1 iodepth=32 numjobs=1 runtime=120: bw=68.3MiB/s, sync speed: about 100MB/s
write     bs=4k direct=1 iodepth=32 numjobs=1 runtime=120: bw=4994KiB/s, sync speed: about 100MB/s
randread  bs=4k direct=1 numjobs=1 runtime=120:            bw=63.5KiB/s, sync speed: about 150MB/s
randwrite bs=4k direct=1 numjobs=1 runtime=120:            bw=23.0KiB/s, sync speed: about 120MB/s

Signed-off-by: Xiao Ni <xni@redhat.com>
---
 drivers/md/md.c    | 10 ++++++++++
 drivers/md/md.h    |  1 +
 drivers/md/raid5.c | 17 +++++++++++++++++
 drivers/md/raid5.h |  2 ++
 4 files changed, 30 insertions(+)
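
For context, the throttling heuristic described above can be modeled in
a few lines of userspace C. This is an illustrative sketch, not the
kernel code; iostat_sectors, sync_io and last_events are stand-ins for
the counters that is_mddev_idle() actually reads. It shows why ios that
never reach the member disks leave the array looking idle:

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model of the is_mddev_idle() heuristic (illustrative only). */
	struct disk_model {
		long iostat_sectors; /* sectors seen by iostat: sync + normal io */
		long sync_io;        /* sectors submitted by the sync thread */
		long last_events;    /* baseline diff from the previous check */
	};

	static bool is_idle(struct disk_model *d)
	{
		/*
		 * Normal io raises iostat_sectors without raising sync_io, so
		 * the diff grows. Sync io raises both, so the diff stays flat.
		 */
		long curr_events = d->iostat_sectors - d->sync_io;
		bool idle = (curr_events - d->last_events) <= 64;

		d->last_events = curr_events;
		return idle;
	}

	int main(void)
	{
		struct disk_model d = { 0, 0, 0 };

		/* Pure sync io: both counters advance, array looks idle. */
		d.iostat_sectors += 1024; d.sync_io += 1024;
		printf("sync only:   idle=%d\n", is_idle(&d));

		/* Normal io that reaches the disk: diff grows past 64. */
		d.iostat_sectors += 1024 + 128; d.sync_io += 1024;
		printf("normal io:   idle=%d\n", is_idle(&d));

		/*
		 * Starved normal io on raid5: it never gets a stripe, never
		 * reaches the member disks, and iostat_sectors doesn't move.
		 * The array still looks idle and sync is never slowed down.
		 */
		d.iostat_sectors += 1024; d.sync_io += 1024;
		printf("starved io:  idle=%d\n", is_idle(&d));
		return 0;
	}

The new ->io_starve() method covers exactly the third case: raid5
records that a normal io failed to get a stripe, so md_do_sync() sleeps
even though the counters look idle.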
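
The handshake between the io path and the sync thread boils down to a
single test-and-clear flag. A minimal userspace model of that pattern,
with C11 atomics standing in for set_bit()/test_and_clear_bit() and
illustrative function names, looks like this:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model of the starvation handshake (illustrative only). */
	static atomic_bool io_starve; /* plays the role of R5_IO_STARVE */

	/* io path: called when a normal io cannot get a stripe */
	static void note_starvation(void)
	{
		atomic_store(&io_starve, true); /* like set_bit() */
	}

	/* sync path: like raid5_io_starve(), reading clears the flag */
	static bool check_starvation(void)
	{
		return atomic_exchange(&io_starve, false);
	}

	int main(void)
	{
		printf("idle check:   starved=%d\n", check_starvation());
		note_starvation(); /* a 4KB read failed to get a stripe */
		printf("after starve: starved=%d\n", check_starvation());
		printf("flag cleared: starved=%d\n", check_starvation());
		return 0;
	}

Because the flag is cleared on each read, one starvation event causes
at most one 500ms back-off; continued starvation keeps re-arming the
flag, so the sync thread keeps yielding only as long as normal io is
actually blocked.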