@@ -14,10 +14,10 @@
#include "blk.h"
/* Max dispatch from a group in 1 round */
-static int throtl_grp_quantum = 8;
+static int throtl_grp_quantum = 32;
/* Total max dispatch from all groups in one round */
-static int throtl_quantum = 32;
+static int throtl_quantum = 128;
/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
@@ -43,6 +43,12 @@
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
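+/*
+ * Workload classes used to queue and dispatch bios separately: reads keep
+ * their own list, and writes are further split by REQ_SYNC (presumably so
+ * that sync writes do not sit behind large async writeback bursts).
+ */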
+enum wl_type {
+ READ_WORKLOAD = 0,
+ SYNC_WRITE_WORKLOAD = 1,
+ ASYNC_WRITE_WORKLOAD = 2
+};
+
/*
* To implement hierarchical throttling, throtl_grps form a tree and bios
* are dispatched upwards level by level until they reach the top and get
@@ -79,8 +85,11 @@ struct throtl_service_queue {
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
- struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
- unsigned int nr_queued[2]; /* number of queued bios */
+ /* throtl_qnode [READ/SYNC_WRITE/ASYNC_WRITE] */
+ struct list_head queued[3];
+
+ unsigned int nr_queued[3]; /* number of queued bios */
+
/*
* RB tree of active children throtl_grp's, which are sorted by
@@ -127,8 +136,8 @@ struct throtl_grp {
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
- struct throtl_qnode qnode_on_self[2];
- struct throtl_qnode qnode_on_parent[2];
+ struct throtl_qnode qnode_on_self[3];
+ struct throtl_qnode qnode_on_parent[3];
/*
* Dispatch time in jiffies. This is the estimated time when group
@@ -202,7 +211,7 @@ struct throtl_data
struct request_queue *queue;
- /* Total Number of queued bios on READ and WRITE lists */
+ /* Total number of queued bios on the READ/SYNC_WRITE/ASYNC_WRITE lists */
- unsigned int nr_queued[2];
+ unsigned int nr_queued[3];
unsigned int throtl_slice;
@@ -274,6 +283,18 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
return container_of(sq, struct throtl_data, service_queue);
}
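+/*
+ * Classify a bio into one of the wl_type classes: reads, REQ_SYNC writes
+ * and the remaining (async) writes.
+ */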
+static inline enum wl_type bio_workload_type(struct bio *bio)
+{
+ return bio_data_dir(bio) ?
+ ((bio->bi_opf & REQ_SYNC) ? SYNC_WRITE_WORKLOAD :
+ ASYNC_WRITE_WORKLOAD) : READ_WORKLOAD;
+}
+
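+/*
+ * Map a workload class back to the READ/WRITE direction used by the
+ * bps/iops limits and the slice bookkeeping; both write classes map
+ * to WRITE.
+ */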
+static inline bool wl_to_rw(enum wl_type type)
+{
+ return type >= SYNC_WRITE_WORKLOAD;
+}
+
/*
* cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
* make the IO dispatch more smooth.
@@ -475,8 +496,9 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
- INIT_LIST_HEAD(&sq->queued[0]);
- INIT_LIST_HEAD(&sq->queued[1]);
+ INIT_LIST_HEAD(&sq->queued[READ_WORKLOAD]);
+ INIT_LIST_HEAD(&sq->queued[SYNC_WRITE_WORKLOAD]);
+ INIT_LIST_HEAD(&sq->queued[ASYNC_WRITE_WORKLOAD]);
sq->pending_tree = RB_ROOT;
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
@@ -484,7 +506,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
{
struct throtl_grp *tg;
- int rw;
+ enum wl_type type;
tg = kzalloc_node(sizeof(*tg), gfp, node);
if (!tg)
@@ -492,9 +514,9 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
throtl_service_queue_init(&tg->service_queue);
- for (rw = READ; rw <= WRITE; rw++) {
- throtl_qnode_init(&tg->qnode_on_self[rw], tg);
- throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+ for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++) {
+ throtl_qnode_init(&tg->qnode_on_self[type], tg);
+ throtl_qnode_init(&tg->qnode_on_parent[type], tg);
}
RB_CLEAR_NODE(&tg->rb_node);
@@ -985,6 +1007,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
unsigned long *wait)
{
bool rw = bio_data_dir(bio);
+ enum wl_type type = bio_workload_type(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
/*
@@ -993,8 +1016,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* this function with a different bio if there are other bios
* queued.
*/
- BUG_ON(tg->service_queue.nr_queued[rw] &&
- bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
+ BUG_ON(tg->service_queue.nr_queued[type] &&
+ bio != throtl_peek_queued(&tg->service_queue.queued[type]));
/* If tg->bps = -1, then BW is unlimited */
if (tg_bps_limit(tg, rw) == U64_MAX &&
@@ -1011,7 +1034,12 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* If there is queued bio, that means there should be an active
* slice and it should be extended instead.
*/
- if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
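+ /*
+ * Both write classes share the WRITE slice, so only start a new one
+ * when neither the sync nor the async list has bios queued.
+ */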
+ if (throtl_slice_used(tg, rw) &&
+ ((rw == READ &&
+ !tg->service_queue.nr_queued[READ_WORKLOAD]) ||
+ (rw == WRITE &&
+ !tg->service_queue.nr_queued[SYNC_WRITE_WORKLOAD] &&
+ !tg->service_queue.nr_queued[ASYNC_WRITE_WORKLOAD])))
throtl_start_new_slice(tg, rw);
else {
if (time_before(tg->slice_end[rw],
@@ -1072,10 +1100,10 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
- bool rw = bio_data_dir(bio);
+ enum wl_type type = bio_workload_type(bio);
if (!qn)
- qn = &tg->qnode_on_self[rw];
+ qn = &tg->qnode_on_self[type];
/*
* If @tg doesn't currently have any bios queued in the same
@@ -1083,12 +1111,12 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* dispatched. Mark that @tg was empty. This is automatically
* cleaered on the next tg_update_disptime().
*/
- if (!sq->nr_queued[rw])
+ if (!sq->nr_queued[type])
tg->flags |= THROTL_TG_WAS_EMPTY;
- throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+ throtl_qnode_add_bio(bio, qn, &sq->queued[type]);
- sq->nr_queued[rw]++;
+ sq->nr_queued[type]++;
throtl_enqueue_tg(tg);
}
@@ -1096,16 +1124,22 @@ static void tg_update_disptime(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+ unsigned long sync_write_wait = -1, async_write_wait = -1;
struct bio *bio;
- bio = throtl_peek_queued(&sq->queued[READ]);
+ bio = throtl_peek_queued(&sq->queued[READ_WORKLOAD]);
if (bio)
tg_may_dispatch(tg, bio, &read_wait);
- bio = throtl_peek_queued(&sq->queued[WRITE]);
+ bio = throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD]);
+ if (bio)
+ tg_may_dispatch(tg, bio, &sync_write_wait);
+
+ bio = throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD]);
if (bio)
- tg_may_dispatch(tg, bio, &write_wait);
+ tg_may_dispatch(tg, bio, &async_write_wait);
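+ /* Either write class may dispatch next; use the shorter wait. */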
+ write_wait = min(sync_write_wait, async_write_wait);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
@@ -1125,16 +1159,16 @@ static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
throtl_start_new_slice_with_credit(parent_tg, rw,
child_tg->slice_start[rw]);
}
-
}
-static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
+static void tg_dispatch_one_bio(struct throtl_grp *tg, enum wl_type type)
{
struct throtl_service_queue *sq = &tg->service_queue;
struct throtl_service_queue *parent_sq = sq->parent_sq;
struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
struct throtl_grp *tg_to_put = NULL;
struct bio *bio;
+ bool rw = wl_to_rw(type);
/*
* @bio is being transferred from @tg to @parent_sq. Popping a bio
@@ -1142,8 +1176,8 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
* getting released prematurely. Remember the tg to put and put it
* after @bio is transferred to @parent_sq.
*/
- bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
- sq->nr_queued[rw]--;
+ bio = throtl_pop_queued(&sq->queued[type], &tg_to_put);
+ sq->nr_queued[type]--;
throtl_charge_bio(tg, bio);
@@ -1155,13 +1189,13 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
* responsible for issuing these bios.
*/
if (parent_tg) {
- throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
+ throtl_add_bio_tg(bio, &tg->qnode_on_parent[type], parent_tg);
start_parent_slice_with_credit(tg, parent_tg, rw);
} else {
- throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
- &parent_sq->queued[rw]);
- BUG_ON(tg->td->nr_queued[rw] <= 0);
- tg->td->nr_queued[rw]--;
+ throtl_qnode_add_bio(bio, &tg->qnode_on_parent[type],
+ &parent_sq->queued[type]);
+ BUG_ON(tg->td->nr_queued[type] <= 0);
+ tg->td->nr_queued[type]--;
}
throtl_trim_slice(tg, rw);
@@ -1173,34 +1207,45 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
static int throtl_dispatch_tg(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
- unsigned int nr_reads = 0, nr_writes = 0;
+ unsigned int nr_reads = 0, nr_async_writes = 0, nr_sync_writes = 0;
unsigned int max_nr_reads = throtl_grp_quantum*3/4;
- unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+ unsigned int max_sync_writes = (throtl_grp_quantum - max_nr_reads)*7/8;
+ unsigned int max_async_writes = throtl_grp_quantum - max_nr_reads
+ - max_sync_writes;
struct bio *bio;
- /* Try to dispatch 75% READS and 25% WRITES */
-
+ /*
+ * Try to dispatch 75% READS and 25% WRITES; the WRITE share is split
+ * roughly 7:1 between sync and async writes.
+ */
- while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
+ while ((bio = throtl_peek_queued(&sq->queued[READ_WORKLOAD])) &&
tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ tg_dispatch_one_bio(tg, READ_WORKLOAD);
nr_reads++;
if (nr_reads >= max_nr_reads)
break;
}
- while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
+ while ((bio = throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD])) &&
tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- nr_writes++;
+ tg_dispatch_one_bio(tg, SYNC_WRITE_WORKLOAD);
+ nr_sync_writes++;
- if (nr_writes >= max_nr_writes)
+ if (nr_sync_writes >= max_sync_writes)
break;
}
- return nr_reads + nr_writes;
+ while ((bio = throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD])) &&
+ tg_may_dispatch(tg, bio, NULL)) {
+
+ tg_dispatch_one_bio(tg, ASYNC_WRITE_WORKLOAD);
+ nr_async_writes++;
+
+ if (nr_async_writes >= max_async_writes)
+ break;
+ }
+
+ return nr_reads + nr_sync_writes + nr_async_writes;
}
static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
@@ -1221,7 +1266,9 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
nr_disp += throtl_dispatch_tg(tg);
- if (sq->nr_queued[0] || sq->nr_queued[1])
+ if (sq->nr_queued[READ_WORKLOAD] ||
+ sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+ sq->nr_queued[ASYNC_WRITE_WORKLOAD])
tg_update_disptime(tg);
if (nr_disp >= throtl_quantum)
@@ -1267,9 +1314,14 @@ static void throtl_pending_timer_fn(struct timer_list *t)
dispatched = false;
while (true) {
- throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
- sq->nr_queued[READ] + sq->nr_queued[WRITE],
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ throtl_log(sq,
+ "dispatch nr_queued=%u read=%u sync_write=%u async_write=%u",
+ sq->nr_queued[READ_WORKLOAD] +
+ sq->nr_queued[SYNC_WRITE_WORKLOAD] +
+ sq->nr_queued[ASYNC_WRITE_WORKLOAD],
+ sq->nr_queued[READ_WORKLOAD],
+ sq->nr_queued[SYNC_WRITE_WORKLOAD],
+ sq->nr_queued[ASYNC_WRITE_WORKLOAD]);
ret = throtl_select_dispatch(sq);
if (ret) {
@@ -1325,13 +1377,13 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
struct bio_list bio_list_on_stack;
struct bio *bio;
struct blk_plug plug;
- int rw;
+ enum wl_type type;
bio_list_init(&bio_list_on_stack);
spin_lock_irq(q->queue_lock);
- for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+ for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++)
+ while ((bio = throtl_pop_queued(&td_sq->queued[type], NULL)))
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(q->queue_lock);
@@ -1820,11 +1872,13 @@ static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
if (!read_limit && !write_limit)
return true;
- if (read_limit && sq->nr_queued[READ] &&
- (!write_limit || sq->nr_queued[WRITE]))
+ if (read_limit && sq->nr_queued[READ_WORKLOAD] &&
+ (!write_limit || sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+ sq->nr_queued[ASYNC_WRITE_WORKLOAD]))
return true;
- if (write_limit && sq->nr_queued[WRITE] &&
- (!read_limit || sq->nr_queued[READ]))
+ if (write_limit && (sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+ sq->nr_queued[ASYNC_WRITE_WORKLOAD]) &&
+ (!read_limit || sq->nr_queued[READ_WORKLOAD]))
return true;
if (time_after_eq(jiffies,
@@ -2129,6 +2183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
+ enum wl_type type = bio_workload_type(bio);
bool throttled = false;
struct throtl_data *td = tg->td;
@@ -2157,7 +2212,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
throtl_downgrade_check(tg);
throtl_upgrade_check(tg);
- /* throtl is FIFO - if bios are already queued, should queue */
+ /* throtl is FIFO per workload type - if same-type bios are queued, queue */
- if (sq->nr_queued[rw])
+ if (sq->nr_queued[type])
break;
/* if above limits, break to queue */
@@ -2191,7 +2246,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
* Climb up the ladder. If we're already at the top, it
* can be executed directly.
*/
- qn = &tg->qnode_on_parent[rw];
+ qn = &tg->qnode_on_parent[type];
sq = sq->parent_sq;
tg = sq_to_tg(sq);
if (!tg)
@@ -2199,16 +2254,19 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
}
/* out-of-limit, queue to @tg */
- throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
+ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d/%d",
rw == READ ? 'R' : 'W',
tg->bytes_disp[rw], bio->bi_iter.bi_size,
tg_bps_limit(tg, rw),
tg->io_disp[rw], tg_iops_limit(tg, rw),
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ sq->nr_queued[READ_WORKLOAD],
+ sq->nr_queued[SYNC_WRITE_WORKLOAD],
+ sq->nr_queued[ASYNC_WRITE_WORKLOAD]);
tg->last_low_overflow_time[rw] = jiffies;
- td->nr_queued[rw]++;
+ td->nr_queued[type]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
@@ -2334,10 +2392,15 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
throtl_dequeue_tg(tg);
- while ((bio = throtl_peek_queued(&sq->queued[READ])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ while ((bio = throtl_peek_queued(&sq->queued[READ_WORKLOAD])))
+ tg_dispatch_one_bio(tg, READ_WORKLOAD);
+ while ((bio = throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD])))
+ tg_dispatch_one_bio(tg, SYNC_WRITE_WORKLOAD);
+ while ((bio = throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD])))
+ tg_dispatch_one_bio(tg, ASYNC_WRITE_WORKLOAD);
}
}
@@ -2354,7 +2417,7 @@ void blk_throtl_drain(struct request_queue *q)
struct blkcg_gq *blkg;
struct cgroup_subsys_state *pos_css;
struct bio *bio;
- int rw;
+ enum wl_type type;
queue_lockdep_assert_held(q);
rcu_read_lock();
@@ -2375,8 +2438,8 @@ void blk_throtl_drain(struct request_queue *q)
spin_unlock_irq(q->queue_lock);
/* all bios now should be in td->service_queue, issue them */
- for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+ for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++)
+ while ((bio = throtl_pop_queued(&td->service_queue.queued[type],
NULL)))
generic_make_request(bio);