From c49ec4e8b0e4135a87c9894597901539f3e3ca08 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 21 Dec 2016 12:39:33 +0100
Subject: [PATCH 3/3] blk-mq: Split driver and scheduler tags
Add 'sched_tags' next to 'tags' in struct blk_mq_hw_ctx and in struct
blk_mq_tag_set, and add 'sched_tag' next to 'tag' in struct request.
Make blk_mq_alloc_tag_set() allocate both tag sets and make
blk_mq_free_tag_set() free both tag sets. Make __blk_mq_finish_request()
release both tags. Make blk_mq_dispatch_rq_list() allocate the driver
tag at dispatch time via the new blk_mq_assign_drv_tag() helper. Modify
blk_mq_update_nr_requests() such that it accepts values larger than the
driver queue depth.
---
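Notes (for review only, not part of the commit message or the diff):

Below is a minimal user-space sketch of the split-tag model this patch
introduces: a request takes a tag from the larger per-hctx scheduler
pool at allocation time and only acquires a tag from the smaller driver
pool when it is dispatched. The pool sizes, the struct request stand-in
and the linear-scan allocator are illustrative assumptions; the kernel
uses the sbitmap-based allocator in blk-mq-tag.c.

#include <stdio.h>

#define NR_SCHED_TAGS 8	/* stand-in for q->nr_requests (scheduler depth) */
#define NR_DRV_TAGS   2	/* stand-in for set->queue_depth (hardware depth) */

static int sched_tags[NR_SCHED_TAGS];	/* 0 = free, 1 = in use */
static int drv_tags[NR_DRV_TAGS];

/* Linear-scan stand-in for blk_mq_get_tag(): returns a tag or -1. */
static int get_tag(int *pool, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (!pool[i]) {
			pool[i] = 1;
			return i;
		}
	}
	return -1;
}

struct request {
	int sched_tag;	/* scheduler tag, assigned at allocation */
	int tag;	/* driver tag, assigned at dispatch */
};

/* Allocation path: only a scheduler tag is taken (__blk_mq_alloc_request()). */
static int alloc_request(struct request *rq)
{
	rq->tag = -1;
	rq->sched_tag = get_tag(sched_tags, NR_SCHED_TAGS);
	return rq->sched_tag < 0 ? -1 : 0;
}

/* Dispatch path: the driver tag is assigned late (blk_mq_assign_drv_tag()). */
static int assign_drv_tag(struct request *rq)
{
	rq->tag = get_tag(drv_tags, NR_DRV_TAGS);
	return rq->tag;
}

int main(void)
{
	struct request rqs[4];
	int i;

	for (i = 0; i < 4; i++) {
		if (alloc_request(&rqs[i]) < 0)
			break;
		if (assign_drv_tag(&rqs[i]) < 0) {
			/*
			 * No driver tag: the request keeps its scheduler tag
			 * and stays queued, mirroring the "break" in
			 * blk_mq_dispatch_rq_list().
			 */
			printf("rq %d: sched_tag %d, no driver tag yet\n",
			       i, rqs[i].sched_tag);
			continue;
		}
		printf("rq %d: sched_tag %d, driver tag %d\n",
		       i, rqs[i].sched_tag, rqs[i].tag);
	}
	return 0;
}

The point of the late assignment is that the scheduler can hold many more
requests than the hardware queue can accept at once; only dispatch competes
for the scarce driver tags.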
block/blk-flush.c | 9 ++-
block/blk-mq.c | 160 +++++++++++++++++++++++++++++++++++--------------
block/blk-mq.h | 5 +-
block/blk-tag.c | 1 +
include/linux/blk-mq.h | 2 +
include/linux/blkdev.h | 1 +
6 files changed, 129 insertions(+), 49 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -170,6 +170,8 @@ static bool blk_flush_complete_seq(struct request *rq,
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
bool queued = false, kicked;
+ BUG_ON(rq->tag < 0);
+
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@@ -319,6 +321,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
+ BUG_ON(first_rq->tag < 0);
+
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->tag = first_rq->tag;
fq->orig_rq = first_rq;
@@ -452,8 +456,9 @@ void blk_insert_flush(struct request *rq)
* processed directly without going through flush machinery. Queue
* for normal execution.
*/
- if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+ if (((policy & REQ_FSEQ_DATA) &&
+ !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) ||
+ (q->mq_ops && blk_mq_assign_drv_tag(rq) < 0)) {
if (q->mq_ops)
blk_mq_sched_insert_request(rq, false, true, false);
else
diff --git a/block/blk-mq.c b/block/blk-mq.c
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -220,20 +220,21 @@ EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
unsigned int op)
{
- struct blk_mq_tags *tags = data->hctx->tags;
+ struct blk_mq_tags *tags = data->hctx->sched_tags;
struct request *rq;
- unsigned int tag;
+ unsigned int sched_tag;
- tag = blk_mq_get_tag(data, tags);
- if (tag != BLK_MQ_TAG_FAIL) {
- rq = tags->rqs[tag];
+ sched_tag = blk_mq_get_tag(data, tags);
+ if (sched_tag != BLK_MQ_TAG_FAIL) {
+ rq = tags->rqs[sched_tag];
+ rq->tag = -1;
if (blk_mq_tag_busy(data->hctx)) {
rq->rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
- rq->tag = tag;
+ rq->sched_tag = sched_tag;
blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
return rq;
}
@@ -328,6 +329,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct request *rq)
{
const int tag = rq->tag;
+ const int sched_tag = rq->sched_tag;
struct request_queue *q = rq->q;
ctx->rq_completed[rq_is_sync(rq)]++;
@@ -340,7 +342,13 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
- blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+ if (tag >= 0) {
+ WARN_ON_ONCE(hctx->tags->rqs[tag] != rq);
+ hctx->tags->rqs[tag] = NULL;
+ blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+ }
+ if (sched_tag >= 0)
+ blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
blk_queue_exit(q);
}
@@ -844,6 +852,26 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
+int blk_mq_assign_drv_tag(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+ struct blk_mq_alloc_data data = {
+ .q = rq->q,
+ .ctx = rq->mq_ctx,
+ .hctx = hctx,
+ };
+
+ rq->tag = blk_mq_get_tag(&data, hctx->tags);
+ if (rq->tag < 0)
+ goto out;
+ WARN_ON_ONCE(hctx->tags->rqs[rq->tag]);
+ hctx->tags->rqs[rq->tag] = rq;
+
+out:
+ return rq->tag;
+}
+
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct request_queue *q = hctx->queue;
@@ -866,6 +894,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
struct blk_mq_queue_data bd;
rq = list_first_entry(list, struct request, queuelist);
+ if (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0)
+ break;
list_del_init(&rq->queuelist);
bd.rq = rq;
@@ -1296,7 +1326,8 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
goto insert;
hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
- if (blk_mq_hctx_stopped(hctx))
+ if (blk_mq_hctx_stopped(hctx) ||
+ (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0))
goto insert;
new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
@@ -1592,17 +1623,19 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
}
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx)
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags)
{
struct blk_mq_tags *tags;
- tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+ tags = blk_mq_init_tags(nr_tags, reserved_tags,
set->numa_node,
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
- tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+ tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
set->numa_node);
if (!tags->rqs) {
@@ -1800,6 +1833,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
+ hctx->sched_tags = set->sched_tags[hctx_idx];
/*
* Allocate space for all possible cpus to avoid allocation at
@@ -1881,6 +1915,38 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
+static void __blk_mq_free_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx)
+{
+ if (set->sched_tags[hctx_idx]) {
+ blk_mq_free_rqs(set, set->sched_tags[hctx_idx], hctx_idx);
+ blk_mq_free_rq_map(set->sched_tags[hctx_idx]);
+ set->sched_tags[hctx_idx] = NULL;
+ }
+ if (set->tags[hctx_idx]) {
+ blk_mq_free_rq_map(set->tags[hctx_idx]);
+ set->tags[hctx_idx] = NULL;
+ }
+}
+
+static bool __blk_mq_alloc_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx,
+ unsigned int nr_requests)
+{
+ int ret = 0;
+
+ set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+ set->queue_depth, set->reserved_tags);
+ set->sched_tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+ nr_requests, 0);
+ if (set->sched_tags[hctx_idx])
+ ret = blk_mq_alloc_rqs(set, set->sched_tags[hctx_idx],
+ hctx_idx);
+ if (!set->tags[hctx_idx] || !set->sched_tags[hctx_idx] || ret < 0) {
+ __blk_mq_free_rq_map_i(set, hctx_idx);
+ return false;
+ }
+ return true;
+}
+
static void blk_mq_map_swqueue(struct request_queue *q,
const struct cpumask *online_mask)
{
@@ -1909,23 +1975,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx_idx = q->mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
- if (!set->tags[hctx_idx]) {
- set->tags[hctx_idx] = blk_mq_alloc_rq_map(set,
- hctx_idx);
- if (blk_mq_alloc_rqs(set, set->tags[hctx_idx],
- hctx_idx) < 0) {
- blk_mq_free_rq_map(set->tags[hctx_idx]);
- set->tags[hctx_idx] = NULL;
- }
-
+ if (!set->tags[hctx_idx] &&
+ !__blk_mq_alloc_rq_map_i(set, hctx_idx, q->nr_requests)) {
/*
* If tags initialization fail for some hctx,
* that hctx won't be brought online. In this
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
- if (!set->tags[hctx_idx])
- q->mq_map[i] = 0;
+ q->mq_map[i] = 0;
}
ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2318,26 +2376,20 @@ static int blk_mq_queue_reinit_prepare(unsigned int cpu)
return 0;
}
-static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+ unsigned int nr_requests)
{
int i;
- for (i = 0; i < set->nr_hw_queues; i++) {
- set->tags[i] = blk_mq_alloc_rq_map(set, i);
- if (!set->tags[i])
+ for (i = 0; i < set->nr_hw_queues; i++)
+ if (!__blk_mq_alloc_rq_map_i(set, i, nr_requests))
goto out_unwind;
- if (blk_mq_alloc_rqs(set, set->tags[i], i) < 0)
- goto free_rq_map;
- }
return 0;
out_unwind:
- while (--i >= 0) {
- blk_mq_free_rqs(set, set->tags[i], i);
-free_rq_map:
- blk_mq_free_rq_map(set->tags[i]);
- }
+ while (--i >= 0)
+ __blk_mq_free_rq_map_i(set, i);
return -ENOMEM;
}
@@ -2347,14 +2399,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+ unsigned int nr_requests)
{
unsigned int depth;
int err;
depth = set->queue_depth;
do {
- err = __blk_mq_alloc_rq_maps(set);
+ err = __blk_mq_alloc_rq_maps(set, nr_requests);
if (!err)
break;
@@ -2385,7 +2438,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
- int ret;
+ int ret = -ENOMEM;
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
@@ -2425,32 +2478,39 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->tags)
return -ENOMEM;
- ret = -ENOMEM;
+ set->sched_tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
+ GFP_KERNEL, set->numa_node);
+ if (!set->sched_tags)
+ goto free_drv_tags;
+
set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
GFP_KERNEL, set->numa_node);
if (!set->mq_map)
- goto out_free_tags;
+ goto free_sched_tags;
if (set->ops->map_queues)
ret = set->ops->map_queues(set);
else
ret = blk_mq_map_queues(set);
if (ret)
- goto out_free_mq_map;
+ goto free_mq_map;
- ret = blk_mq_alloc_rq_maps(set);
+ ret = blk_mq_alloc_rq_maps(set, set->queue_depth/*q->nr_requests*/);
if (ret)
- goto out_free_mq_map;
+ goto free_mq_map;
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
-out_free_mq_map:
+free_mq_map:
kfree(set->mq_map);
set->mq_map = NULL;
-out_free_tags:
+free_sched_tags:
+ kfree(set->sched_tags);
+ set->sched_tags = NULL;
+free_drv_tags:
kfree(set->tags);
set->tags = NULL;
return ret;
@@ -2465,12 +2525,16 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
if (set->tags[i]) {
blk_mq_free_rqs(set, set->tags[i], i);
blk_mq_free_rq_map(set->tags[i]);
+ blk_mq_free_rq_map(set->sched_tags[i]);
}
}
kfree(set->mq_map);
set->mq_map = NULL;
+ kfree(set->sched_tags);
+ set->sched_tags = NULL;
+
kfree(set->tags);
set->tags = NULL;
}
@@ -2482,14 +2546,18 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
struct blk_mq_hw_ctx *hctx;
int i, ret;
- if (!set || nr > set->queue_depth)
+ if (!set)
return -EINVAL;
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->tags)
continue;
- ret = blk_mq_tag_update_depth(hctx->tags, nr);
+ ret = blk_mq_tag_update_depth(hctx->tags,
+ min(nr, set->queue_depth));
+ if (ret)
+ break;
+ ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
if (ret)
break;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
+int blk_mq_assign_drv_tag(struct request *rq);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
@@ -41,7 +42,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx);
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
diff --git a/block/blk-tag.c b/block/blk-tag.c
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
list_del_init(&rq->queuelist);
rq->rq_flags &= ~RQF_QUEUED;
rq->tag = -1;
+ rq->sched_tag = -1;
if (unlikely(bqt->tag_index[tag] == NULL))
printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,6 +36,7 @@ struct blk_mq_hw_ctx {
atomic_t wait_index;
struct blk_mq_tags *tags;
+ struct blk_mq_tags *sched_tags;
struct srcu_struct queue_rq_srcu;
@@ -72,6 +73,7 @@ struct blk_mq_tag_set {
void *driver_data;
struct blk_mq_tags **tags;
+ struct blk_mq_tags **sched_tags;
struct mutex tag_list_lock;
struct list_head tag_list;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -223,6 +223,7 @@ struct request {
void *special; /* opaque pointer available for LLD use */
int tag;
+ int sched_tag;
int errors;
/*
--
2.11.0