@@ -538,6 +538,9 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
if (!hctx->sched_tags)
return -ENOMEM;
+ if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
+ return 0;
+
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
if (ret)
blk_mq_sched_free_tags(set, hctx, hctx_idx);
@@ -563,8 +566,30 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
{
struct blk_mq_tag_set *set = queue->tag_set;
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
+ gfp_t flags = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
struct blk_mq_hw_ctx *hctx;
- int ret, i;
+ int ret, i, j;
+
+ /*
+ * In case we need to grow, allocate max we will ever need. This will
+ * waste memory when the request queue depth is less than the max,
+ * i.e. almost always. But helps keep our sanity, rather than dealing
+ * with error handling in blk_mq_update_nr_requests().
+ */
+ queue->static_rqs = kcalloc_node(MAX_SCHED_RQ, sizeof(struct request *),
+ flags, queue->node);
+ if (!queue->static_rqs)
+ return -ENOMEM;
+
+ ret = __blk_mq_alloc_rqs(set, 0, MAX_SCHED_RQ, &queue->page_list,
+ queue->static_rqs);
+ if (ret)
+ goto err_rqs;
+
+ queue_for_each_hw_ctx(queue, hctx, i) {
+ for (j = 0; j < queue->nr_requests; j++)
+ hctx->sched_tags->static_rqs[j] = queue->static_rqs[j];
+ }
/*
* Set initial depth at max so that we don't need to reallocate for
@@ -575,7 +600,7 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
MAX_SCHED_RQ, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
- return ret;
+ goto err_bitmaps;
queue_for_each_hw_ctx(queue, hctx, i) {
hctx->sched_tags->bitmap_tags =
@@ -587,10 +612,24 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
blk_mq_tag_resize_sched_shared_sbitmap(queue);
return 0;
+
+err_bitmaps:
+ __blk_mq_free_rqs(set, 0, MAX_SCHED_RQ, &queue->page_list,
+ queue->static_rqs);
+err_rqs:
+ kfree(queue->static_rqs);
+ queue->static_rqs = NULL;
+ return ret;
}
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
{
+ __blk_mq_free_rqs(queue->tag_set, 0, MAX_SCHED_RQ, &queue->page_list,
+ queue->static_rqs);
+
+ kfree(queue->static_rqs);
+ queue->static_rqs = NULL;
+
sbitmap_queue_free(&queue->sched_bitmap_tags);
sbitmap_queue_free(&queue->sched_breserved_tags);
}
@@ -670,8 +709,12 @@ void blk_mq_sched_free_requests(struct request_queue *q)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags)
- blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+ if (hctx->sched_tags) {
+ if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
+ } else {
+ blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+ }
+ }
}
}
@@ -592,7 +592,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new;
- bool ret;
if (!can_grow)
return -EINVAL;
@@ -608,13 +607,14 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
tags->nr_reserved_tags, set->flags);
if (!new)
return -ENOMEM;
- ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
- if (ret) {
- blk_mq_free_rq_map(new, set->flags);
- return -ENOMEM;
+ if (!blk_mq_is_sbitmap_shared(hctx->flags)) {
+ if (blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth)) {
+ blk_mq_free_rq_map(new, set->flags);
+ return -ENOMEM;
+ }
+ blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
}
- blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
blk_mq_free_rq_map(*tagsptr, set->flags);
*tagsptr = new;
} else {
@@ -3694,10 +3694,17 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true);
if (blk_mq_is_sbitmap_shared(set->flags)) {
+ int j;
+
hctx->sched_tags->bitmap_tags =
&q->sched_bitmap_tags;
hctx->sched_tags->breserved_tags =
&q->sched_breserved_tags;
+
+ for (j = 0;j < hctx->sched_tags->nr_tags; j++) {
+ hctx->sched_tags->static_rqs[j] =
+ q->static_rqs[j];
+ }
}
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
@@ -3708,7 +3715,18 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(hctx);
}
- if (!ret) {
+ if (ret) {
+ if (blk_mq_is_sbitmap_shared(set->flags) && (q->elevator)) {
+ /*
+ * If we error'ed, then we need to revert to the
+ * lowest size, otherwise we may attempt to reference
+ * unset hctx->sched_tags->static_rqs[]
+ */
+ q->nr_requests = min((unsigned long)nr,
+ q->nr_requests);
+ blk_mq_tag_resize_sched_shared_sbitmap(q);
+ }
+ } else {
q->nr_requests = nr;
if (blk_mq_is_sbitmap_shared(set->flags)) {
if (q->elevator) {
@@ -470,6 +470,10 @@ struct request_queue {
struct sbitmap_queue sched_bitmap_tags;
struct sbitmap_queue sched_breserved_tags;
+ /* For shared sbitmap */
+ struct request **static_rqs;
+ struct list_head page_list;
+
struct list_head icq_list;
#ifdef CONFIG_BLK_CGROUP
DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
Just as it is inefficient in memory terms to allocate a full set of static rqs per hw queue per tag set when using a shared sbitmap, it is also inefficient to allocate a full set of static rqs per hw queue per request queue. Reduce memory usage by allocating a set of static rqs per request queue for when using a shared sbitmap, and make the per-hctx sched_tags->static_rqs[] point at them. Error handling for updating the number of requests in blk_mq_update_nr_requests() -> blk_mq_tag_update_depth() can get quite complicated, so allocate the full max depth of rqs at init time to try to simplify things. This will be somewhat inefficient for when the request queue depth is not close to max, but generally still more efficient than the current situation. For failures in blk_mq_update_nr_requests() -> blk_mq_tag_update_depth() when shrinking request queue depth, q->nr_requests still needs to be updated. This is because some of the hctx->sched_tags may have been successfully updated, and they are now smaller than q->nr_requests, which will lead to problems since a scheduler tag could be greater than hctx->sched_tags size. For failures in blk_mq_update_nr_requests() -> blk_mq_tag_update_depth() when growing a request queue depth, q->nr_requests does not need to be updated. Signed-off-by: John Garry <john.garry@huawei.com> --- block/blk-mq-sched.c | 51 ++++++++++++++++++++++++++++++++++++++---- block/blk-mq-tag.c | 12 +++++----- block/blk-mq.c | 20 ++++++++++++++++- include/linux/blkdev.h | 4 ++++ 4 files changed, 76 insertions(+), 11 deletions(-)