@@ -69,6 +69,14 @@ config MQ_IOSCHED_DEADLINE
---help---
MQ version of the deadline IO scheduler.
+config MQ_IOSCHED_KYBER
+ tristate "Kyber I/O scheduler"
+ default y
+ ---help---
+ The Kyber I/O scheduler is a low-overhead scheduler suitable for
+ multiqueue and other fast devices. Given a target latency, it will
+ self-tune queue depths to achieve that goal.
+
endmenu
endif
@@ -20,6 +20,7 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
+obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
@@ -221,14 +221,15 @@ int elevator_init(struct request_queue *q, char *name)
if (!e) {
/*
- * For blk-mq devices, we default to using mq-deadline,
- * if available, for single queue devices. If deadline
- * isn't available OR we have multiple queues, default
- * to "none".
+ * For blk-mq, we default to using mq-deadline for single-queue
+ * devices and kyber for multi-queue devices. We fall back to
+ * "none" if the preferred scheduler isn't available.
*/
if (q->mq_ops) {
if (q->nr_hw_queues == 1)
e = elevator_get("mq-deadline", false);
+ else
+ e = elevator_get("kyber", false);
if (!e)
return 0;
} else
new file mode 100644
@@ -0,0 +1,586 @@
+/*
+ * The Kyber I/O scheduler. Controls latency by throttling queue depths using
+ * scalable techniques.
+ *
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/module.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-stat.h"
+
+/* Scheduling domains. */
+enum {
+ KYBER_READ,
+ KYBER_WRITE,
+ KYBER_NUM_DOMAINS,
+};
+
+enum {
+ KYBER_MIN_DEPTH = 256,
+
+ /*
+ * Initial device-wide depths for each scheduling domain.
+ *
+ * Even for fast devices with lots of tags like NVMe, you can saturate
+ * the device with only a fraction of the maximum possible queue depth.
+ * So, we cap these to a reasonable value.
+ */
+ KYBER_READ_DEPTH = 256,
+ KYBER_WRITE_DEPTH = KYBER_READ_DEPTH / 4,
+
+ /*
+ * Scheduling domain batch sizes. We favor reads over writes.
+ */
+ KYBER_READ_BATCH = 16,
+ KYBER_WRITE_BATCH = 8,
+
+ /*
+ * In order to prevent starvation of synchronous requests by a flood of
+	 * asynchronous requests, we reserve 25% of the scheduler tags for
+	 * synchronous requests.
+ */
+ KYBER_ASYNC_PERCENT = 75,
+};
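
As an aside (illustration only, not part of the patch): the async percentage is
applied per sbitmap word of the scheduler tags, via the shallow-depth limit set
in kyber_get_request() further down. A minimal userspace sketch of the
arithmetic, assuming a per-word shift of 6 (64 tags per word):

#include <stdio.h>

#define KYBER_ASYNC_PERCENT 75

int main(void)
{
	/* Assumed for illustration; the real shift comes from the sched tags. */
	unsigned int shift = 6;
	unsigned int bits_per_word = 1U << shift;
	unsigned int async_depth = bits_per_word * KYBER_ASYNC_PERCENT / 100U;

	/* 48 of every 64 bits are usable by async requests; 16 stay reserved. */
	printf("per-word bits %u, async limit %u, reserved for sync %u\n",
	       bits_per_word, async_depth, bits_per_word - async_depth);
	return 0;
}
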
+
+struct kyber_queue_data {
+ struct request_queue *q;
+
+ struct blk_stat_callback *cb;
+
+ /*
+ * The device is divided into multiple scheduling domains based on the
+ * request type. Each domain has a fixed number of in-flight requests of
+ * that type device-wide, limited by these tokens.
+ */
+ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
+
+ /*
+ * The maximum depth that the domain tokens can be resized to.
+ */
+ unsigned int max_domain_tokens[KYBER_NUM_DOMAINS];
+
+ /* Batch size for each scheduling domain. */
+ unsigned int domain_batch[KYBER_NUM_DOMAINS];
+
+ /*
+ * Async request percentage, converted to per-word depth for
+ * sbitmap_get_shallow().
+ */
+ unsigned int async_depth;
+
+ /* Target read latency in nanoseconds. */
+ u64 read_lat_nsec;
+};
+
+struct kyber_hctx_data {
+ spinlock_t lock;
+ struct list_head rqs[KYBER_NUM_DOMAINS];
+ int cur_domain;
+ unsigned int batching;
+};
+
+/*
+ * Heuristics for limiting queue depths based on latency, similar to active
+ * queue management (AQM) techniques for network queues.
+ */
+static void kyber_stats_fn(struct blk_stat_callback *cb,
+ struct blk_stats *stats)
+{
+ struct kyber_queue_data *kqd = cb->data;
+ unsigned int orig_write_depth, write_depth;
+ u64 latency, target;
+
+ orig_write_depth = write_depth =
+ READ_ONCE(kqd->domain_tokens[KYBER_WRITE].sb.depth);
+
+ if (!stats->read.nr_samples) {
+ write_depth += 1;
+ goto resize;
+ }
+
+ latency = stats->read.mean;
+ target = kqd->read_lat_nsec;
+
+ if (latency >= 4 * target)
+ write_depth /= 2;
+ else if (latency >= 2 * target)
+ write_depth -= max(write_depth / 4, 1U);
+ else if (latency > target)
+ write_depth -= max(write_depth / 8, 1U);
+ else if (latency <= target / 2)
+ write_depth += 2;
+ else if (latency <= 3 * target / 4)
+ write_depth += 1;
+
+resize:
+ write_depth = clamp_t(unsigned int, write_depth, 1, KYBER_WRITE_DEPTH);
+ if (write_depth != orig_write_depth)
+ sbitmap_queue_resize(&kqd->domain_tokens[KYBER_WRITE], write_depth);
+
+ /* Continue monitoring latencies as long as we are throttling. */
+ if (write_depth < KYBER_WRITE_DEPTH && !timer_pending(&kqd->cb->timer))
+ blk_stat_arm_callback(kqd->cb, jiffies + msecs_to_jiffies(100));
+}
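
To make the response curve concrete (illustration only, not part of the patch),
here is a small userspace model of the same thresholds, run against the default
2 ms read target; the helper names are made up, and the !nr_samples case (no
reads completed in the window) is omitted:

#include <stdio.h>

#define KYBER_WRITE_DEPTH 64U

static unsigned int umax(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

/* Mirrors the write-depth adjustment in kyber_stats_fn() above. */
static unsigned int adjust_write_depth(unsigned int depth,
				       unsigned long long latency,
				       unsigned long long target)
{
	if (latency >= 4 * target)
		depth /= 2;
	else if (latency >= 2 * target)
		depth -= umax(depth / 4, 1);
	else if (latency > target)
		depth -= umax(depth / 8, 1);
	else if (latency <= target / 2)
		depth += 2;
	else if (latency <= 3 * target / 4)
		depth += 1;

	/* Clamp to [1, KYBER_WRITE_DEPTH], as the kernel code does. */
	if (depth == 0)
		depth = 1;
	if (depth > KYBER_WRITE_DEPTH)
		depth = KYBER_WRITE_DEPTH;
	return depth;
}

int main(void)
{
	unsigned long long target = 2000000ULL;	/* 2 ms */
	unsigned int depth = KYBER_WRITE_DEPTH;

	/* Two windows with a 9 ms mean read latency (>= 4x target): halve twice. */
	depth = adjust_write_depth(depth, 9000000ULL, target);	/* 64 -> 32 */
	depth = adjust_write_depth(depth, 9000000ULL, target);	/* 32 -> 16 */

	/* One window at 0.5 ms (<= target / 2): creep back up by 2. */
	depth = adjust_write_depth(depth, 500000ULL, target);	/* 16 -> 18 */

	printf("final write depth: %u\n", depth);
	return 0;
}
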
+
+/*
+ * Check if this request met our latency goal. If not, quickly gather some
+ * statistics and start throttling.
+ */
+static void kyber_check_latency(struct kyber_queue_data *kqd,
+ struct request *rq)
+{
+ u64 now, latency;
+ unsigned long expires;
+
+ if (req_op(rq) != REQ_OP_READ)
+ return;
+
+ /* If we are already managing the write depth, don't check again. */
+ if (kqd->domain_tokens[KYBER_WRITE].sb.depth < KYBER_WRITE_DEPTH)
+ return;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ latency = now - blk_stat_time(&rq->issue_stat);
+
+ if (latency <= kqd->read_lat_nsec)
+ return;
+
+ if (!timer_pending(&kqd->cb->timer)) {
+ expires = jiffies + msecs_to_jiffies(10);
+ blk_stat_arm_callback(kqd->cb, expires);
+ }
+}
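
Tying the two pieces together: the first read completion that misses the target
arms a quick 10 ms statistics window via the callback above, kyber_stats_fn()
then shrinks the write depth if the window's mean read latency confirms the
problem, and as long as the depth stays below KYBER_WRITE_DEPTH it keeps
re-arming 100 ms windows; once the depth has grown back to the maximum,
monitoring stops until another slow read completes.
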
+
+static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+{
+ /*
+ * All of the hardware queues have the same depth, so we can just grab
+ * the shift of the first one.
+ */
+ return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
+}
+
+static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
+{
+ struct kyber_queue_data *kqd;
+ unsigned int max_tokens;
+ unsigned int shift;
+ int ret = -ENOMEM;
+ int i;
+
+ kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+ if (!kqd)
+ goto err;
+ kqd->q = q;
+
+ kqd->cb = blk_stat_alloc_callback(kyber_stats_fn, kqd);
+ if (!kqd->cb)
+ goto err_kqd;
+
+ /*
+ * The maximum number of tokens for any scheduling domain is at least
+ * the queue depth of a single hardware queue. If the hardware doesn't
+ * have many tags, still provide a reasonable number.
+ */
+ max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
+ KYBER_MIN_DEPTH);
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ kqd->max_domain_tokens[i] = max_tokens;
+ ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
+ max_tokens, -1, false, GFP_KERNEL,
+ q->node);
+ if (ret) {
+ while (--i >= 0)
+ sbitmap_queue_free(&kqd->domain_tokens[i]);
+ goto err_cb;
+ }
+ }
+
+ sbitmap_queue_resize(&kqd->domain_tokens[KYBER_READ], KYBER_READ_DEPTH);
+ sbitmap_queue_resize(&kqd->domain_tokens[KYBER_WRITE], KYBER_WRITE_DEPTH);
+
+ kqd->domain_batch[KYBER_READ] = KYBER_READ_BATCH;
+ kqd->domain_batch[KYBER_WRITE] = KYBER_WRITE_BATCH;
+
+ shift = kyber_sched_tags_shift(kqd);
+ kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+
+ kqd->read_lat_nsec = 2000000ULL;
+
+ return kqd;
+
+err_cb:
+ blk_stat_free_callback(kqd->cb);
+err_kqd:
+ kfree(kqd);
+err:
+ return ERR_PTR(ret);
+}
+
+static void kyber_queue_data_free(struct kyber_queue_data *kqd)
+{
+ int i;
+
+ if (!kqd)
+ return;
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+ sbitmap_queue_free(&kqd->domain_tokens[i]);
+ blk_stat_free_callback(kqd->cb);
+ kfree(kqd);
+}
+
+static int kyber_hctx_data_init(struct blk_mq_hw_ctx *hctx)
+{
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ int i;
+
+ spin_lock_init(&khd->lock);
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+ INIT_LIST_HEAD(&khd->rqs[i]);
+
+ khd->cur_domain = 0;
+ khd->batching = 0;
+
+ return 0;
+}
+
+static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
+{
+ struct kyber_queue_data *kqd;
+ struct elevator_queue *eq;
+ int ret;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ kqd = kyber_queue_data_alloc(q);
+ if (IS_ERR(kqd)) {
+ ret = PTR_ERR(kqd);
+ goto err_kobj;
+ }
+
+ ret = blk_mq_sched_init_hctx_data(q, sizeof(struct kyber_hctx_data),
+ kyber_hctx_data_init, NULL);
+ if (ret)
+ goto err_kqd;
+
+ eq->elevator_data = kqd;
+ q->elevator = eq;
+
+ blk_stat_add_callback(q, kqd->cb);
+
+ return 0;
+
+err_kqd:
+ kyber_queue_data_free(kqd);
+err_kobj:
+ kobject_put(&eq->kobj);
+ return ret;
+}
+
+static void kyber_exit_sched(struct elevator_queue *e)
+{
+ struct kyber_queue_data *kqd = e->elevator_data;
+ struct request_queue *q = kqd->q;
+
+ blk_stat_remove_callback(q, kqd->cb);
+ blk_mq_sched_free_hctx_data(q, NULL);
+ kyber_queue_data_free(e->elevator_data);
+}
+
+static int op_to_sched_domain(int op)
+{
+ if (op_is_write(op))
+ return KYBER_WRITE;
+ else
+ return KYBER_READ;
+}
+
+static int kyber_get_domain_token(struct kyber_queue_data *kqd,
+ int sched_domain)
+{
+ struct sbitmap_queue *domain_tokens;
+
+ domain_tokens = &kqd->domain_tokens[sched_domain];
+ return __sbitmap_queue_get(domain_tokens);
+}
+
+static int rq_get_domain_token(struct request *rq)
+{
+ return (long)rq->elv.priv[0];
+}
+
+static void rq_set_domain_token(struct request *rq, int token)
+{
+ rq->elv.priv[0] = (void *)(long)token;
+}
+
+static void rq_clear_domain_token(struct kyber_queue_data *kqd,
+ struct request *rq)
+{
+ int sched_domain, nr;
+
+ nr = rq_get_domain_token(rq);
+ if (nr != -1) {
+ sched_domain = op_to_sched_domain(req_op(rq));
+ sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
+ rq->mq_ctx->cpu);
+ }
+}
+
+static struct request *kyber_get_request(struct request_queue *q,
+ unsigned int op,
+ struct blk_mq_alloc_data *data)
+{
+ struct kyber_queue_data *kqd = q->elevator->elevator_data;
+ struct request *rq;
+
+ /*
+ * We use the scheduler tags as per-hardware queue queueing tokens.
+ * Async requests can be limited at this stage.
+ */
+ if (!op_is_sync(op))
+ data->shallow_depth = READ_ONCE(kqd->async_depth);
+
+ rq = __blk_mq_alloc_request(data, op);
+ if (rq)
+ rq_set_domain_token(rq, -1);
+ return rq;
+}
+
+static void kyber_put_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct kyber_queue_data *kqd = q->elevator->elevator_data;
+
+ kyber_check_latency(kqd, rq);
+ rq_clear_domain_token(kqd, rq);
+ blk_mq_finish_request(rq);
+}
+
+static void kyber_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx,
+ struct kyber_hctx_data *khd)
+{
+ LIST_HEAD(rq_list);
+ struct request *rq, *next;
+
+ blk_mq_flush_busy_ctxs(hctx, &rq_list);
+ list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
+ int sched_domain;
+
+ sched_domain = op_to_sched_domain(req_op(rq));
+ list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
+ }
+}
+
+static struct request *
+kyber_dispatch_cur_domain(struct blk_mq_hw_ctx *hctx,
+ struct kyber_queue_data *kqd,
+ struct kyber_hctx_data *khd,
+ bool *flushed, bool *no_tokens)
+{
+ struct list_head *rqs;
+ struct request *rq;
+ int nr;
+
+ rqs = &khd->rqs[khd->cur_domain];
+ rq = list_first_entry_or_null(rqs, struct request, queuelist);
+
+ /*
+ * If there wasn't already a pending request and we haven't flushed the
+ * software queues yet, flush the software queues and check again.
+ */
+ if (!rq && !*flushed) {
+ kyber_flush_busy_ctxs(hctx, khd);
+ *flushed = true;
+ rq = list_first_entry_or_null(rqs, struct request, queuelist);
+ }
+
+ if (rq) {
+ nr = kyber_get_domain_token(kqd, khd->cur_domain);
+ if (nr == -1) {
+ *no_tokens = true;
+ } else {
+ khd->batching++;
+ rq_set_domain_token(rq, nr);
+ list_del_init(&rq->queuelist);
+ return rq;
+ }
+ }
+
+ /* There were either no pending requests or no tokens. */
+ return NULL;
+}
+
+/*
+ * Returns a request on success, NULL if there were no requests to dispatch, and
+ * ERR_PTR(-EBUSY) if there were requests to dispatch but no domain tokens for
+ * them.
+ */
+static struct request *__kyber_dispatch_request(struct kyber_queue_data *kqd,
+ struct kyber_hctx_data *khd,
+ struct blk_mq_hw_ctx *hctx)
+{
+ bool flushed = false, no_tokens = false;
+ struct request *rq;
+ int i;
+
+ /*
+	 * First, if we are still entitled to a batch, try to dispatch another
+	 * request from the domain we are currently batching.
+ */
+ if (khd->batching < READ_ONCE(kqd->domain_batch[khd->cur_domain])) {
+ rq = kyber_dispatch_cur_domain(hctx, kqd, khd, &flushed,
+ &no_tokens);
+ if (rq)
+ return rq;
+ }
+
+ /*
+	 * One of three things happened:
+ * 1. We were no longer entitled to a batch.
+ * 2. The domain we were batching didn't have any requests.
+ * 3. The domain we were batching was out of tokens.
+ *
+ * Start another batch. Note that this wraps back around to the original
+ * domain if no other domains have requests or tokens.
+ */
+ khd->batching = 0;
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ if (++khd->cur_domain >= KYBER_NUM_DOMAINS)
+ khd->cur_domain = 0;
+
+ rq = kyber_dispatch_cur_domain(hctx, kqd, khd, &flushed,
+ &no_tokens);
+ if (rq)
+ return rq;
+ }
+
+ return no_tokens ? ERR_PTR(-EBUSY) : NULL;
+}
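
Illustration only (not part of the patch): a toy userspace model of the
batching and round-robin policy above, using the read/write batch sizes from
the patch. The pending and token counts are made up, and where the real code
returns -EBUSY and leans on the queue-restart machinery when requests are stuck
waiting for tokens, this sketch simply stops:

#include <stdio.h>

enum { KYBER_READ, KYBER_WRITE, KYBER_NUM_DOMAINS };

int main(void)
{
	static const unsigned int batch[KYBER_NUM_DOMAINS] = { 16, 8 };
	static const char *name[KYBER_NUM_DOMAINS] = { "read", "write" };

	/* Made-up inputs: pending requests and free domain tokens. */
	unsigned int pending[KYBER_NUM_DOMAINS] = { 20, 20 };
	unsigned int tokens[KYBER_NUM_DOMAINS] = { 256, 4 };
	unsigned int cur = KYBER_READ, batching = 0, i;

	for (;;) {
		/* Stay on the current domain while entitled to the batch. */
		if (batching < batch[cur] && pending[cur] && tokens[cur]) {
			pending[cur]--;
			tokens[cur]--;
			batching++;
			printf("dispatch %s (batch %u)\n", name[cur], batching);
			continue;
		}

		/* Otherwise rotate to the next domain with work and a token. */
		batching = 0;
		for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
			cur = (cur + 1) % KYBER_NUM_DOMAINS;
			if (pending[cur] && tokens[cur])
				break;
		}
		if (i == KYBER_NUM_DOMAINS)
			break;
	}
	/* Prints 16 reads, 4 token-limited writes, 4 more reads, then stops. */
	return 0;
}
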
+
+static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ struct request *rq;
+
+ spin_lock(&khd->lock);
+
+ rq = __kyber_dispatch_request(kqd, khd, hctx);
+ if (IS_ERR(rq)) {
+ /*
+ * We failed to get a domain token. Mark the queue as needing a
+ * restart and try again in case a token was freed before we set
+ * the restart bit.
+ */
+ blk_mq_sched_mark_restart_queue(hctx);
+ rq = __kyber_dispatch_request(kqd, khd, hctx);
+ if (IS_ERR(rq))
+ rq = NULL;
+ }
+
+ spin_unlock(&khd->lock);
+
+ return rq;
+}
+
+static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ int i;
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ if (!list_empty_careful(&khd->rqs[i]))
+ return true;
+ }
+ return false;
+}
+
+static ssize_t kyber_read_lat_show(struct elevator_queue *e, char *page)
+{
+ struct kyber_queue_data *kqd = e->elevator_data;
+
+ return sprintf(page, "%llu\n", kqd->read_lat_nsec);
+}
+
+static ssize_t kyber_read_lat_store(struct elevator_queue *e, const char *page,
+ size_t count)
+{
+ struct kyber_queue_data *kqd = e->elevator_data;
+ unsigned long long nsec;
+ int ret;
+
+ ret = kstrtoull(page, 10, &nsec);
+ if (ret)
+ return ret;
+
+ WRITE_ONCE(kqd->read_lat_nsec, nsec);
+
+ return count;
+}
+
+static struct elv_fs_entry kyber_sched_attrs[] = {
+ __ATTR(read_lat_nsec, 0644, kyber_read_lat_show, kyber_read_lat_store),
+ __ATTR_NULL
+};
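
Usage note (path assumed from the usual elevator sysfs layout, not spelled out
in the patch): with the attribute above, the target is tunable at runtime,
typically as /sys/block/<dev>/queue/iosched/read_lat_nsec once kyber is the
active scheduler; writing e.g. 1000000 there tightens the read latency target
from the 2 ms default to 1 ms, and kyber_read_lat_store() accepts any decimal
value in nanoseconds.
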
+
+static struct elevator_type kyber_sched = {
+ .ops.mq = {
+ .init_sched = kyber_init_sched,
+ .exit_sched = kyber_exit_sched,
+ .get_request = kyber_get_request,
+ .put_request = kyber_put_request,
+ .dispatch_request = kyber_dispatch_request,
+ .has_work = kyber_has_work,
+ },
+ .uses_mq = true,
+ .elevator_attrs = kyber_sched_attrs,
+ .elevator_name = "kyber",
+ .elevator_owner = THIS_MODULE,
+};
+
+static int __init kyber_init(void)
+{
+ return elv_register(&kyber_sched);
+}
+
+static void __exit kyber_exit(void)
+{
+ elv_unregister(&kyber_sched);
+}
+
+module_init(kyber_init);
+module_exit(kyber_exit);
+
+MODULE_AUTHOR("Omar Sandoval");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kyber I/O scheduler");