Prepare for supporting bio based io polling: move the blk polling code into
a dedicated source file, block/blk-poll.c. The three helpers shared with
blk-mq.c are moved into the private header blk-mq.h.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/Makefile   |   3 +-
 block/blk-mq.c   | 230 -----------------------------------------------
 block/blk-mq.h   |  40 +++++++++
 block/blk-poll.c | 196 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 238 insertions(+), 231 deletions(-)
 create mode 100644 block/blk-poll.c

diff --git a/block/Makefile b/block/Makefile
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
blk-exec.o blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
- genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
+ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
+ blk-poll.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
diff --git a/block/blk-mq.c b/block/blk-mq.c
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -43,26 +43,6 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
- int ddir, sectors, bucket;
-
- ddir = rq_data_dir(rq);
- sectors = blk_rq_stats_sectors(rq);
-
- bucket = ddir + 2 * ilog2(sectors);
-
- if (bucket < 0)
- return -1;
- else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
- return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
- return bucket;
-}
-
/*
* Check if any of the ctx, dispatch list or elevator
* have pending work in this hardware queue.
@@ -3726,216 +3706,6 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
- if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
- blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
- return true;
- blk_stat_add_callback(q, q->poll_cb);
- return false;
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
- /*
- * We don't arm the callback if polling stats are not enabled or the
- * callback is already active.
- */
- if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
- blk_stat_is_active(q->poll_cb))
- return;
-
- blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
- struct request_queue *q = cb->data;
- int bucket;
-
- for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
- if (cb->stat[bucket].nr_samples)
- q->poll_stat[bucket] = cb->stat[bucket];
- }
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
- struct request *rq)
-{
- unsigned long ret = 0;
- int bucket;
-
- /*
- * If stats collection isn't on, don't sleep but turn it on for
- * future users
- */
- if (!blk_poll_stats_enable(q))
- return 0;
-
- /*
- * As an optimistic guess, use half of the mean service time
- * for this type of request. We can (and should) make this smarter.
- * For instance, if the completion latencies are tight, we can
- * get closer than just half the mean. This is especially
- * important on devices where the completion latencies are longer
- * than ~10 usec. We do use the stats for the relevant IO size
- * if available which does lead to better estimates.
- */
- bucket = blk_mq_poll_stats_bkt(rq);
- if (bucket < 0)
- return ret;
-
- if (q->poll_stat[bucket].nr_samples)
- ret = (q->poll_stat[bucket].mean + 1) / 2;
-
- return ret;
-}
-
-static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
- struct request *rq)
-{
- struct hrtimer_sleeper hs;
- enum hrtimer_mode mode;
- unsigned int nsecs;
- ktime_t kt;
-
- if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
- return false;
-
- /*
- * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
- *
- * 0: use half of prev avg
- * >0: use this specific value
- */
- if (q->poll_nsec > 0)
- nsecs = q->poll_nsec;
- else
- nsecs = blk_mq_poll_nsecs(q, rq);
-
- if (!nsecs)
- return false;
-
- rq->rq_flags |= RQF_MQ_POLL_SLEPT;
-
- /*
- * This will be replaced with the stats tracking code, using
- * 'avg_completion_time / 2' as the pre-sleep target.
- */
- kt = nsecs;
-
- mode = HRTIMER_MODE_REL;
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
- hrtimer_set_expires(&hs.timer, kt);
-
- do {
- if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
- break;
- set_current_state(TASK_UNINTERRUPTIBLE);
- hrtimer_sleeper_start_expires(&hs, mode);
- if (hs.task)
- io_schedule();
- hrtimer_cancel(&hs.timer);
- mode = HRTIMER_MODE_ABS;
- } while (hs.task && !signal_pending(current));
-
- __set_current_state(TASK_RUNNING);
- destroy_hrtimer_on_stack(&hs.timer);
- return true;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
-{
- struct request *rq;
-
- if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
- return false;
-
- if (!blk_qc_t_is_internal(cookie))
- rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
- else {
- rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
- /*
- * With scheduling, if the request has completed, we'll
- * get a NULL return here, as we clear the sched tag when
- * that happens. The request still remains valid, like always,
- * so we should be safe with just the NULL check.
- */
- if (!rq)
- return false;
- }
-
- return blk_mq_poll_hybrid_sleep(q, rq);
-}
-
-/**
- * blk_poll - poll for IO completions
- * @q: the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- * Poll for completions on the passed in queue. Returns number of
- * completed entries found. If @spin is true, then blk_poll will continue
- * looping until at least one completion is found, unless the task is
- * otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
-{
- struct blk_mq_hw_ctx *hctx;
- long state;
-
- if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q))
- return 0;
-
- if (current->plug)
- blk_flush_plug_list(current->plug, false);
-
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-
- /*
- * If we sleep, have the caller restart the poll loop to reset
- * the state. Like for the other success return cases, the
- * caller is responsible for checking if the IO completed. If
- * the IO isn't complete, we'll get called again and will go
- * straight to the busy poll loop. If specified not to spin,
- * we also should not sleep.
- */
- if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
- return 1;
-
- hctx->poll_considered++;
-
- state = current->state;
- do {
- int ret;
-
- hctx->poll_invoked++;
-
- ret = q->mq_ops->poll(hctx);
- if (ret > 0) {
- hctx->poll_success++;
- __set_current_state(TASK_RUNNING);
- return ret;
- }
-
- if (signal_pending_state(state, current))
- __set_current_state(TASK_RUNNING);
-
- if (current->state == TASK_RUNNING)
- return 1;
- if (ret < 0 || !spin)
- break;
- cpu_relax();
- } while (!need_resched());
-
- __set_current_state(TASK_RUNNING);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_poll);
-
unsigned int blk_mq_rq_cpu(struct request *rq)
{
return rq->mq_ctx->cpu;
diff --git a/block/blk-mq.h b/block/blk-mq.h
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -324,5 +324,45 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return __blk_mq_active_requests(hctx) < depth;
}
+static inline int blk_mq_poll_stats_bkt(const struct request *rq)
+{
+ int ddir, sectors, bucket;
+
+ ddir = rq_data_dir(rq);
+ sectors = blk_rq_stats_sectors(rq);
+
+ bucket = ddir + 2 * ilog2(sectors);
+
+ if (bucket < 0)
+ return -1;
+ else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
+ return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
+
+ return bucket;
+}
+
+static inline void blk_mq_poll_stats_start(struct request_queue *q)
+{
+ /*
+ * We don't arm the callback if polling stats are not enabled or the
+ * callback is already active.
+ */
+ if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ blk_stat_is_active(q->poll_cb))
+ return;
+
+ blk_stat_activate_msecs(q->poll_cb, 100);
+}
+
+static inline void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
+{
+ struct request_queue *q = cb->data;
+ int bucket;
+
+ for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+ if (cb->stat[bucket].nr_samples)
+ q->poll_stat[bucket] = cb->stat[bucket];
+ }
+}
#endif
diff --git a/block/blk-poll.c b/block/blk-poll.c
new file mode 100644
--- /dev/null
+++ b/block/blk-poll.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/sched.h>
+#include <linux/hrtimer.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+
+/* Enable polling stats and return whether they were already enabled. */
+static bool blk_poll_stats_enable(struct request_queue *q)
+{
+ if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
+ return true;
+ blk_stat_add_callback(q, q->poll_cb);
+ return false;
+}
+
+static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
+ struct request *rq)
+{
+ unsigned long ret = 0;
+ int bucket;
+
+ /*
+ * If stats collection isn't on, don't sleep but turn it on for
+ * future users
+ */
+ if (!blk_poll_stats_enable(q))
+ return 0;
+
+ /*
+ * As an optimistic guess, use half of the mean service time
+ * for this type of request. We can (and should) make this smarter.
+ * For instance, if the completion latencies are tight, we can
+ * get closer than just half the mean. This is especially
+ * important on devices where the completion latencies are longer
+ * than ~10 usec. We do use the stats for the relevant IO size
+ * if available which does lead to better estimates.
+ */
+ bucket = blk_mq_poll_stats_bkt(rq);
+ if (bucket < 0)
+ return ret;
+
+ if (q->poll_stat[bucket].nr_samples)
+ ret = (q->poll_stat[bucket].mean + 1) / 2;
+
+ return ret;
+}
+
+static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+ struct request *rq)
+{
+ struct hrtimer_sleeper hs;
+ enum hrtimer_mode mode;
+ unsigned int nsecs;
+ ktime_t kt;
+
+ if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
+ return false;
+
+ /*
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
+ *
+ * 0: use half of prev avg
+ * >0: use this specific value
+ */
+ if (q->poll_nsec > 0)
+ nsecs = q->poll_nsec;
+ else
+ nsecs = blk_mq_poll_nsecs(q, rq);
+
+ if (!nsecs)
+ return false;
+
+ rq->rq_flags |= RQF_MQ_POLL_SLEPT;
+
+ /*
+ * This will be replaced with the stats tracking code, using
+ * 'avg_completion_time / 2' as the pre-sleep target.
+ */
+ kt = nsecs;
+
+ mode = HRTIMER_MODE_REL;
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires(&hs.timer, kt);
+
+ do {
+ if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
+ break;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ hrtimer_sleeper_start_expires(&hs, mode);
+ if (hs.task)
+ io_schedule();
+ hrtimer_cancel(&hs.timer);
+ mode = HRTIMER_MODE_ABS;
+ } while (hs.task && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&hs.timer);
+ return true;
+}
+
+static bool blk_mq_poll_hybrid(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
+{
+ struct request *rq;
+
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
+ return false;
+
+ if (!blk_qc_t_is_internal(cookie))
+ rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+ else {
+ rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
+ /*
+ * With scheduling, if the request has completed, we'll
+ * get a NULL return here, as we clear the sched tag when
+ * that happens. The request still remains valid, like always,
+ * so we should be safe with just the NULL check.
+ */
+ if (!rq)
+ return false;
+ }
+
+ return blk_mq_poll_hybrid_sleep(q, rq);
+}
+
+/**
+ * blk_poll - poll for IO completions
+ * @q: the queue
+ * @cookie: cookie passed back at IO submission time
+ * @spin: whether to spin for completions
+ *
+ * Description:
+ * Poll for completions on the passed in queue. Returns number of
+ * completed entries found. If @spin is true, then blk_poll will continue
+ * looping until at least one completion is found, unless the task is
+ * otherwise marked running (or we need to reschedule).
+ */
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
+{
+ struct blk_mq_hw_ctx *hctx;
+ long state;
+
+ if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q))
+ return 0;
+
+ if (current->plug)
+ blk_flush_plug_list(current->plug, false);
+
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+
+ /*
+ * If we sleep, have the caller restart the poll loop to reset
+ * the state. Like for the other success return cases, the
+ * caller is responsible for checking if the IO completed. If
+ * the IO isn't complete, we'll get called again and will go
+ * straight to the busy poll loop. If specified not to spin,
+ * we also should not sleep.
+ */
+ if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
+ return 1;
+
+ hctx->poll_considered++;
+
+ state = current->state;
+ do {
+ int ret;
+
+ hctx->poll_invoked++;
+
+ ret = q->mq_ops->poll(hctx);
+ if (ret > 0) {
+ hctx->poll_success++;
+ __set_current_state(TASK_RUNNING);
+ return ret;
+ }
+
+ if (signal_pending_state(state, current))
+ __set_current_state(TASK_RUNNING);
+
+ if (current->state == TASK_RUNNING)
+ return 1;
+ if (ret < 0 || !spin)
+ break;
+ cpu_relax();
+ } while (!need_resched());
+
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_poll);
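
For context, a minimal sketch of how a polled submission path drives the
blk_poll() interface being moved here, loosely modeled on the
__blkdev_direct_IO_simple() fast path of this kernel generation. It is
illustrative only and not part of the patch; demo_poll_for_completion() and
the caller-provided @done flag are made-up names.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>

/* Submit @bio and poll its queue until the bio's end_io handler sets @done. */
static void demo_poll_for_completion(struct block_device *bdev,
				     struct bio *bio, bool *done)
{
	blk_qc_t cookie = submit_bio(bio);

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		/*
		 * Spin for completions using the submission cookie; if none
		 * were found (or the queue does not support polling), sleep
		 * until the interrupt-driven completion wakes us.
		 */
		if (!blk_poll(bdev_get_queue(bdev), cookie, true))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}

The cookie returned at submission time encodes the hardware queue number and
tag, which is how blk_poll() locates the hctx and request to poll, as seen in
the relocated code above.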