[2/7] blk-mq: replace timeout synchronization with a RCU and generation based scheme

Message ID	20171216120726.517153-3-tj@kernel.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-btrfs-owner@kernel.org> From: Tejun Heo <tj@kernel.org> To: jack@suse.cz, axboe@kernel.dk, clm@fb.com, jbacik@fb.com Cc: kernel-team@fb.com, linux-kernel@vger.kernel.org, linux-btrfs@vger.kernel.org, peterz@infradead.org, jianchao.w.wang@oracle.com, Bart.VanAssche@wdc.com, Tejun Heo <tj@kernel.org> Subject: [PATCH 2/7] blk-mq: replace timeout synchronization with a RCU and generation based scheme Date: Sat, 16 Dec 2017 04:07:21 -0800 Message-Id: <20171216120726.517153-3-tj@kernel.org> In-Reply-To: <20171216120726.517153-1-tj@kernel.org> References: <20171216120726.517153-1-tj@kernel.org> Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk

diff --git a/block/blk-core.c b/block/blk-core.c index b888175..6034623 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); } EXPORT_SYMBOL(blk_rq_init); diff --git a/block/blk-mq.c b/block/blk-mq.c index acf4fbb..abd5d01 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq) if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); + if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); if (rq->rq_flags & RQF_STATS) { @@ -557,6 +560,30 @@ static void __blk_mq_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) +{ + unsigned long flags; + + local_irq_save(flags); + u64_stats_update_begin(&rq->aborted_gstate_sync); + rq->aborted_gstate = gstate; + u64_stats_update_end(&rq->aborted_gstate_sync); + local_irq_restore(flags); +} + +static u64 blk_mq_rq_aborted_gstate(struct request *rq) +{ + unsigned int start; + u64 aborted_gstate; + + do { + start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); + aborted_gstate = rq->aborted_gstate; + } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); + + return aborted_gstate; +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -574,14 +601,21 @@ void blk_mq_complete_request(struct request *rq) if (unlikely(blk_should_fake_timeout(q))) return; + /* + * If @rq->aborted_gstate equals the current instance, timeout is + * claiming @rq and we lost. This is synchronized through RCU. + * See blk_mq_timeout_work() for details. + */ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - if (!blk_mark_rq_complete(rq)) + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate && + !blk_mark_rq_complete(rq)) __blk_mq_complete_request(rq); rcu_read_unlock(); } else { srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - if (!blk_mark_rq_complete(rq)) + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate && + !blk_mark_rq_complete(rq)) __blk_mq_complete_request(rq); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } @@ -608,34 +642,32 @@ void blk_mq_start_request(struct request *rq) wbt_issue(q->rq_wb, &rq->issue_stat); } - blk_add_timer(rq); - + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* - * Mark us as started and clear complete. Complete might have been - * set if requeue raced with timeout, which then marked it as - * complete. So be sure to clear complete again when we start - * the request, otherwise we'll ignore the completion event. + * Mark @rq in-flight which also advances the generation number, + * and register for timeout. Protect with a seqcount to allow the + * timeout path to read both @rq->gstate and @rq->deadline + * coherently. * - * Ensure that ->deadline is visible before we set STARTED, such that - * blk_mq_check_expired() is guaranteed to observe our ->deadline when - * it observes STARTED. + * This is the only place where a request is marked in-flight. If + * the timeout path reads an in-flight @rq->gstate, the + * @rq->deadline it reads together under @rq->gstate_seq is + * guaranteed to be the matching one. */ - smp_wmb(); + preempt_disable(); + write_seqcount_begin(&rq->gstate_seq); + + blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); + blk_add_timer(rq); + + write_seqcount_end(&rq->gstate_seq); + preempt_enable(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { - /* - * Coherence order guarantees these consecutive stores to a - * single variable propagate in the specified order. Thus the - * clear_bit() is ordered _after_ the set bit. See - * blk_mq_check_expired(). - * - * (the bits must be part of the same byte for this to be - * true). - */ + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); - } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -668,6 +700,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -765,6 +798,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { unsigned long next; unsigned int next_set; + unsigned int nr_expired; }; void blk_mq_rq_timed_out(struct request *req, bool reserved) @@ -792,6 +826,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) __blk_mq_complete_request(req); break; case BLK_EH_RESET_TIMER: + /* + * As nothing prevents from completion happening while + * ->aborted_gstate is set, this may lead to ignored + * completions and further spurious timeouts. + */ + blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); blk_clear_rq_complete(req); break; @@ -807,50 +847,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - unsigned long deadline; + unsigned long gstate, deadline; + int start; + + might_sleep(); if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; - /* - * Ensures that if we see STARTED we must also see our - * up-to-date deadline, see blk_mq_start_request(). - */ - smp_rmb(); - - deadline = READ_ONCE(rq->deadline); + /* read coherent snapshots of @rq->state_gen and @rq->deadline */ + while (true) { + start = read_seqcount_begin(&rq->gstate_seq); + gstate = READ_ONCE(rq->gstate); + deadline = rq->deadline; + if (!read_seqcount_retry(&rq->gstate_seq, start)) + break; + cond_resched(); + } - /* - * The rq being checked may have been freed and reallocated - * out already here, we avoid this race by checking rq->deadline - * and REQ_ATOM_COMPLETE flag together: - * - * - if rq->deadline is observed as new value because of - * reusing, the rq won't be timed out because of timing. - * - if rq->deadline is observed as previous value, - * REQ_ATOM_COMPLETE flag won't be cleared in reuse path - * because we put a barrier between setting rq->deadline - * and clearing the flag in blk_mq_start_request(), so - * this rq won't be timed out too. - */ - if (time_after_eq(jiffies, deadline)) { - if (!blk_mark_rq_complete(rq)) { - /* - * Again coherence order ensures that consecutive reads - * from the same variable must be in that order. This - * ensures that if we see COMPLETE clear, we must then - * see STARTED set and we'll ignore this timeout. - * - * (There's also the MB implied by the test_and_clear()) - */ - blk_mq_rq_timed_out(rq, reserved); - } + /* if in-flight && overdue, mark for abortion */ + if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && + time_after_eq(jiffies, deadline)) { + blk_mq_rq_update_aborted_gstate(rq, gstate); + data->nr_expired++; + hctx->nr_expired++; } else if (!data->next_set || time_after(data->next, deadline)) { data->next = deadline; data->next_set = 1; } } +static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + /* + * We marked @rq->aborted_gstate and waited for RCU. If there were + * completions that we lost to, they would have finished and + * updated @rq->gstate by now; otherwise, the completion path is + * now guaranteed to see @rq->aborted_gstate and yield. If + * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. + */ + if (READ_ONCE(rq->gstate) == rq->aborted_gstate && + !blk_mark_rq_complete(rq)) + blk_mq_rq_timed_out(rq, reserved); +} + static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = @@ -858,7 +899,9 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, + .nr_expired = 0, }; + struct blk_mq_hw_ctx *hctx; int i; /* A deadlock might occur if a request is stuck requiring a @@ -877,14 +920,40 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; + /* scan for the expired ones and set their ->aborted_gstate */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); + if (data.nr_expired) { + bool has_rcu = false; + + /* + * Wait till everyone sees ->aborted_gstate. The + * sequential waits for SRCUs aren't ideal. If this ever + * becomes a problem, we can add per-hw_ctx rcu_head and + * wait in parallel. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_expired) + continue; + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + has_rcu = true; + else + synchronize_srcu(hctx->queue_rq_srcu); + + hctx->nr_expired = 0; + } + if (has_rcu) + synchronize_rcu(); + + /* terminate the ones we won */ + blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); + } + if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { - struct blk_mq_hw_ctx *hctx; - queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1879,6 +1948,22 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, int node) +{ + int ret; + + if (set->ops->init_request) { + ret = set->ops->init_request(set, rq, hctx_idx, node); + if (ret) + return ret; + } + + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); + return 0; +} + int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { @@ -1940,12 +2025,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct request *rq = p; tags->static_rqs[i] = rq; - if (set->ops->init_request) { - if (set->ops->init_request(set, rq, hctx_idx, - node)) { - tags->static_rqs[i] = NULL; - goto fail; - } + if (blk_mq_init_request(set, rq, hctx_idx, node)) { + tags->static_rqs[i] = NULL; + goto fail; } p += rq_size; @@ -2084,9 +2166,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->fq) goto sched_exit_hctx; - if (set->ops->init_request && - set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, - node)) + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) @@ -2980,12 +3060,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) static int __init blk_mq_init(void) { - /* - * See comment in block/blk.h rq_atomic_flags enum - */ - BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != - (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); - cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c7c3ff..cf01f6f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,6 +27,19 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +/* + * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value + * and the upper bits the generation number. + */ +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + + MQ_RQ_STATE_BITS = 2, + MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, + MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS, +}; + void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline int blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; +} + +/** + * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request + * @rq: target request. + * @state: new state to set. + * + * Set @rq's state to @state. The caller is responsible for ensuring that + * there are no other updaters. A request can transition into IN_FLIGHT + * only from IDLE and doing so increments the generation number. + */ +static inline void blk_mq_rq_update_state(struct request *rq, + enum mq_rq_state state) +{ + u64 old_val = READ_ONCE(rq->gstate); + u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; + + if (state == MQ_RQ_IN_FLIGHT) { + WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); + new_val += MQ_RQ_GEN_INC; + } + + /* avoid exposing interim values */ + WRITE_ONCE(rq->gstate, new_val); +} + static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 764ecf9..6427be7 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -208,7 +208,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - WRITE_ONCE(req->deadline, jiffies + req->timeout); + req->deadline = jiffies + req->timeout; /* * Only the non-mq case needs to add the request to a protected list. diff --git a/block/blk.h b/block/blk.h index 3f14469..9cb2739 100644 --- a/block/blk.h +++ b/block/blk.h @@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { - /* - * Keep these two bits first - not because we depend on the - * value of them, but we do depend on them being in the same - * byte of storage to ensure ordering on writes. Keeping them - * first will achieve that nicely. - */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 95c9a5c..460798d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { unsigned int queue_num; atomic_t nr_active; + unsigned int nr_expired; struct hlist_node cpuhp_dead; struct kobject kobj; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca1..2d6fd11 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,8 @@ #include <linux/percpu-refcount.h> #include <linux/scatterlist.h> #include <linux/blkzoned.h> +#include <linux/seqlock.h> +#include <linux/u64_stats_sync.h> struct module; struct scsi_ioctl_command; @@ -228,6 +230,27 @@ struct request { unsigned short write_hint; + /* + * On blk-mq, the lower bits of ->gstate carry the MQ_RQ_* state + * value and the upper bits the generation number which is + * monotonically incremented and used to distinguish the reuse + * instances. + * + * ->gstate_seq allows updates to ->gstate and other fields + * (currently ->deadline) during request start to be read + * atomically from the timeout path, so that it can operate on a + * coherent set of information. + */ + seqcount_t gstate_seq; + u64 gstate; + + /* + * ->aborted_gstate is used by the timeout to claim a specific + * recycle instance of this request. See blk_mq_timeout_work(). + */ + struct u64_stats_sync aborted_gstate_sync; + u64 aborted_gstate; + unsigned long deadline; struct list_head timeout_list;

[2/7] blk-mq: replace timeout synchronization with a RCU and generation based scheme

Commit Message

Patch