[RFC,18/22] block, bfq: add Early Queue Merge (EQM)

Message ID	1454364778-25179-19-git-send-email-paolo.valente@linaro.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-block-owner@kernel.org> From: Paolo Valente <paolo.valente@linaro.org> To: Jens Axboe <axboe@kernel.dk>, Tejun Heo <tj@kernel.org> Cc: Fabio Checconi <fchecconi@gmail.com>, Arianna Avanzini <avanzini.arianna@gmail.com>, linux-block@vger.kernel.org, linux-kernel@vger.kernel.org, ulf.hansson@linaro.org, linus.walleij@linaro.org, broonie@kernel.org, Mauro Andreolini <mauro.andreolini@unimore.it>, Paolo Valente <paolo.valente@linaro.org> Subject: [PATCH RFC 18/22] block, bfq: add Early Queue Merge (EQM) Date: Mon, 1 Feb 2016 23:12:54 +0100 Message-Id: <1454364778-25179-19-git-send-email-paolo.valente@linaro.org> In-Reply-To: <1454364778-25179-1-git-send-email-paolo.valente@linaro.org> References: <1454364778-25179-1-git-send-email-paolo.valente@linaro.org> UNIMORE-X-SA-Score: -2.9 Sender: linux-block-owner@vger.kernel.org Precedence: bulk

diff --git a/block/bfq.h b/block/bfq.h index 2960e5d..a246b66 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -168,6 +168,10 @@ struct bfq_group; * @ioprio_class: the ioprio_class in use. * @new_ioprio_class: when an ioprio_class change is requested, the new * ioprio_class value. + * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. + * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). * @sort_list: sorted list of pending requests. * @next_rq: if fifo isn't expired, next request to serve. * @queued: nr of requests queued in @sort_list. @@ -205,13 +209,16 @@ struct bfq_group; * @service_from_backlogged: cumulative service received from the @bfq_queue * since the last transition from idle to * backlogged + * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the + * queue is shared * * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async. @cgroup holds a reference to - * the cgroup, to be sure that it does not disappear while a bfqq - * still references it (mostly to avoid races between request issuing - * and task migration followed by cgroup destruction). All the fields - * are protected by the queue lock of the containing bfqd. + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). + * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_queue { atomic_t ref; @@ -220,6 +227,11 @@ struct bfq_queue { unsigned short ioprio, new_ioprio; unsigned short ioprio_class, new_ioprio_class; + /* fields for cooperating queues handling */ + struct bfq_queue *new_bfqq; + struct rb_node pos_node; + struct rb_root *pos_root; + struct rb_root sort_list; struct request *next_rq; int queued[2]; @@ -246,6 +258,7 @@ struct bfq_queue { unsigned int requests_within_timer; pid_t pid; + struct bfq_io_cq *bic; /* weight-raising fields */ unsigned long wr_cur_max_time; @@ -277,6 +290,21 @@ struct bfq_ttime { * @ttime: associated @bfq_ttime struct * @ioprio: per (request_queue, blkcg) ioprio. * @blkcg_id: id of the blkcg the related io_cq belongs to. + * @wr_time_left: snapshot of the time left before weight raising ends + * for the sync queue associated to this process; this + * snapshot is taken to remember this value while the weight + * raising is suspended because the queue is merged with a + * shared queue, and is used to set @raising_cur_max_time + * when the queue is split from the shared queue and its + * weight is raised again + * @saved_idle_window: same purpose as the previous field for the idle + * window + * @saved_IO_bound: same purpose as the previous two fields for the I/O + * bound classification of a queue + * @cooperations: counter of consecutive successful queue merges underwent + * by any of the process' @bfq_queues + * @failed_cooperations: counter of consecutive failed queue merges of any + * of the process' @bfq_queues */ struct bfq_io_cq { struct io_cq icq; /* must be the first member */ @@ -286,6 +314,13 @@ struct bfq_io_cq { #ifdef CONFIG_CFQ_GROUP_IOSCHED uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif + + unsigned int wr_time_left; + bool saved_idle_window; + bool saved_IO_bound; + + unsigned int cooperations; + unsigned int failed_cooperations; }; enum bfq_device_speed { @@ -338,6 +373,12 @@ enum bfq_device_speed { * they are charged for the whole allocated budget, to try * to preserve a behavior reasonably fair among them, but * without service-domain guarantees). + * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is + * no more granted any weight-raising. + * @bfq_failed_cooperations: number of consecutive failed cooperation + * chances after which weight-raising is restored + * to a queue subject to more than bfq_coop_thresh + * queue merges. * @bfq_requests_within_timer: number of consecutive requests that must be * issued within the idle time slice to set * again idling to a queue which was marked as @@ -407,6 +448,8 @@ struct bfq_data { int bfq_max_budget_async_rq; unsigned int bfq_timeout[2]; + unsigned int bfq_coop_thresh; + unsigned int bfq_failed_cooperations; unsigned int bfq_requests_within_timer; bool low_latency; @@ -445,6 +488,9 @@ enum bfqq_state_flags { * may need softrt-next-start * update */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ }; #define BFQ_BFQQ_FNS(name) \ @@ -470,6 +516,9 @@ BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(budget_new); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(constantly_seeky); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(just_split); BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS @@ -581,6 +630,9 @@ struct bfq_group_data { * to avoid too many special cases during group creation/ * migration. * @stats: stats for this bfqg. + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_find_close_cooperator()). * * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup * there is a set of bfq_groups, each one collecting the lower-level @@ -605,6 +657,8 @@ struct bfq_group { struct bfq_entity *my_entity; + struct rb_root rq_pos_tree; + struct bfqg_stats stats; }; @@ -689,6 +743,27 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); } +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + if (!group_entity) + group_entity = &bfqq->bfqd->root_group->entity; + + return container_of(group_entity, struct bfq_group, entity); +} + +#else + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} + +#endif + static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0539df4..ab298d2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1675,6 +1675,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) * in bfq_init_queue() */ bfqg->bfqd = bfqd; + bfqg->rq_pos_tree = RB_ROOT; } static void bfq_pd_free(struct blkg_policy_data *pd) @@ -1743,6 +1744,9 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, return bfqg; } +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. @@ -1783,8 +1787,11 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, entity->sched_data = &bfqg->sched_data; bfqg_get(bfqg); - if (busy && resume) - bfq_activate_bfqq(bfqd, bfqq); + if (busy) { + bfq_pos_tree_add_move(bfqd, bfqq); + if (resume) + bfq_activate_bfqq(bfqd, bfqq); + } if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); @@ -2496,6 +2503,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (unsigned long long)sector, + bfqq ? bfqq->pid : 0); + + return bfqq; +} + +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (!__bfqq) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + static struct request *bfq_find_next_rq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *last) @@ -2578,6 +2651,55 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) +{ + return bfqq->bic ? bfqq->bic->cooperations : 0; +} + +static void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + if (bic->saved_idle_window) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + if (bic->wr_time_left && bfqq->bfqd->low_latency && + bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { + /* + * Start a weight raising period with the duration given by + * the raising_time_left snapshot. + */ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues++; + bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bic->wr_time_left; + bfqq->last_wr_start_finish = jiffies; + bfqq->entity.prio_changed = 1; + } + /* + * Clear wr_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. + */ + bic->wr_time_left = 0; +} + +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + lockdep_assert_held(bfqq->bfqd->queue->queue_lock); + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + + return process_refs; +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -2585,7 +2707,7 @@ static void bfq_add_request(struct request *rq) struct bfq_data *bfqd = bfqq->bfqd; struct request *next_rq, *prev; unsigned long old_wr_coeff = bfqq->wr_coeff; - bool idle_for_long_time = false; + bool interactive = false; bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -2600,9 +2722,16 @@ static void bfq_add_request(struct request *rq) next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); bfqq->next_rq = next_rq; + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_pos_tree_add_move(bfqd, bfqq); + if (!bfq_bfqq_busy(bfqq)) { int soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && - time_is_before_jiffies(bfqq->soft_rt_next_start); + bfq_bfqq_cooperations(bfqq) < bfqd->bfq_coop_thresh && + time_is_before_jiffies(bfqq->soft_rt_next_start), idle_for_long_time = time_is_before_jiffies( bfqq->budget_timeout + @@ -2613,6 +2742,9 @@ static void bfq_add_request(struct request *rq) rq->cmd_flags); #endif + interactive = idle_for_long_time && + bfq_bfqq_cooperations(bfqq) < + bfqd->bfq_coop_thresh; entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); @@ -2631,13 +2763,22 @@ static void bfq_add_request(struct request *rq) if (!bfqd->low_latency) goto add_bfqq_busy; + if (bfq_bfqq_just_split(bfqq)) + goto set_prio_changed; + /* - * If the queue is not being boosted and has been idle for - * enough time, start a weight-raising period. + * If the queue: + * - is not being boosted, + * - has been idle for enough time, + * - is not a sync queue or is linked to a bfq_io_cq (it is + * shared "for its nature" or it is not shared and its + * requests have not been redirected to a shared queue) + * start a weight-raising period. */ - if (old_wr_coeff == 1 && (idle_for_long_time || soft_rt)) { + if (old_wr_coeff == 1 && (interactive || soft_rt) && + (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { bfqq->wr_coeff = bfqd->bfq_wr_coeff; - if (idle_for_long_time) + if (interactive) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); else bfqq->wr_cur_max_time = @@ -2647,11 +2788,13 @@ static void bfq_add_request(struct request *rq) jiffies, jiffies_to_msecs(bfqq->wr_cur_max_time)); } else if (old_wr_coeff > 1) { - if (idle_for_long_time) + if (interactive) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else if (bfqq->wr_cur_max_time == + else if (bfq_bfqq_cooperations(bfqq) >= + bfqd->bfq_coop_thresh || + (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - !soft_rt) { + !soft_rt)) { bfqq->wr_coeff = 1; bfq_log_bfqq(bfqd, bfqq, "wrais ending at %lu, rais_max_time %u", @@ -2713,6 +2856,7 @@ static void bfq_add_request(struct request *rq) bfqd->bfq_wr_rt_max_time; } } +set_prio_changed: if (old_wr_coeff != bfqq->wr_coeff) entity->prio_changed = 1; add_bfqq_busy: @@ -2740,8 +2884,7 @@ add_bfqq_busy: } if (bfqd->low_latency && - (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || - idle_for_long_time)) + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) bfqq->last_wr_start_finish = jiffies; } @@ -2800,6 +2943,13 @@ static void bfq_remove_request(struct request *rq) if (RB_EMPTY_ROOT(&bfqq->sort_list)) { if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } } if (rq->cmd_flags & REQ_META) @@ -2846,11 +2996,14 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, bfqd->last_position); bfqq->next_rq = next_rq; /* - * If next_rq changes, update the queue's budget to fit - * the new request. + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. */ - if (prev != bfqq->next_rq) + if (prev != bfqq->next_rq) { bfq_updated_next_req(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); + } } } @@ -2932,12 +3085,342 @@ static void bfq_end_wr(struct bfq_data *bfqd) spin_unlock_irq(bfqd->queue->queue_lock); } +static sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_iter.bi_sector; +} + +static int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return abs(bfq_io_struct_pos(io_struct, request) - sector) <= + BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + sector_t sector) +{ + struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (!node) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + /* + * We shall notice if some of the queues are cooperating, + * e.g., working closely on the same area of the device. In + * that case, we can group them together and: 1) don't waste + * time idling, and 2) serve the union of their requests in + * the best possible order for throughput. + */ + bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); + if (!bfqq || bfqq == cur_bfqq) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. + * + * NOTE, even if new_bfqq coincides with the in-service queue, the + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. + * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. + */ + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); + return new_bfqq; +} + +static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + struct bfq_queue *new_bfqq) +{ + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || + (bfqq->ioprio_class != new_bfqq->ioprio_class)) + return false; + + /* + * If either of the queues has already been detected as seeky, + * then merging it with the other queue is unlikely to lead to + * sequential I/O. + */ + if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) + return false; + + /* + * Interleaved I/O is known to be done by (some) applications + * only for reads, so it does not make sense to merge async + * queues. + */ + if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) + return false; + + return true; +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service queue + * or with a close queue among the scheduled queues. + * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (!in_service_bfqq || in_service_bfqq == bfqq || + !bfqd->in_service_bic || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfqq->entity.parent == in_service_bfqq->entity.parent && + bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq) + return new_bfqq; + } + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +check_scheduled: + new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static void bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + /* + * If !bfqq->bic, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (!bfqq->bic) + return; + if (bfqq->bic->wr_time_left) + /* + * This is the queue of a just-started process, and would + * deserve weight raising: we set wr_time_left to the full + * weight-raising duration to trigger weight-raising when + * and if the queue is split and the first request of the + * queue is enqueued. + */ + bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); + else if (bfqq->wr_coeff > 1) { + unsigned long wr_duration = + jiffies - bfqq->last_wr_start_finish; + /* + * It may happen that a queue's weight raising period lasts + * longer than its wr_cur_max_time, as weight raising is + * handled only when a request is enqueued or dispatched (it + * does not use any timer). If the weight raising period is + * about to end, don't save it. + */ + if (bfqq->wr_cur_max_time <= wr_duration) + bfqq->bic->wr_time_left = 0; + else + bfqq->bic->wr_time_left = + bfqq->wr_cur_max_time - wr_duration; + /* + * The bfq_queue is becoming shared or the requests of the + * process owning the queue are being redirected to a shared + * queue. Stop the weight raising period of the queue, as in + * both cases it should not be owned by an interactive or + * soft real-time application. + */ + bfq_bfqq_end_wr(bfqq); + } else + bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->cooperations++; + bfqq->bic->failed_cooperations = 0; +} + +static void bfq_get_bic_reference(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic has a non-NULL value, the bic to which it belongs + * is about to begin using a shared bfq_queue. + */ + if (bfqq->bic) + atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (unsigned long)new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; + bfq_put_queue(bfqq); +} + +static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + struct bfq_data *bfqd = bfqq->bfqd; + + if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { + bic->failed_cooperations++; + if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) + bic->cooperations = 0; + } +} + static int bfq_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_io_cq *bic; - struct bfq_queue *bfqq; + struct bfq_queue *bfqq, *new_bfqq; /* * Disallow merge of a sync bio into an async request. @@ -2955,6 +3438,23 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, return 0; bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + if (bfqq) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq) { + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + /* + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. + */ + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } return bfqq == RQ_BFQQ(rq); } @@ -3150,6 +3650,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { __bfq_bfqd_reset_in_service(bfqd); + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { /* * Overloading budget_timeout field to store the time @@ -3158,8 +3667,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) */ bfqq->budget_timeout = jiffies; bfq_del_bfqq_busy(bfqd, bfqq, 1); - } else + } else { bfq_activate_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_pos_tree_add_move(bfqd, bfqq); + } } /** @@ -3940,10 +4454,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * If too much time has elapsed from the beginning of - * this weight-raising period, then end weight - * raising. + * this weight-raising period, or the queue has + * exceeded the acceptable number of cooperations, + * then end weight raising. */ - if (time_is_before_jiffies(bfqq->last_wr_start_finish + + if (bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time)) { bfqq->last_wr_start_finish = jiffies; bfq_log_bfqq(bfqd, bfqq, @@ -4146,6 +4662,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq) #endif } +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq == bfqd->in_service_queue) { @@ -4156,12 +4691,35 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, atomic_read(&bfqq->ref)); + bfq_put_cooperator(bfqq); + bfq_put_queue(bfqq); } static void bfq_init_icq(struct io_cq *icq) { - icq_to_bic(icq)->ttime.last_end_request = jiffies; + struct bfq_io_cq *bic = icq_to_bic(icq); + + bic->ttime.last_end_request = jiffies; + /* + * A newly created bic indicates that the process has just + * started doing I/O, and is probably mapping into memory its + * executable and libraries: it definitely needs weight raising. + * There is however the possibility that the process performs, + * for a while, I/O close to some other process. EQM intercepts + * this behavior and may merge the queue corresponding to the + * process with some other queue, BEFORE the weight of the queue + * is raised. Merged queues are not weight-raised (they are assumed + * to belong to processes that benefit only from high throughput). + * If the merge is basically the consequence of an accident, then + * the queue will be split soon and will get back its old weight. + * It is then important to write down somewhere that this queue + * does need weight raising, even if it did not make it to get its + * weight raised before being merged. To this purpose, we overload + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. + */ + bic->wr_time_left = 1; } static void bfq_exit_icq(struct io_cq *icq) @@ -4175,6 +4733,13 @@ static void bfq_exit_icq(struct io_cq *icq) } if (bic->bfqq[BLK_RW_SYNC]) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. + */ + if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + put_io_context(icq->ioc); bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); bic->bfqq[BLK_RW_SYNC] = NULL; } @@ -4483,6 +5048,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) return; + /* Idle window just restored, statistics are meaningless. */ + if (bfq_bfqq_just_split(bfqq)) + return; + enable_idle = bfq_bfqq_idle_window(bfqq); if (atomic_read(&bic->icq.ioc->active_ref) == 0 || @@ -4525,6 +5094,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || !BFQQ_SEEKY(bfqq)) bfq_update_idle_window(bfqd, bfqq, bic); + bfq_clear_bfqq_just_split(bfqq); bfq_log_bfqq(bfqd, bfqq, "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", @@ -4589,12 +5159,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_insert_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; assert_spin_locked(bfqd->queue->queue_lock); + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ + if (!in_interrupt()) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + if (new_bfqq) { + if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) + new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. + */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + atomic_inc(&new_bfqq->ref); + bfq_put_queue(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + bfq_add_request(rq); + /* + * Here a newly-created bfq_queue has already started a weight-raising + * period: clear raising_time_left to prevent bfq_bfqq_save_state() + * from assigning it a full weight-raising period. See the detailed + * comments about this field in bfq_init_icq(). + */ + if (bfqq->bic) + bfqq->bic->wr_time_left = 0; rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); @@ -4746,6 +5351,32 @@ static void bfq_put_request(struct request *rq) } /* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to said bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + put_io_context(bic->icq.ioc); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* * Allocate bfq data structures associated with this request. */ static int bfq_set_request(struct request_queue *q, struct request *rq, @@ -4757,6 +5388,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; unsigned long flags; + bool split = false; might_sleep_if(gfpflags_allow_blocking(gfp_mask)); @@ -4769,10 +5401,20 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, bfq_bic_update_cgroup(bic, bio); +new_queue: bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); bic_set_bfqq(bic, bfqq, is_sync); + } else { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + if (!bfqq) + goto new_queue; + } } bfqq->allocated[rw]++; @@ -4783,6 +5425,26 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; + /* + * If a bfq_queue has only one process reference, it is owned + * by only one bfq_io_cq: we can set the bic field of the + * bfq_queue to the address of that structure. Also, if the + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + bfq_mark_bfqq_just_split(bfqq); + /* + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. + */ + bfq_bfqq_resume_state(bfqq, bic); + } + } + spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -4935,6 +5597,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, root_group->my_entity = NULL; root_group->bfqd = bfqd; #endif + root_group->rq_pos_tree = RB_ROOT; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; } @@ -5008,6 +5671,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + bfqd->bfq_coop_thresh = 2; + bfqd->bfq_failed_cooperations = 7000; bfqd->bfq_requests_within_timer = 120; bfqd->low_latency = true; @@ -5400,7 +6065,7 @@ static int __init bfq_init(void) if (ret) goto err_pol_unreg; - pr_info("BFQ I/O-scheduler: v2"); + pr_info("BFQ I/O-scheduler: v6"); return 0;

[RFC,18/22] block, bfq: add Early Queue Merge (EQM)

Commit Message

Patch