@@ -200,6 +200,7 @@ struct bfq_group;
* @dispatched: number of requests on the dispatch list or inside driver.
* @flags: status flags.
* @bfqq_list: node for active/idle bfqq list inside our bfqd.
+ * @burst_list_node: node for the device's burst list.
* @seek_samples: number of seeks sampled
* @seek_total: sum of the distances of the seeks sampled
* @seek_mean: mean seek distance
@@ -266,6 +267,8 @@ struct bfq_queue {
struct list_head bfqq_list;
+ struct hlist_node burst_list_node;
+
unsigned int seek_samples;
u64 seek_total;
sector_t seek_mean;
@@ -317,6 +320,11 @@ struct bfq_ttime {
* window
* @saved_IO_bound: same purpose as the previous two fields for the I/O
* bound classification of a queue
+ * @saved_in_large_burst: same purpose as the previous fields for the
+ * value of the field keeping the queue's belonging
+ * to a large burst
+ * @was_in_burst_list: true if the queue belonged to a burst list
+ * before its merge with another cooperating queue
* @cooperations: counter of consecutive successful queue merges underwent
* by any of the process' @bfq_queues
* @failed_cooperations: counter of consecutive failed queue merges of any
@@ -335,6 +343,9 @@ struct bfq_io_cq {
bool saved_idle_window;
bool saved_IO_bound;
+ bool saved_in_large_burst;
+ bool was_in_burst_list;
+
unsigned int cooperations;
unsigned int failed_cooperations;
};
@@ -441,6 +452,21 @@ enum bfq_device_speed {
* again idling to a queue which was marked as
* non-I/O-bound (see the definition of the
* IO_bound flag for further details).
+ * @last_ins_in_burst: last time at which a queue entered the current
+ * burst of queues being activated shortly after
+ * each other; for more details about this and the
+ * following parameters related to a burst of
+ * activations, see the comments to the function
+ * @bfq_handle_burst.
+ * @bfq_burst_interval: reference time interval used to decide whether a
+ * queue has been activated shortly after
+ * @last_ins_in_burst.
+ * @burst_size: number of queues in the current burst of queue activations.
+ * @bfq_large_burst_thresh: maximum burst size above which the current
+ * queue-activation burst is deemed as 'large'.
+ * @large_burst: true if a large queue-activation burst is in progress.
+ * @burst_list: head of the burst list (as for the above fields, more details
+ * in the comments to the function bfq_handle_burst).
* @low_latency: if set to true, low-latency heuristics are enabled.
* @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
* queue is multiplied.
@@ -518,6 +544,13 @@ struct bfq_data {
unsigned int bfq_failed_cooperations;
unsigned int bfq_requests_within_timer;
+ unsigned long last_ins_in_burst;
+ unsigned long bfq_burst_interval;
+ int burst_size;
+ unsigned long bfq_large_burst_thresh;
+ bool large_burst;
+ struct hlist_head burst_list;
+
bool low_latency;
/* parameters of the low_latency heuristics */
@@ -546,6 +579,10 @@ enum bfqq_state_flags {
* having consumed at most 2/10 of
* its budget
*/
+ BFQ_BFQQ_FLAG_in_large_burst, /*
+ * bfqq activated in a large burst,
+ * see comments to bfq_handle_burst.
+ */
BFQ_BFQQ_FLAG_constantly_seeky, /*
* bfqq has proved to be slow and
* seeky until budget timeout
@@ -581,6 +618,7 @@ BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(budget_new);
BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(constantly_seeky);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
@@ -40,7 +40,9 @@
* real-time. This feature enables BFQ to provide applications in
* these classes with a very low latency. Finally, BFQ also features
* additional heuristics for preserving both a low latency and a high
- * throughput on NCQ-capable, rotational or flash-based devices.
+ * throughput on NCQ-capable, rotational or flash-based devices, and
+ * to get the job done quickly for applications consisting in many
+ * I/O-bound processes.
*
* With respect to the version of BFQ presented in [1], and in the
* papers cited therein, this implementation adds a hierarchical
@@ -2879,7 +2881,9 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
bfq_mark_bfqq_IO_bound(bfqq);
else
bfq_clear_bfqq_IO_bound(bfqq);
+ /* Assuming that the flag in_large_burst is already correctly set */
if (bic->wr_time_left && bfqq->bfqd->low_latency &&
+ !bfq_bfqq_in_large_burst(bfqq) &&
bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
/*
* Start a weight raising period with the duration given by
@@ -2912,6 +2916,219 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)
return process_refs;
}
+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_queue *item;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+ hlist_del_init(&item->burst_list_node);
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+ bfqd->burst_size = 1;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /* Increment burst size to take into account also bfqq */
+ bfqd->burst_size++;
+
+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+ struct bfq_queue *pos, *bfqq_item;
+ struct hlist_node *n;
+
+ /*
+ * Enough queues have been activated shortly after each
+ * other to consider this burst as large.
+ */
+ bfqd->large_burst = true;
+
+ /*
+ * We can now mark all queues in the burst list as
+ * belonging to a large burst.
+ */
+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+ burst_list_node)
+ bfq_mark_bfqq_in_large_burst(bfqq_item);
+ bfq_mark_bfqq_in_large_burst(bfqq);
+
+ /*
+ * From now on, and until the current burst finishes, any
+ * new queue being activated shortly after the last queue
+ * was inserted in the burst can be immediately marked as
+ * belonging to a large burst. So the burst list is not
+ * needed any more. Remove it.
+ */
+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+ burst_list_node)
+ hlist_del_init(&pos->burst_list_node);
+ } else /* burst not yet large: add bfqq to the burst list */
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues happen to become active shortly after each other, then,
+ * to help the processes associated to these queues get their job done as
+ * soon as possible, it is usually better to not grant either weight-raising
+ * or device idling to these queues. In this comment we describe, firstly,
+ * the reasons why this fact holds, and, secondly, the next function, which
+ * implements the main steps needed to properly mark these queues so that
+ * they can then be treated in a different way.
+ *
+ * As for the terminology, we say that a queue becomes active, i.e.,
+ * switches from idle to backlogged, either when it is created (as a
+ * consequence of the arrival of an I/O request), or, if already existing,
+ * when a new request for the queue arrives while the queue is idle.
+ * Bursts of activations, i.e., activations of different queues occurring
+ * shortly after each other, are typically caused by services or applications
+ * that spawn or reactivate many parallel threads/processes. Examples are
+ * systemd during boot or git grep.
+ *
+ * These services or applications benefit mostly from a high throughput:
+ * the quicker the requests of the activated queues are cumulatively served,
+ * the sooner the target job of these queues gets completed. As a consequence,
+ * weight-raising any of these queues, which also implies idling the device
+ * for it, is almost always counterproductive: in most cases it just lowers
+ * throughput.
+ *
+ * On the other hand, a burst of activations may be also caused by the start
+ * of an application that does not consist in a lot of parallel I/O-bound
+ * threads. In fact, with a complex application, the burst may be just a
+ * consequence of the fact that several processes need to be executed to
+ * start-up the application. To start an application as quickly as possible,
+ * the best thing to do is to privilege the I/O related to the application
+ * with respect to all other I/O. Therefore, the best strategy to start as
+ * quickly as possible an application that causes a burst of activations is
+ * to weight-raise all the queues activated during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the two
+ * types of bursts need to be distinguished. Fortunately, this seems
+ * relatively easy to do, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that bursts with a larger size
+ * than that threshold are apparently caused only by services or commands
+ * such as systemd or git grep. For brevity, hereafter we call just 'large'
+ * these bursts. BFQ *does not* weight-raise queues whose activations occur
+ * in a large burst. In addition, for each of these queues BFQ performs or
+ * does not perform idling depending on which choice boosts the throughput
+ * most. The exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Turning back to the next function, it implements all the steps needed
+ * to detect the occurrence of a large burst and to properly mark all the
+ * queues belonging to it (so that they can then be treated in a different
+ * way). This goal is achieved by maintaining a special "burst list" that
+ * holds, temporarily, the queues that belong to the burst in progress. The
+ * list is then used to mark these queues as belonging to a large burst if
+ * the burst does become large. The main steps are the following.
+ *
+ * . when the very first queue is activated, the queue is inserted into the
+ * list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ * not yet belong to the burst is activated shortly after the last time
+ * at which a new queue entered the burst list, then the function appends
+ * Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ * the large-burst threshold, then
+ *
+ * . all the queues in the burst list are marked as belonging to a
+ * large burst
+ *
+ * . the burst list is deleted; in fact, the burst list already served
+ * its purpose (keeping temporarily track of the queues in a burst,
+ * so as to be able to mark them as belonging to a large burst in the
+ * previous sub-step), and now is not needed any more
+ *
+ * . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is activated while
+ * the device is in large-burst mode and shortly after the last time
+ * at which a queue either entered the burst list or was marked as
+ * belonging to the current large burst, then Q is immediately marked
+ * as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is activated a while
+ * later, i.e., not shortly after, than the last time at which a queue
+ * either entered the burst list or was marked as belonging to the
+ * current large burst, then the current burst is deemed as finished and:
+ *
+ * . the large-burst mode is reset if set
+ *
+ * . the burst list is emptied
+ *
+ * . Q is inserted in the burst list, as Q may be the first queue
+ * in a possible new burst (then the burst list contains just Q
+ * after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool idle_for_long_time)
+{
+ /*
+ * If bfqq happened to be activated in a burst, but has been idle
+ * for at least as long as an interactive queue, then we assume
+ * that, in the overall I/O initiated in the burst, the I/O
+ * associated to bfqq is finished. So bfqq does not need to be
+ * treated as a queue belonging to a burst anymore. Accordingly,
+ * we reset bfqq's in_large_burst flag if set, and remove bfqq
+ * from the burst list if it's there. We do not decrement instead
+ * burst_size, because the fact that bfqq does not need to belong
+ * to the burst list any more does not invalidate the fact that
+ * bfqq may have been activated during the current burst.
+ */
+ if (idle_for_long_time) {
+ hlist_del_init(&bfqq->burst_list_node);
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ }
+
+ /*
+ * If bfqq is already in the burst list or is part of a large
+ * burst, then there is nothing else to do.
+ */
+ if (!hlist_unhashed(&bfqq->burst_list_node) ||
+ bfq_bfqq_in_large_burst(bfqq))
+ return;
+
+ /*
+ * If bfqq's activation happens late enough, then the current
+ * burst is finished, and related data structures must be reset.
+ *
+ * In this respect, consider the special case where bfqq is the very
+ * first queue being activated. In this case, last_ins_in_burst is
+ * not yet significant when we get here. But it is easy to verify
+ * that, whether or not the following condition is true, bfqq will
+ * end up being inserted into the burst list. In particular the
+ * list will happen to contain only bfqq. And this is exactly what
+ * has to happen, as bfqq may be the first queue in a possible
+ * burst.
+ */
+ if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+ bfqd->bfq_burst_interval)) {
+ bfqd->large_burst = false;
+ bfq_reset_burst_list(bfqd, bfqq);
+ return;
+ }
+
+ /*
+ * If we get here, then bfqq is being activated shortly after the
+ * last queue. So, if the current burst is also large, we can mark
+ * bfqq as belonging to this large burst immediately.
+ */
+ if (bfqd->large_burst) {
+ bfq_mark_bfqq_in_large_burst(bfqq);
+ return;
+ }
+
+ /*
+ * If we get here, then a large-burst state has not yet been
+ * reached, but bfqq is being activated shortly after the last
+ * queue. Then we add bfqq to the burst.
+ */
+ bfq_add_to_burst(bfqd, bfqq);
+}
+
static void bfq_add_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -2941,22 +3158,40 @@ static void bfq_add_request(struct request *rq)
bfq_pos_tree_add_move(bfqd, bfqq);
if (!bfq_bfqq_busy(bfqq)) {
- int soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
- bfq_bfqq_cooperations(bfqq) < bfqd->bfq_coop_thresh &&
- time_is_before_jiffies(bfqq->soft_rt_next_start),
- idle_for_long_time =
- time_is_before_jiffies(
- bfqq->budget_timeout +
- bfqd->bfq_wr_min_idle_time);
+ bool soft_rt, coop_or_in_burst,
+ idle_for_long_time = time_is_before_jiffies(
+ bfqq->budget_timeout +
+ bfqd->bfq_wr_min_idle_time);
#ifdef CONFIG_CFQ_GROUP_IOSCHED
bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
rq->cmd_flags);
#endif
+ if (bfq_bfqq_sync(bfqq)) {
+ bool already_in_burst =
+ !hlist_unhashed(&bfqq->burst_list_node) ||
+ bfq_bfqq_in_large_burst(bfqq);
+ bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
+ /*
+ * If bfqq was not already in the current burst,
+ * then, at this point, bfqq either has been
+ * added to the current burst or has caused the
+ * current burst to terminate. In particular, in
+ * the second case, bfqq has become the first
+ * queue in a possible new burst.
+ * In both cases last_ins_in_burst needs to be
+ * moved forward.
+ */
+ if (!already_in_burst)
+ bfqd->last_ins_in_burst = jiffies;
+ }
- interactive = idle_for_long_time &&
- bfq_bfqq_cooperations(bfqq) <
- bfqd->bfq_coop_thresh;
+ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
+ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+ !coop_or_in_burst &&
+ time_is_before_jiffies(bfqq->soft_rt_next_start);
+ interactive = !coop_or_in_burst && idle_for_long_time;
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(next_rq, bfqq));
@@ -3002,8 +3237,7 @@ static void bfq_add_request(struct request *rq)
} else if (old_wr_coeff > 1) {
if (interactive)
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
- else if (bfq_bfqq_cooperations(bfqq) >=
- bfqd->bfq_coop_thresh ||
+ else if (coop_or_in_burst ||
(bfqq->wr_cur_max_time ==
bfqd->bfq_wr_rt_max_time &&
!soft_rt)) {
@@ -3563,6 +3797,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
bfqq->bic->wr_time_left = 0;
bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
bfqq->bic->cooperations++;
bfqq->bic->failed_cooperations = 0;
}
@@ -4620,11 +4856,22 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
!bfq_symmetric_scenario(bfqd);
/*
- * Combining the two cases above, we can now establish when
- * idling is actually needed to preserve service guarantees.
+ * Finally, there is a case where maximizing throughput is the
+ * best choice even if it may cause unfairness toward
+ * bfqq. Such a case is when bfqq became active in a burst of
+ * queue activations. Queues that became active during a large
+ * burst benefit only from throughput, as discussed in the
+ * comments to bfq_handle_burst. Thus, if bfqq became active
+ * in a burst and not idling the device maximizes throughput,
+ * then the device must no be idled, because not idling the
+ * device provides bfqq and all other queues in the burst with
+ * maximum benefit. Combining this and the two cases above, we
+ * can now establish when idling is actually needed to
+ * preserve service guarantees.
*/
idling_needed_for_service_guarantees =
- (on_hdd_and_not_all_queues_seeky || asymmetric_scenario);
+ (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
+ !bfq_bfqq_in_large_burst(bfqq);
/*
* We have now all the components we need to compute the return
@@ -4757,12 +5004,14 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
/*
- * If too much time has elapsed from the beginning of
- * this weight-raising period, or the queue has
+ * If the queue was activated in a burst, or
+ * too much time has elapsed from the beginning
+ * of this weight-raising period, or the queue has
* exceeded the acceptable number of cooperations,
* then end weight raising.
*/
- if (bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
+ if (bfq_bfqq_in_large_burst(bfqq) ||
+ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
time_is_before_jiffies(bfqq->last_wr_start_finish +
bfqq->wr_cur_max_time)) {
bfqq->last_wr_start_finish = jiffies;
@@ -4958,6 +5207,17 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
if (!atomic_dec_and_test(&bfqq->ref))
return;
+ if (bfq_bfqq_sync(bfqq))
+ /*
+ * The fact that this queue is being destroyed does not
+ * invalidate the fact that this queue may have been
+ * activated during the current burst. As a consequence,
+ * although the queue does not exist anymore, and hence
+ * needs to be removed from the burst list if there,
+ * the burst size has not to be decremented.
+ */
+ hlist_del_init(&bfqq->burst_list_node);
+
bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
kmem_cache_free(bfq_pool, bfqq);
@@ -5143,6 +5403,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
RB_CLEAR_NODE(&bfqq->entity.rb_node);
INIT_LIST_HEAD(&bfqq->fifo);
+ INIT_HLIST_NODE(&bfqq->burst_list_node);
atomic_set(&bfqq->ref, 0);
bfqq->bfqd = bfqd;
@@ -5723,6 +5984,17 @@ new_queue:
if (!bfqq || bfqq == &bfqd->oom_bfqq) {
bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);
bic_set_bfqq(bic, bfqq, is_sync);
+ if (split && is_sync) {
+ if ((bic->was_in_burst_list && bfqd->large_burst) ||
+ bic->saved_in_large_burst)
+ bfq_mark_bfqq_in_large_burst(bfqq);
+ else {
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ if (bic->was_in_burst_list)
+ hlist_add_head(&bfqq->burst_list_node,
+ &bfqd->burst_list);
+ }
+ }
} else {
/* If the queue was seeky for too long, break it apart. */
if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
@@ -5979,6 +6251,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
INIT_LIST_HEAD(&bfqd->active_list);
INIT_LIST_HEAD(&bfqd->idle_list);
+ INIT_HLIST_HEAD(&bfqd->burst_list);
bfqd->hw_tag = -1;
@@ -5998,6 +6271,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->bfq_failed_cooperations = 7000;
bfqd->bfq_requests_within_timer = 120;
+ bfqd->bfq_large_burst_thresh = 11;
+ bfqd->bfq_burst_interval = msecs_to_jiffies(500);
+
bfqd->low_latency = true;
bfqd->bfq_wr_coeff = 20;
@@ -6390,7 +6666,7 @@ static int __init bfq_init(void)
if (ret)
goto err_pol_unreg;
- pr_info("BFQ I/O-scheduler: v7r3");
+ pr_info("BFQ I/O-scheduler: v7r10");
return 0;