diff mbox series

[RFC,4/5] kyber: implement improved heuristics

Message ID 4440c3f4f1e58f6790cbae57cc138b77cbad84b8.1533846185.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show
Series kyber: better heuristics | expand

Commit Message

Omar Sandoval Aug. 9, 2018, 8:26 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

Kyber's current heuristics have a few flaws:

- It's based on the mean latency, but p99 latency tends to be more
  meaningful to anyone who cares about latency. The mean can also be
  skewed by rare outliers that the scheduler can't do anything about.
- The statistics calculations are purely time-based with a short window.
  This works for steady, high load, but is more sensitive to outliers
  with bursty workloads.
- It only considers the latency once an I/O has been submitted to the
  device, but the user cares about the time spent in the kernel, as
  well.

These are shortcomings of the generic blk-stat code which doesn't quite
fit the ideal use case for Kyber. So, this replaces the statistics with
a histogram used to calculate percentiles of total latency and I/O
latency, which we then use to adjust depths in a slightly more
intelligent manner:

- Sync and async writes are now the same domain.
- Discards are a separate domain.
- Domain queue depths are scaled by the ratio of the p99 total latency
  to the target latency (e.g., if the p99 latency is double the target
  latency, we will double the queue depth; if the p99 latency is half of
  the target latency, we can halve the queue depth).
- We use the I/O latency to determine whether we should scale queue
  depths down: we will only scale down if any domain's I/O latency
  exceeds the target latency, which is an indicator of congestion in the
  device.

These new heuristics are just as scalable as the heuristics they
replace.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/kyber-iosched.c | 497 ++++++++++++++++++++++++------------------
 1 file changed, 279 insertions(+), 218 deletions(-)
diff mbox series

Patch

diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 08eb5295c18d..adc8e6393829 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -29,13 +29,16 @@ 
 #include "blk-mq-debugfs.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
-#include "blk-stat.h"
 
-/* Scheduling domains. */
+/*
+ * Scheduling domains: the device is divided into multiple domains based on the
+ * request type.
+ */
 enum {
 	KYBER_READ,
-	KYBER_SYNC_WRITE,
-	KYBER_OTHER, /* Async writes, discard, etc. */
+	KYBER_WRITE,
+	KYBER_DISCARD,
+	KYBER_OTHER,
 	KYBER_NUM_DOMAINS,
 };
 
@@ -49,25 +52,82 @@  enum {
 };
 
 /*
- * Initial device-wide depths for each scheduling domain.
+ * Maximum device-wide depth for each scheduling domain.
  *
- * Even for fast devices with lots of tags like NVMe, you can saturate
- * the device with only a fraction of the maximum possible queue depth.
- * So, we cap these to a reasonable value.
+ * Even for fast devices with lots of tags like NVMe, you can saturate the
+ * device with only a fraction of the maximum possible queue depth. So, we cap
+ * these to a reasonable value.
  */
 static const unsigned int kyber_depth[] = {
 	[KYBER_READ] = 256,
-	[KYBER_SYNC_WRITE] = 128,
-	[KYBER_OTHER] = 64,
+	[KYBER_WRITE] = 128,
+	[KYBER_DISCARD] = 64,
+	[KYBER_OTHER] = 16,
 };
 
 /*
- * Scheduling domain batch sizes. We favor reads.
+ * Default latency targets for each scheduling domain.
+ */
+static const u64 kyber_latency_targets[] = {
+	[KYBER_READ] = 2 * NSEC_PER_MSEC,
+	[KYBER_WRITE] = 10 * NSEC_PER_MSEC,
+	[KYBER_DISCARD] = 5 * NSEC_PER_SEC,
+};
+
+/*
+ * Batch size (number of requests we'll dispatch in a row) for each scheduling
+ * domain.
  */
 static const unsigned int kyber_batch_size[] = {
 	[KYBER_READ] = 16,
-	[KYBER_SYNC_WRITE] = 8,
-	[KYBER_OTHER] = 8,
+	[KYBER_WRITE] = 8,
+	[KYBER_DISCARD] = 1,
+	[KYBER_OTHER] = 1,
+};
+
+/*
+ * Requests latencies are recorded in a histogram with buckets defined relative
+ * to the target latency:
+ *
+ * <= 1/4 * target latency
+ * <= 1/2 * target latency
+ * <= 3/4 * target latency
+ * <= target latency
+ * <= 1 1/4 * target latency
+ * <= 1 1/2 * target latency
+ * <= 1 3/4 * target latency
+ * > 1 3/4 * target latency
+ */
+enum {
+	/*
+	 * The width of the latency histogram buckets is
+	 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
+	 */
+	KYBER_LATENCY_SHIFT = 2,
+	/*
+	 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
+	 * thus, "good".
+	 */
+	KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
+	/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
+	KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
+};
+
+/*
+ * We measure both the total latency and the I/O latency (i.e., latency after
+ * submitting to the device).
+ */
+enum {
+	KYBER_TOTAL_LATENCY,
+	KYBER_IO_LATENCY,
+};
+
+/*
+ * Per-cpu latency histograms: total latency and I/O latency for each scheduling
+ * domain except for KYBER_OTHER.
+ */
+struct kyber_cpu_latency {
+	atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
 };
 
 /*
@@ -84,14 +144,9 @@  struct kyber_ctx_queue {
 } ____cacheline_aligned_in_smp;
 
 struct kyber_queue_data {
-	struct request_queue *q;
-
-	struct blk_stat_callback *cb;
-
 	/*
-	 * The device is divided into multiple scheduling domains based on the
-	 * request type. Each domain has a fixed number of in-flight requests of
-	 * that type device-wide, limited by these tokens.
+	 * Each scheduling domain has a limited number of in-flight requests
+	 * device-wide, limited by these tokens.
 	 */
 	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
 
@@ -101,8 +156,19 @@  struct kyber_queue_data {
 	 */
 	unsigned int async_depth;
 
+	struct kyber_cpu_latency __percpu *cpu_latency;
+
+	/* Timer for stats aggregation and adjusting domain tokens. */
+	struct timer_list timer;
+
+	unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
+
+	unsigned long latency_timeout[KYBER_OTHER];
+
+	int domain_p99[KYBER_OTHER];
+
 	/* Target latencies in nanoseconds. */
-	u64 read_lat_nsec, write_lat_nsec;
+	u64 latency_targets[KYBER_OTHER];
 };
 
 struct kyber_hctx_data {
@@ -122,182 +188,165 @@  static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 
 static unsigned int kyber_sched_domain(unsigned int op)
 {
-	if ((op & REQ_OP_MASK) == REQ_OP_READ)
+	switch (op & REQ_OP_MASK) {
+	case REQ_OP_READ:
 		return KYBER_READ;
-	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
-		return KYBER_SYNC_WRITE;
-	else
+	case REQ_OP_WRITE:
+		return KYBER_WRITE;
+	case REQ_OP_DISCARD:
+		return KYBER_DISCARD;
+	default:
 		return KYBER_OTHER;
+	}
 }
 
-enum {
-	NONE = 0,
-	GOOD = 1,
-	GREAT = 2,
-	BAD = -1,
-	AWFUL = -2,
-};
-
-#define IS_GOOD(status) ((status) > 0)
-#define IS_BAD(status) ((status) < 0)
-
-static int kyber_lat_status(struct blk_stat_callback *cb,
-			    unsigned int sched_domain, u64 target)
+static void flush_latency_buckets(struct kyber_queue_data *kqd,
+				  struct kyber_cpu_latency *cpu_latency,
+				  unsigned int sched_domain, unsigned int type)
 {
-	u64 latency;
-
-	if (!cb->stat[sched_domain].nr_samples)
-		return NONE;
+	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+	atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
+	unsigned int bucket;
 
-	latency = cb->stat[sched_domain].mean;
-	if (latency >= 2 * target)
-		return AWFUL;
-	else if (latency > target)
-		return BAD;
-	else if (latency <= target / 2)
-		return GREAT;
-	else /* (latency <= target) */
-		return GOOD;
+	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+		buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
 }
 
 /*
- * Adjust the read or synchronous write depth given the status of reads and
- * writes. The goal is that the latencies of the two domains are fair (i.e., if
- * one is good, then the other is good).
+ * Calculate the histogram bucket with the given percentile rank, or -1 if there
+ * aren't enough samples yet.
  */
-static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
-				  unsigned int sched_domain, int this_status,
-				  int other_status)
+static int calculate_percentile(struct kyber_queue_data *kqd,
+				unsigned int sched_domain, unsigned int type,
+				unsigned int percentile)
 {
-	unsigned int orig_depth, depth;
+	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+	unsigned int bucket, samples = 0, percentile_samples;
+
+	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+		samples += buckets[bucket];
+
+	if (!samples)
+		return -1;
 
 	/*
-	 * If this domain had no samples, or reads and writes are both good or
-	 * both bad, don't adjust the depth.
+	 * We do the calculation once we have 500 samples or one second passes
+	 * since the first sample was recorded, whichever comes first.
 	 */
-	if (this_status == NONE ||
-	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
-	    (IS_BAD(this_status) && IS_BAD(other_status)))
-		return;
-
-	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
+	if (!kqd->latency_timeout[sched_domain])
+		kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
+	if (samples < 500 &&
+	    time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
+		return -1;
+	}
+	kqd->latency_timeout[sched_domain] = 0;
 
-	if (other_status == NONE) {
-		depth++;
-	} else {
-		switch (this_status) {
-		case GOOD:
-			if (other_status == AWFUL)
-				depth -= max(depth / 4, 1U);
-			else
-				depth -= max(depth / 8, 1U);
-			break;
-		case GREAT:
-			if (other_status == AWFUL)
-				depth /= 2;
-			else
-				depth -= max(depth / 4, 1U);
+	percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
+	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
+		if (buckets[bucket] >= percentile_samples)
 			break;
-		case BAD:
-			depth++;
-			break;
-		case AWFUL:
-			if (other_status == GREAT)
-				depth += 2;
-			else
-				depth++;
-			break;
-		}
+		percentile_samples -= buckets[bucket];
 	}
+	memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
 
+	return bucket;
+}
+
+static void kyber_resize_domain(struct kyber_queue_data *kqd,
+				unsigned int sched_domain, unsigned int depth)
+{
 	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
-	if (depth != orig_depth)
+	if (depth != kqd->domain_tokens[sched_domain].sb.depth)
 		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 }
 
-/*
- * Adjust the depth of other requests given the status of reads and synchronous
- * writes. As long as either domain is doing fine, we don't throttle, but if
- * both domains are doing badly, we throttle heavily.
- */
-static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
-				     int read_status, int write_status,
-				     bool have_samples)
-{
-	unsigned int orig_depth, depth;
-	int status;
-
-	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
-
-	if (read_status == NONE && write_status == NONE) {
-		depth += 2;
-	} else if (have_samples) {
-		if (read_status == NONE)
-			status = write_status;
-		else if (write_status == NONE)
-			status = read_status;
-		else
-			status = max(read_status, write_status);
-		switch (status) {
-		case GREAT:
-			depth += 2;
-			break;
-		case GOOD:
-			depth++;
-			break;
-		case BAD:
-			depth -= max(depth / 4, 1U);
-			break;
-		case AWFUL:
-			depth /= 2;
-			break;
+static void kyber_timer_fn(struct timer_list *t)
+{
+	struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+	unsigned int sched_domain;
+	int cpu;
+	bool bad = false;
+
+	/* Sum all of the per-cpu latency histograms. */
+	for_each_online_cpu(cpu) {
+		struct kyber_cpu_latency *cpu_latency;
+
+		cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
+		for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+			flush_latency_buckets(kqd, cpu_latency, sched_domain,
+					      KYBER_TOTAL_LATENCY);
+			flush_latency_buckets(kqd, cpu_latency, sched_domain,
+					      KYBER_IO_LATENCY);
 		}
 	}
 
-	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
-	if (depth != orig_depth)
-		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
-}
-
-/*
- * Apply heuristics for limiting queue depths based on gathered latency
- * statistics.
- */
-static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
-{
-	struct kyber_queue_data *kqd = cb->data;
-	int read_status, write_status;
-
-	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
-	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
+	/*
+	 * Check if any domains have a high I/O latency, which might indicate
+	 * congestion in the device. Note that we use the p90; we don't want to
+	 * be too sensitive to outliers here.
+	 */
+	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+		int p90;
 
-	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
-	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
-	kyber_adjust_other_depth(kqd, read_status, write_status,
-				 cb->stat[KYBER_OTHER].nr_samples != 0);
+		p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
+					   90);
+		if (p90 >= KYBER_GOOD_BUCKETS)
+			bad = true;
+	}
 
 	/*
-	 * Continue monitoring latencies if we aren't hitting the targets or
-	 * we're still throttling other requests.
+	 * Adjust the scheduling domain depths. If we determined that there was
+	 * congestion, we throttle all domains with good latencies. Either way,
+	 * we ease up on throttling domains with bad latencies.
 	 */
-	if (!blk_stat_is_active(kqd->cb) &&
-	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
-	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
-		blk_stat_activate_msecs(kqd->cb, 100);
+	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+		unsigned int orig_depth, depth;
+		int p99;
+
+		p99 = calculate_percentile(kqd, sched_domain,
+					   KYBER_TOTAL_LATENCY, 99);
+		/*
+		 * This is kind of subtle: different domains will not
+		 * necessarily have enough samples to calculate the latency
+		 * percentiles during the same window, so we have to remember
+		 * the p99 for the next time we observe congestion; once we do,
+		 * we don't want to throttle again until we get more data, so we
+		 * reset it to -1.
+		 */
+		if (bad) {
+			if (p99 < 0)
+				p99 = kqd->domain_p99[sched_domain];
+			kqd->domain_p99[sched_domain] = -1;
+		} else if (p99 >= 0) {
+			kqd->domain_p99[sched_domain] = p99;
+		}
+		if (p99 < 0)
+			continue;
+
+		/*
+		 * If this domain has bad latency, throttle less. Otherwise,
+		 * throttle more iff we determined that there is congestion.
+		 *
+		 * The new depth is scaled linearly with the p99 latency vs the
+		 * latency target. E.g., if the p99 is 3/4 of the target, then
+		 * we throttle down to 3/4 of the current depth, and if the p99
+		 * is 2x the target, then we double the depth.
+		 */
+		if (bad || p99 >= KYBER_GOOD_BUCKETS) {
+			orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
+			depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
+			kyber_resize_domain(kqd, sched_domain, depth);
+		}
+	}
 }
 
-static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+static unsigned int kyber_sched_tags_shift(struct request_queue *q)
 {
 	/*
 	 * All of the hardware queues have the same depth, so we can just grab
 	 * the shift of the first one.
 	 */
-	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
-static int kyber_bucket_fn(const struct request *rq)
-{
-	return kyber_sched_domain(rq->cmd_flags);
+	return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 }
 
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
@@ -307,16 +356,17 @@  static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 	int ret = -ENOMEM;
 	int i;
 
-	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+	kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 	if (!kqd)
 		goto err;
-	kqd->q = q;
 
-	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
-					  KYBER_NUM_DOMAINS, kqd);
-	if (!kqd->cb)
+	kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
+					    GFP_KERNEL | __GFP_ZERO);
+	if (!kqd->cpu_latency)
 		goto err_kqd;
 
+	timer_setup(&kqd->timer, kyber_timer_fn, 0);
+
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		WARN_ON(!kyber_depth[i]);
 		WARN_ON(!kyber_batch_size[i]);
@@ -326,20 +376,22 @@  static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 		if (ret) {
 			while (--i >= 0)
 				sbitmap_queue_free(&kqd->domain_tokens[i]);
-			goto err_cb;
+			goto err_buckets;
 		}
 	}
 
-	shift = kyber_sched_tags_shift(kqd);
-	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+	for (i = 0; i < KYBER_OTHER; i++) {
+		kqd->domain_p99[i] = -1;
+		kqd->latency_targets[i] = kyber_latency_targets[i];
+	}
 
-	kqd->read_lat_nsec = 2000000ULL;
-	kqd->write_lat_nsec = 10000000ULL;
+	shift = kyber_sched_tags_shift(q);
+	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
 	return kqd;
 
-err_cb:
-	blk_stat_free_callback(kqd->cb);
+err_buckets:
+	free_percpu(kqd->cpu_latency);
 err_kqd:
 	kfree(kqd);
 err:
@@ -361,25 +413,24 @@  static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 		return PTR_ERR(kqd);
 	}
 
+	blk_stat_enable_accounting(q);
+
 	eq->elevator_data = kqd;
 	q->elevator = eq;
 
-	blk_stat_add_callback(q, kqd->cb);
-
 	return 0;
 }
 
 static void kyber_exit_sched(struct elevator_queue *e)
 {
 	struct kyber_queue_data *kqd = e->elevator_data;
-	struct request_queue *q = kqd->q;
 	int i;
 
-	blk_stat_remove_callback(q, kqd->cb);
+	del_timer_sync(&kqd->timer);
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 		sbitmap_queue_free(&kqd->domain_tokens[i]);
-	blk_stat_free_callback(kqd->cb);
+	free_percpu(kqd->cpu_latency);
 	kfree(kqd);
 }
 
@@ -547,40 +598,44 @@  static void kyber_finish_request(struct request *rq)
 	rq_clear_domain_token(kqd, rq);
 }
 
-static void kyber_completed_request(struct request *rq, u64 now)
+static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
+			       unsigned int sched_domain, unsigned int type,
+			       u64 target, u64 latency)
 {
-	struct request_queue *q = rq->q;
-	struct kyber_queue_data *kqd = q->elevator->elevator_data;
-	unsigned int sched_domain;
-	u64 latency, target;
+	unsigned int bucket;
+	u64 divisor;
 
-	/*
-	 * Check if this request met our latency goal. If not, quickly gather
-	 * some statistics and start throttling.
-	 */
-	sched_domain = kyber_sched_domain(rq->cmd_flags);
-	switch (sched_domain) {
-	case KYBER_READ:
-		target = kqd->read_lat_nsec;
-		break;
-	case KYBER_SYNC_WRITE:
-		target = kqd->write_lat_nsec;
-		break;
-	default:
-		return;
+	if (latency > 0) {
+		divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
+		bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
+			       KYBER_LATENCY_BUCKETS - 1);
+	} else {
+		bucket = 0;
 	}
 
-	/* If we are already monitoring latencies, don't check again. */
-	if (blk_stat_is_active(kqd->cb))
-		return;
+	atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
+}
 
-	if (now < rq->io_start_time_ns)
+static void kyber_completed_request(struct request *rq, u64 now)
+{
+	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+	struct kyber_cpu_latency *cpu_latency;
+	unsigned int sched_domain;
+	u64 target;
+
+	sched_domain = kyber_sched_domain(rq->cmd_flags);
+	if (sched_domain == KYBER_OTHER)
 		return;
 
-	latency = now - rq->io_start_time_ns;
+	cpu_latency = get_cpu_ptr(kqd->cpu_latency);
+	target = kqd->latency_targets[sched_domain];
+	add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
+			   target, now - rq->start_time_ns);
+	add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
+			   now - rq->io_start_time_ns);
+	put_cpu_ptr(kqd->cpu_latency);
 
-	if (latency > target)
-		blk_stat_activate_msecs(kqd->cb, 10);
+	timer_reduce(&kqd->timer, jiffies + HZ / 10);
 }
 
 struct flush_kcq_data {
@@ -778,17 +833,17 @@  static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
-#define KYBER_LAT_SHOW_STORE(op)					\
-static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,		\
-				     char *page)			\
+#define KYBER_LAT_SHOW_STORE(domain, name)				\
+static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,	\
+				       char *page)			\
 {									\
 	struct kyber_queue_data *kqd = e->elevator_data;		\
 									\
-	return sprintf(page, "%llu\n", kqd->op##_lat_nsec);		\
+	return sprintf(page, "%llu\n", kqd->latency_targets[domain]);	\
 }									\
 									\
-static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,		\
-				      const char *page, size_t count)	\
+static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,	\
+					const char *page, size_t count)	\
 {									\
 	struct kyber_queue_data *kqd = e->elevator_data;		\
 	unsigned long long nsec;					\
@@ -798,12 +853,12 @@  static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,		\
 	if (ret)							\
 		return ret;						\
 									\
-	kqd->op##_lat_nsec = nsec;					\
+	kqd->latency_targets[domain] = nsec;				\
 									\
 	return count;							\
 }
-KYBER_LAT_SHOW_STORE(read);
-KYBER_LAT_SHOW_STORE(write);
+KYBER_LAT_SHOW_STORE(KYBER_READ, read);
+KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
 #undef KYBER_LAT_SHOW_STORE
 
 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -870,7 +925,8 @@  static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
 	return 0;							\
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
-KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 
@@ -892,8 +948,11 @@  static int kyber_cur_domain_show(void *data, struct seq_file *m)
 	case KYBER_READ:
 		seq_puts(m, "READ\n");
 		break;
-	case KYBER_SYNC_WRITE:
-		seq_puts(m, "SYNC_WRITE\n");
+	case KYBER_WRITE:
+		seq_puts(m, "WRITE\n");
+		break;
+	case KYBER_DISCARD:
+		seq_puts(m, "DISCARD\n");
 		break;
 	case KYBER_OTHER:
 		seq_puts(m, "OTHER\n");
@@ -918,7 +977,8 @@  static int kyber_batching_show(void *data, struct seq_file *m)
 	{#name "_tokens", 0400, kyber_##name##_tokens_show}
 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 	KYBER_QUEUE_DOMAIN_ATTRS(read),
-	KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
+	KYBER_QUEUE_DOMAIN_ATTRS(write),
+	KYBER_QUEUE_DOMAIN_ATTRS(discard),
 	KYBER_QUEUE_DOMAIN_ATTRS(other),
 	{"async_depth", 0400, kyber_async_depth_show},
 	{},
@@ -930,7 +990,8 @@  static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 	{#name "_waiting", 0400, kyber_##name##_waiting_show}
 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 	KYBER_HCTX_DOMAIN_ATTRS(read),
-	KYBER_HCTX_DOMAIN_ATTRS(sync_write),
+	KYBER_HCTX_DOMAIN_ATTRS(write),
+	KYBER_HCTX_DOMAIN_ATTRS(discard),
 	KYBER_HCTX_DOMAIN_ATTRS(other),
 	{"cur_domain", 0400, kyber_cur_domain_show},
 	{"batching", 0400, kyber_batching_show},