@@ -21,6 +21,8 @@
#define ELV_SLICE_SCALE (500)
#define ELV_SERVICE_SHIFT 20
+static void check_idle_tree_release(struct io_service_tree *st);
+
static inline struct io_queue *ioq_of(struct io_entity *entity)
{
if (entity->my_sd == NULL)
@@ -78,6 +80,11 @@ elv_prio_to_slice(struct elv_fq_data *efqd, struct io_queue *ioq)
return elv_weight_slice(efqd, elv_ioq_sync(ioq), ioq->entity.weight);
}
+/*
+ * Wrap-safe "a is later than b" comparison of two vdisktimes: the
+ * difference is evaluated as signed so the result stays correct even
+ * after the u64 counters wrap (same idiom as time_after()).
+ */
+static inline int vdisktime_gt(u64 a, u64 b)
+{
+	return (s64)(a - b) > 0;
+}
+
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
s64 delta = (s64)(vdisktime - min_vdisktime);
@@ -114,6 +121,7 @@ static void update_min_vdisktime(struct io_service_tree *st)
}
st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
+ check_idle_tree_release(st);
}
static inline struct io_entity *parent_entity(struct io_entity *entity)
@@ -167,27 +175,46 @@ static void place_entity(struct io_service_tree *st, struct io_entity *entity,
struct rb_node *parent;
struct io_entity *entry;
int nr_active = st->nr_active - 1;
+ struct io_queue *ioq = ioq_of(entity);
+ int sync = 1;
+
+ if (ioq)
+ sync = elv_ioq_sync(ioq);
+
+ if (add_front || !nr_active) {
+ vdisktime = st->min_vdisktime;
+ goto done;
+ }
+
+ if (sync && entity->vdisktime
+ && vdisktime_gt(entity->vdisktime, st->min_vdisktime)) {
+ /* vdisktime still in future. Use old vdisktime */
+ vdisktime = entity->vdisktime;
+ goto done;
+ }
/*
- * Currently put entity at the end of last entity. This probably will
- * require adjustments as we move along
+ * Effectively a new queue. Assign sync queue a lower vdisktime so
+ * we can achieve better latencies for small file readers. For async
+ * queues, put them at the end of the existing queue.
+ * Group entities are always considered sync.
*/
- if (io_entity_class_idle(entity)) {
- vdisktime = elv_delta_fair(ELV_IDLE_DELAY, entity);
- parent = rb_last(&st->active);
- if (parent) {
- entry = rb_entry(parent, struct io_entity, rb_node);
- vdisktime += entry->vdisktime;
- }
- } else if (!add_front && nr_active) {
- parent = rb_last(&st->active);
- if (parent) {
- entry = rb_entry(parent, struct io_entity, rb_node);
- vdisktime = entry->vdisktime;
- }
- } else
+ if (sync) {
vdisktime = st->min_vdisktime;
+ goto done;
+ }
+ /*
+ * Put entity at the end of the tree. Effectively async queues use
+ * this path.
+ */
+ parent = rb_last(&st->active);
+ if (parent) {
+ entry = rb_entry(parent, struct io_entity, rb_node);
+ vdisktime = entry->vdisktime;
+ } else
+ vdisktime = st->min_vdisktime;
+done:
entity->vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
}
@@ -200,6 +227,122 @@ static inline void io_entity_update_prio(struct io_entity *entity)
*/
init_io_entity_service_tree(entity, parent_entity(entity));
entity->ioprio_changed = 0;
+
+ /*
+ * Assign this entity a fresh vdisktime instead of using
+ * previous one as prio class will lead to service tree
+ * change and this vdisktime will not be valid on new
+ * service tree.
+ *
+ * TODO: Handle the case of only prio change.
+ */
+ entity->vdisktime = 0;
+ }
+}
+
+/*
+ * Unlink @entity from the idle rb-tree of @st, keeping the cached
+ * leftmost pointer valid. Tree-level helper only: refcounting and the
+ * on_idle_st flag are handled by dequeue_io_entity_idle().
+ */
+static void
+__dequeue_io_entity_idle(struct io_service_tree *st, struct io_entity *entity)
+{
+	/* Removing the cached leftmost node: advance the cache first */
+	if (st->rb_leftmost_idle == &entity->rb_node) {
+		struct rb_node *next_node;
+
+		next_node = rb_next(&entity->rb_node);
+		st->rb_leftmost_idle = next_node;
+	}
+
+	rb_erase(&entity->rb_node, &st->idle);
+	RB_CLEAR_NODE(&entity->rb_node);
+}
+
+/*
+ * Take @entity off its service tree's idle tree and drop the queue
+ * reference that enqueue_io_entity_idle() acquired. Group entities
+ * have no ioq (ioq_of() returns NULL) and hence no reference to drop.
+ */
+static void dequeue_io_entity_idle(struct io_entity *entity)
+{
+	struct io_queue *ioq = ioq_of(entity);
+
+	__dequeue_io_entity_idle(entity->st, entity);
+	entity->on_idle_st = 0;
+	if (ioq)
+		elv_put_ioq(ioq);
+}
+
+/*
+ * Insert @entity into the idle rb-tree of @st, ordered by vdisktime
+ * using the wrap-safe comparison. Equal keys go to the right, so
+ * entities with the same vdisktime keep their insertion order.
+ */
+static void
+__enqueue_io_entity_idle(struct io_service_tree *st, struct io_entity *entity)
+{
+	struct rb_node **node = &st->idle.rb_node;
+	struct rb_node *parent = NULL;
+	struct io_entity *entry;
+	int leftmost = 1;
+
+	while (*node != NULL) {
+		parent = *node;
+		entry = rb_entry(parent, struct io_entity, rb_node);
+
+		if (vdisktime_gt(entry->vdisktime, entity->vdisktime))
+			node = &parent->rb_left;
+		else {
+			node = &parent->rb_right;
+			leftmost = 0;
+		}
+	}
+
+	/*
+	 * Maintain a cache of leftmost tree entries (it is frequently
+	 * used)
+	 */
+	if (leftmost)
+		st->rb_leftmost_idle = &entity->rb_node;
+
+	rb_link_node(&entity->rb_node, parent, node);
+	rb_insert_color(&entity->rb_node, &st->idle);
+}
+
+/*
+ * Cache an expired entity on its service tree's idle tree so its
+ * vdisktime is preserved until min_vdisktime catches up with it
+ * (see check_idle_tree_release()). A reference is held on the queue,
+ * if any, for as long as it sits on the idle tree; the matching put
+ * is in dequeue_io_entity_idle().
+ */
+static void enqueue_io_entity_idle(struct io_entity *entity)
+{
+	struct io_queue *ioq = ioq_of(entity);
+	struct io_group *parent_iog;
+
+	/*
+	 * Don't put an entity on idle tree if it has been marked for deletion.
+	 * We are not expecting more io from this entity. No need to cache it
+	 */
+
+	if (entity->exiting)
+		return;
+
+	/*
+	 * If parent group is exiting, don't put on idle tree. May be task got
+	 * moved to a different cgroup and original cgroup got deleted
+	 */
+	parent_iog = iog_of(parent_entity(entity));
+	if (parent_iog->entity.exiting)
+		return;
+
+	if (ioq)
+		elv_get_ioq(ioq);
+	__enqueue_io_entity_idle(entity->st, entity);
+	entity->on_idle_st = 1;
+}
+
+/*
+ * Called when min_vdisktime advances (from update_min_vdisktime()):
+ * once min_vdisktime has passed the smallest cached vdisktime, that
+ * entity's saved vdisktime would be ignored by place_entity() anyway,
+ * so drop it from the idle tree. Only the leftmost entity is checked
+ * per call; further expired entries are released on subsequent calls.
+ */
+static void check_idle_tree_release(struct io_service_tree *st)
+{
+	struct io_entity *leftmost;
+
+	if (!st->rb_leftmost_idle)
+		return;
+
+	leftmost = rb_entry(st->rb_leftmost_idle, struct io_entity, rb_node);
+
+	if (vdisktime_gt(st->min_vdisktime, leftmost->vdisktime))
+		dequeue_io_entity_idle(leftmost);
+}
+
+/*
+ * Release every entity cached on the idle tree of @st, dropping the
+ * queue references they hold. Walks via the leftmost cache, which
+ * dequeue_io_entity_idle() keeps advancing until the tree is empty.
+ */
+static void flush_idle_tree(struct io_service_tree *st)
+{
+	struct io_entity *entity;
+
+	while (st->rb_leftmost_idle) {
+		entity = rb_entry(st->rb_leftmost_idle, struct io_entity,
+					rb_node);
+		dequeue_io_entity_idle(entity);
}
}
@@ -235,6 +378,9 @@ static void dequeue_io_entity(struct io_entity *entity)
entity->on_st = 0;
st->nr_active--;
sd->nr_active--;
+
+ if (vdisktime_gt(entity->vdisktime, st->min_vdisktime))
+ enqueue_io_entity_idle(entity);
}
static void
@@ -276,6 +422,16 @@ static void enqueue_io_entity(struct io_entity *entity)
struct io_service_tree *st;
struct io_sched_data *sd = io_entity_sched_data(entity);
+ if (entity->on_idle_st)
+ dequeue_io_entity_idle(entity);
+ else
+ /*
+ * This entity was not in idle tree cache. Zero out vdisktime
+ * so that we don't rely on old vdisktime instead assign a
+ * fresh one.
+ */
+ entity->vdisktime = 0;
+
io_entity_update_prio(entity);
st = entity->st;
st->nr_active++;
@@ -28,6 +28,10 @@ struct io_service_tree {
u64 min_vdisktime;
struct rb_node *rb_leftmost;
unsigned int nr_active;
+
+ /* A cache of io entities which were served and expired */
+ struct rb_root idle;
+ struct rb_node *rb_leftmost_idle;
};
struct io_sched_data {
@@ -39,9 +43,12 @@ struct io_sched_data {
struct io_entity {
struct rb_node rb_node;
int on_st;
+ int on_idle_st;
u64 vdisktime;
unsigned int weight;
struct io_entity *parent;
+ /* This io entity (queue or group) has been marked for deletion */
+ unsigned int exiting;
struct io_sched_data *my_sd;
struct io_service_tree *st;