@@ -1357,6 +1357,8 @@ alloc_cfqq:
/* call it after cfq has initialized queue prio */
elv_init_ioq_io_group(ioq, iog);
+ /* ioq reference on iog */
+ elv_get_iog(iog);
cfq_log_cfqq(cfqd, cfqq, "alloced");
} else {
cfqq = &cfqd->oom_cfqq;
@@ -677,6 +677,7 @@ void elv_put_ioq(struct io_queue *ioq)
{
struct elv_fq_data *efqd = ioq->efqd;
struct elevator_queue *e = efqd->eq;
+ struct io_group *iog;
BUG_ON(atomic_read(&ioq->ref) <= 0);
if (!atomic_dec_and_test(&ioq->ref))
@@ -684,12 +685,14 @@ void elv_put_ioq(struct io_queue *ioq)
BUG_ON(ioq->nr_queued);
BUG_ON(elv_ioq_busy(ioq));
BUG_ON(efqd->active_queue == ioq);
+ iog = ioq_to_io_group(ioq);
/* Can be called by outgoing elevator. Don't use q */
BUG_ON(!e->ops->elevator_free_sched_queue_fn);
e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue);
elv_log_ioq(efqd, ioq, "put_queue");
elv_free_ioq(ioq);
+ elv_put_iog(iog);
}
EXPORT_SYMBOL(elv_put_ioq);
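The two hunks above pair up: the queue allocation path takes a reference on the io group right after elv_init_ioq_io_group(), and elv_put_ioq() drops that reference only after the queue itself has been freed. A minimal sketch of the pairing, using only symbols introduced by this patch (illustration, not additional patch code):

	/* queue creation side (first hunk above): the ioq pins its group */
	elv_init_ioq_io_group(ioq, iog);
	elv_get_iog(iog);

	/* final elv_put_ioq(): free the queue first, then drop the group pin */
	elv_free_ioq(ioq);
	elv_put_iog(iog);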
@@ -919,6 +922,27 @@ void elv_io_group_set_async_queue(struct io_group *iog, int ioprio_class,
EXPORT_SYMBOL(elv_io_group_set_async_queue);
#ifdef CONFIG_GROUP_IOSCHED
+static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup);
+
+static void io_group_init_entity(struct io_cgroup *iocg, struct io_group *iog)
+{
+ struct io_entity *entity = &iog->entity;
+
+ entity->weight = iocg->weight;
+ entity->ioprio_class = iocg->ioprio_class;
+ entity->ioprio_changed = 1;
+ entity->my_sd = &iog->sched_data;
+}
+
+static void io_group_set_parent(struct io_group *iog, struct io_group *parent)
+{
+ struct io_entity *entity = &iog->entity;
+
+ init_io_entity_parent(entity, &parent->entity);
+
+ /* Child group reference on parent group. */
+ elv_get_iog(parent);
+}
struct io_cgroup io_root_cgroup = {
.weight = IO_WEIGHT_DEFAULT,
@@ -931,6 +955,27 @@ static struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup)
struct io_cgroup, css);
}
+/*
+ * Search iocg's group_data (a hash table in intent, just a list for now)
+ * for the io_group belonging to this efqd key. Must be called under
+ * rcu_read_lock().
+ */
+static struct io_group *
+io_cgroup_lookup_group(struct io_cgroup *iocg, void *key)
+{
+ struct io_group *iog;
+ struct hlist_node *n;
+ void *__key;
+
+ hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) {
+ __key = rcu_dereference(iog->key);
+ if (__key == key)
+ return iog;
+ }
+
+ return NULL;
+}
+
#define SHOW_FUNCTION(__VAR) \
static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \
struct cftype *cftype) \
@@ -1070,12 +1115,6 @@ static void iocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
task_unlock(tsk);
}
-static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
-{
-
- /* Implemented in later patch */
-}
-
struct cgroup_subsys io_subsys = {
.name = "io",
.create = iocg_create,
@@ -1087,11 +1126,196 @@ struct cgroup_subsys io_subsys = {
.use_id = 1,
};
+static inline unsigned int iog_weight(struct io_group *iog)
+{
+ return iog->entity.weight;
+}
+
+static struct io_group *
+io_group_chain_alloc(struct request_queue *q, void *key, struct cgroup *cgroup)
+{
+ struct io_cgroup *iocg;
+ struct io_group *iog, *leaf = NULL, *prev = NULL;
+ gfp_t flags = GFP_ATOMIC | __GFP_ZERO;
+
+ for (; cgroup != NULL; cgroup = cgroup->parent) {
+ iocg = cgroup_to_io_cgroup(cgroup);
+
+ iog = io_cgroup_lookup_group(iocg, key);
+ if (iog != NULL) {
+			/*
+			 * All the cgroups on the path from here to the
+			 * root already have an io_group for this efqd, so
+			 * no further allocations are needed.
+			 */
+ break;
+ }
+
+ iog = kzalloc_node(sizeof(*iog), flags, q->node);
+ if (!iog)
+ goto cleanup;
+
+ iog->iocg_id = css_id(&iocg->css);
+
+ io_group_init_entity(iocg, iog);
+
+ atomic_set(&iog->ref, 0);
+
+		/*
+		 * Take the initial reference that will be released on
+		 * destroy. It can be thought of as a joint reference held
+		 * by the cgroup and the elevator, dropped by whichever of
+		 * elevator exit or cgroup deletion runs first.
+		 */
+ elv_get_iog(iog);
+
+ if (leaf == NULL) {
+ leaf = iog;
+ prev = leaf;
+ } else {
+ io_group_set_parent(prev, iog);
+			/*
+			 * Build a list of allocated nodes through the key
+			 * field, which is still unused here and only gets
+			 * its real value once the node is connected.
+			 */
+ prev->key = iog;
+ prev = iog;
+ }
+ }
+
+ return leaf;
+
+cleanup:
+ while (leaf != NULL) {
+ prev = leaf;
+ leaf = leaf->key;
+ kfree(prev);
+ }
+
+ return NULL;
+}
+
+static void io_group_chain_link(struct request_queue *q, void *key,
+ struct cgroup *cgroup, struct io_group *leaf,
+ struct elv_fq_data *efqd)
+{
+ struct io_cgroup *iocg;
+ struct io_group *iog, *next, *prev = NULL;
+ unsigned long flags;
+
+ assert_spin_locked(q->queue_lock);
+
+ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
+ iocg = cgroup_to_io_cgroup(cgroup);
+ next = leaf->key;
+
+ iog = io_cgroup_lookup_group(iocg, key);
+ BUG_ON(iog != NULL);
+
+ spin_lock_irqsave(&iocg->lock, flags);
+
+ rcu_assign_pointer(leaf->key, key);
+ hlist_add_head_rcu(&leaf->group_node, &iocg->group_data);
+ hlist_add_head(&leaf->elv_data_node, &efqd->group_list);
+
+ spin_unlock_irqrestore(&iocg->lock, flags);
+
+ prev = leaf;
+ leaf = next;
+ }
+
+ BUG_ON(cgroup == NULL && leaf != NULL);
+
+ /*
+ * This connects the topmost element of the allocated chain to the
+ * parent group.
+ */
+ if (cgroup != NULL && prev != NULL) {
+ iocg = cgroup_to_io_cgroup(cgroup);
+ iog = io_cgroup_lookup_group(iocg, key);
+ io_group_set_parent(prev, iog);
+ }
+}
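To make the two-phase scheme above easier to follow, here is a sketch (not patch code) of what io_group_chain_alloc() and io_group_chain_link() do for a cgroup path root <- A <- B when only the root already has a group for this efqd:

	/*
	 * io_group_chain_alloc() walks B, then A, and stops at root because
	 * the lookup succeeds there. It returns leaf = B, with B's entity
	 * already parented to A, and B->key = A used as a temporary "next"
	 * pointer (A->key is still NULL from kzalloc).
	 *
	 * io_group_chain_link() then walks leaf->key under iocg->lock,
	 * overwrites each ->key with the real efqd key, publishes the new
	 * groups on iocg->group_data and efqd->group_list, and finally
	 * parents the topmost allocated node (A) to the pre-existing root
	 * group.
	 */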
+
+static struct io_group *io_find_alloc_group(struct request_queue *q,
+ struct cgroup *cgroup, struct elv_fq_data *efqd,
+ int create)
+{
+ struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup);
+ struct io_group *iog = NULL;
+ /* Note: Use efqd as key */
+ void *key = efqd;
+
+	/*
+	 * Take a reference on the css object. We do not want to map a bio
+	 * to a group that has been marked for deletion.
+	 */
+
+ if (!iocg || !css_tryget(&iocg->css))
+ return iog;
+
+ iog = io_cgroup_lookup_group(iocg, key);
+ if (iog != NULL || !create)
+ goto end;
+
+ iog = io_group_chain_alloc(q, key, cgroup);
+ if (iog != NULL)
+ io_group_chain_link(q, key, cgroup, iog, efqd);
+
+end:
+ css_put(&iocg->css);
+ return iog;
+}
+
+/*
+ * Search for the io group the current task belongs to. If create=1, also
+ * create the io group if it is not already there.
+ *
+ * Note: this function must be called with the queue lock held. It returns
+ * a pointer to the io group without taking a reference. The group is
+ * guaranteed to stay around as long as the queue lock is held, because the
+ * group reclaim code needs the queue lock to make progress. Anyone who
+ * wants to use the group pointer after dropping the queue lock must take
+ * a reference on the group first.
+ */
+struct io_group *elv_io_get_io_group(struct request_queue *q, int create)
+{
+ struct cgroup *cgroup;
+ struct io_group *iog;
+ struct elv_fq_data *efqd = q->elevator->efqd;
+
+ assert_spin_locked(q->queue_lock);
+
+ rcu_read_lock();
+ cgroup = task_cgroup(current, io_subsys_id);
+ iog = io_find_alloc_group(q, cgroup, efqd, create);
+ if (!iog) {
+ if (create)
+ iog = efqd->root_group;
+ else
+ /*
+			/*
+			 * bio merge lookups do not want the bio mapped to
+			 * the root group by default.
+			 */
+ iog = NULL;
+ }
+ rcu_read_unlock();
+ return iog;
+}
+EXPORT_SYMBOL(elv_io_get_io_group);
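Because elv_io_get_io_group() returns the group pointer without taking a reference, any caller that wants to keep using it after dropping the queue lock has to pin it first. A minimal usage sketch under that rule (illustrative only, not part of the patch):

	spin_lock_irq(q->queue_lock);
	iog = elv_io_get_io_group(q, 1);	/* create=1 falls back to the root group */
	elv_get_iog(iog);			/* pin before dropping the lock */
	spin_unlock_irq(q->queue_lock);

	/* ... the group can be used here without the queue lock ... */

	elv_put_iog(iog);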
+
static void io_free_root_group(struct elevator_queue *e)
{
struct io_group *iog = e->efqd->root_group;
struct io_service_tree *st;
int i;
+ struct io_cgroup *iocg = &io_root_cgroup;
+
+ spin_lock_irq(&iocg->lock);
+ hlist_del_rcu(&iog->group_node);
+ spin_unlock_irq(&iocg->lock);
for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
st = iog->sched_data.service_tree + i;
@@ -1099,19 +1323,21 @@ static void io_free_root_group(struct elevator_queue *e)
}
put_io_group_queues(e, iog);
- kfree(iog);
+ elv_put_iog(iog);
}
static struct io_group *io_alloc_root_group(struct request_queue *q,
struct elevator_queue *e, void *key)
{
struct io_group *iog;
+ struct io_cgroup *iocg = &io_root_cgroup;
int i;
iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
if (iog == NULL)
return NULL;
+ elv_get_iog(iog);
iog->entity.parent = NULL;
iog->entity.my_sd = &iog->sched_data;
iog->key = key;
@@ -1119,11 +1345,235 @@ static struct io_group *io_alloc_root_group(struct request_queue *q,
for (i = 0; i < IO_IOPRIO_CLASSES; i++)
iog->sched_data.service_tree[i] = ELV_SERVICE_TREE_INIT;
+ spin_lock_irq(&iocg->lock);
+ rcu_assign_pointer(iog->key, key);
+ hlist_add_head_rcu(&iog->group_node, &iocg->group_data);
+ iog->iocg_id = css_id(&iocg->css);
+ spin_unlock_irq(&iocg->lock);
+
return iog;
}
+static void io_group_free_rcu(struct rcu_head *head)
+{
+ struct io_group *iog;
+
+ iog = container_of(head, struct io_group, rcu_head);
+ kfree(iog);
+}
+
+/*
+ * This does the final bit of io group teardown. It should only be called
+ * once the group has gone through __io_destroy_group().
+ */
+static void io_group_cleanup(struct io_group *iog)
+{
+ struct io_service_tree *st;
+ int i;
+
+ BUG_ON(iog->sched_data.active_entity != NULL);
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+ st = iog->sched_data.service_tree + i;
+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
+ BUG_ON(st->active_entity != NULL);
+ }
+
+ /*
+ * Wait for any rcu readers to exit before freeing up the group.
+ * Primarily useful when elv_io_get_io_group() is called without queue
+ * lock to access some group data from bdi_congested_group() path.
+ */
+ call_rcu(&iog->rcu_head, io_group_free_rcu);
+}
+
+void elv_put_iog(struct io_group *iog)
+{
+ struct io_group *parent_iog = NULL;
+ struct io_entity *parent;
+
+ BUG_ON(atomic_read(&iog->ref) <= 0);
+ if (!atomic_dec_and_test(&iog->ref))
+ return;
+
+ parent = parent_entity(&iog->entity);
+ if (parent)
+ parent_iog = iog_of(parent);
+
+ io_group_cleanup(iog);
+
+ if (parent_iog)
+ elv_put_iog(parent_iog);
+}
+EXPORT_SYMBOL(elv_put_iog);
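Since every child group pins its parent (io_group_set_parent() above takes elv_get_iog() on the parent), the final put on a leaf can cascade up the hierarchy. A rough sketch of the release order for a two-level chain, assuming no queues or cgroup references remain (illustration only):

	/*
	 * elv_put_iog(child):   child refcount hits 0
	 *   -> io_group_cleanup(child)   (kfree deferred via call_rcu)
	 *   -> elv_put_iog(parent)       (drops the child's pin on the parent,
	 *                                 possibly repeating one level up)
	 */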
+
+/*
+ * After the group is destroyed, no new sync IO should arrive for it. The
+ * group may still have pending IO on some busy queues and must be able to
+ * send that IO down to the disk. Async IO (due to dirty page writeback)
+ * goes to the root group's queues from now on, since this group no longer
+ * exists.
+ */
+static void __io_destroy_group(struct elv_fq_data *efqd, struct io_group *iog)
+{
+ struct io_service_tree *st;
+ int i;
+ struct io_entity *entity = &iog->entity;
+
+	/*
+	 * Mark the io group for deletion so that no new entries go onto
+	 * the idle tree. Any active queue removed from the active tree
+	 * will not be put onto the idle tree either.
+	 */
+ entity->exiting = 1;
+
+	/* Flush the idle tree now; nothing new is added to it any more. */
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+ st = iog->sched_data.service_tree + i;
+ flush_idle_tree(st);
+ }
+
+ hlist_del(&iog->elv_data_node);
+ put_io_group_queues(efqd->eq, iog);
+
+ if (entity->on_idle_st)
+ dequeue_io_entity_idle(entity);
+
+	/*
+	 * Drop the reference taken at creation time so that the group can
+	 * be destroyed once all of its queues are gone.
+	 */
+ elv_put_iog(iog);
+}
+
+static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+ struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup);
+ struct io_group *iog;
+ struct elv_fq_data *efqd;
+ unsigned long uninitialized_var(flags);
+
+	/*
+	 * io groups are linked on two lists: one maintained by the elevator
+	 * (efqd->group_list) and one maintained per cgroup
+	 * (iocg->group_data).
+	 *
+	 * While a cgroup is being deleted, the elevator might also be
+	 * exiting, and both paths may try to clean up the same io group,
+	 * so we need to be a little careful.
+	 *
+	 * iocg->group_data is protected by iocg->lock. To avoid deadlock
+	 * with the elevator exit path (which takes iocg->lock under the
+	 * queue lock), we must not acquire the queue lock while holding
+	 * iocg->lock. So we first remove the iog from iocg->group_data
+	 * under iocg->lock, then drop that lock; whoever removes an iog
+	 * from iocg->group_data is responsible for calling
+	 * __io_destroy_group() on it.
+	 */
+
+ rcu_read_lock();
+
+remove_entry:
+ spin_lock_irqsave(&iocg->lock, flags);
+
+ if (hlist_empty(&iocg->group_data)) {
+ spin_unlock_irqrestore(&iocg->lock, flags);
+ goto done;
+ }
+ iog = hlist_entry(iocg->group_data.first, struct io_group,
+ group_node);
+ efqd = rcu_dereference(iog->key);
+ hlist_del_rcu(&iog->group_node);
+ iog->iocg_id = 0;
+ spin_unlock_irqrestore(&iocg->lock, flags);
+
+ spin_lock_irqsave(efqd->queue->queue_lock, flags);
+ __io_destroy_group(efqd, iog);
+ spin_unlock_irqrestore(efqd->queue->queue_lock, flags);
+ goto remove_entry;
+
+done:
+ free_css_id(&io_subsys, &iocg->css);
+ rcu_read_unlock();
+ BUG_ON(!hlist_empty(&iocg->group_data));
+ kfree(iocg);
+}
+
+/*
+ * This function checks whether iog is still on iocg->group_data and, if so,
+ * removes it. If it is no longer on that list, the cgroup destroy path has
+ * already removed it and there is nothing left to do here.
+ */
+static void
+io_group_check_and_destroy(struct elv_fq_data *efqd, struct io_group *iog)
+{
+ struct io_cgroup *iocg;
+ unsigned long flags;
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ css = css_lookup(&io_subsys, iog->iocg_id);
+
+ if (!css)
+ goto out;
+
+ iocg = container_of(css, struct io_cgroup, css);
+
+ spin_lock_irqsave(&iocg->lock, flags);
+
+ if (iog->iocg_id) {
+ hlist_del_rcu(&iog->group_node);
+ __io_destroy_group(efqd, iog);
+ }
+
+ spin_unlock_irqrestore(&iocg->lock, flags);
+out:
+ rcu_read_unlock();
+}
+
+static void release_elv_io_groups(struct elevator_queue *e)
+{
+ struct hlist_node *pos, *n;
+ struct io_group *iog;
+ struct elv_fq_data *efqd = e->efqd;
+
+ hlist_for_each_entry_safe(iog, pos, n, &efqd->group_list,
+ elv_data_node) {
+ io_group_check_and_destroy(efqd, iog);
+ }
+}
+
+/*
+ * If the bio-submitting task and the rq do not belong to the same io_group,
+ * they cannot be merged.
+ */
+int elv_io_group_allow_merge(struct request *rq, struct bio *bio)
+{
+ struct request_queue *q = rq->q;
+ struct io_queue *ioq = rq->ioq;
+ struct io_group *iog, *__iog;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ return 1;
+
+ /* Determine the io group of the bio submitting task */
+ iog = elv_io_get_io_group(q, 0);
+ if (!iog) {
+		/*
+		 * Maybe the task belongs to a different cgroup for which an
+		 * io group has not been set up yet.
+		 */
+ return 0;
+ }
+
+	/* Determine the io group of the ioq that rq belongs to */
+ __iog = ioq_to_io_group(ioq);
+
+ return (iog == __iog);
+}
+
#else /* CONFIG_GROUP_IOSCHED */
+static inline unsigned int iog_weight(struct io_group *iog) { return 0; }
+static inline void release_elv_io_groups(struct elevator_queue *e) {}
+
static struct io_group *io_alloc_root_group(struct request_queue *q,
struct elevator_queue *e, void *key)
{
@@ -1207,8 +1657,13 @@ __elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq, int coop)
struct elevator_queue *eq = q->elevator;
if (ioq) {
- elv_log_ioq(efqd, ioq, "set_active, busy=%d",
- efqd->busy_queues);
+ struct io_group *iog = ioq_to_io_group(ioq);
+ elv_log_ioq(efqd, ioq, "set_active, busy=%d class=%hu prio=%hu"
+ " weight=%u group_weight=%u qued=%d",
+ efqd->busy_queues, ioq->entity.ioprio_class,
+ ioq->entity.ioprio, ioq->entity.weight,
+ iog_weight(iog), ioq->nr_queued);
+
ioq->slice_start = ioq->slice_end = 0;
ioq->dispatch_start = jiffies;
@@ -1387,6 +1842,7 @@ static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
struct io_queue *active_ioq;
struct elevator_queue *eq = q->elevator;
struct io_entity *entity, *new_entity;
+ struct io_group *iog = NULL, *new_iog = NULL;
active_ioq = elv_active_ioq(eq);
@@ -1419,9 +1875,16 @@ static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
return 1;
/*
- * Check with io scheduler if it has additional criterion based on
- * which it wants to preempt existing queue.
+	 * If both queues belong to the same group, check with the io
+	 * scheduler whether it has an additional criterion based on which
+	 * it wants to preempt the existing queue.
*/
+ iog = ioq_to_io_group(active_ioq);
+ new_iog = ioq_to_io_group(new_ioq);
+
+ if (iog != new_iog)
+ return 0;
+
if (eq->ops->elevator_should_preempt_fn) {
void *sched_queue = elv_ioq_sched_queue(new_ioq);
@@ -1569,6 +2032,10 @@ static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
if (new_ioq)
elv_log_ioq(e->efqd, ioq, "cooperating ioq=%d", new_ioq->pid);
+	/* Only select a co-operating queue if it is in the same group as ioq */
+ if (new_ioq && !is_same_group(&ioq->entity, &new_ioq->entity))
+ return NULL;
+
return new_ioq;
}
@@ -1873,6 +2340,7 @@ int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
efqd->idle_slice_timer.data = (unsigned long) efqd;
INIT_WORK(&efqd->unplug_work, elv_kick_queue);
+ INIT_HLIST_HEAD(&efqd->group_list);
efqd->elv_slice[0] = elv_slice_async;
efqd->elv_slice[1] = elv_slice_sync;
@@ -1890,12 +2358,22 @@ int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
void elv_exit_fq_data(struct elevator_queue *e)
{
struct elv_fq_data *efqd = e->efqd;
+ struct request_queue *q = efqd->queue;
if (!elv_iosched_fair_queuing_enabled(e))
return;
elv_shutdown_timer_wq(e);
+ spin_lock_irq(q->queue_lock);
+ release_elv_io_groups(e);
+ spin_unlock_irq(q->queue_lock);
+
+ elv_shutdown_timer_wq(e);
+
+ /* Wait for iog->key accessors to exit their grace periods. */
+ synchronize_rcu();
+
BUG_ON(timer_pending(&efqd->idle_slice_timer));
io_free_root_group(e);
}
@@ -100,6 +100,7 @@ struct io_group {
atomic_t ref;
struct io_sched_data sched_data;
struct hlist_node group_node;
+ struct hlist_node elv_data_node;
unsigned short iocg_id;
/*
* async queue for each priority case for RT and BE class.
@@ -109,6 +110,7 @@ struct io_group {
struct io_queue *async_queue[2][IOPRIO_BE_NR];
struct io_queue *async_idle_queue;
void *key;
+ struct rcu_head rcu_head;
};
struct io_cgroup {
@@ -142,6 +144,9 @@ struct io_group {
struct elv_fq_data {
struct io_group *root_group;
+ /* List of io groups hanging on this elevator */
+ struct hlist_head group_list;
+
struct request_queue *queue;
struct elevator_queue *eq;
unsigned int busy_queues;
@@ -322,6 +327,28 @@ static inline struct io_queue *elv_get_oom_ioq(struct elevator_queue *eq)
return &eq->efqd->oom_ioq;
}
+#ifdef CONFIG_GROUP_IOSCHED
+
+extern int elv_io_group_allow_merge(struct request *rq, struct bio *bio);
+extern void elv_put_iog(struct io_group *iog);
+extern struct io_group *elv_io_get_io_group(struct request_queue *q,
+ int create);
+
+static inline void elv_get_iog(struct io_group *iog)
+{
+ atomic_inc(&iog->ref);
+}
+
+#else /* !GROUP_IOSCHED */
+
+static inline int elv_io_group_allow_merge(struct request *rq, struct bio *bio)
+{
+ return 1;
+}
+
+static inline void elv_get_iog(struct io_group *iog) {}
+static inline void elv_put_iog(struct io_group *iog) {}
+
static inline struct io_group *
elv_io_get_io_group(struct request_queue *q, int create)
{
@@ -329,6 +356,8 @@ elv_io_get_io_group(struct request_queue *q, int create)
return q->elevator->efqd->root_group;
}
+#endif /* GROUP_IOSCHED */
+
extern ssize_t elv_slice_sync_show(struct elevator_queue *q, char *name);
extern ssize_t elv_slice_sync_store(struct elevator_queue *q, const char *name,
size_t count);
@@ -413,6 +442,12 @@ static inline void *elv_select_ioq(struct request_queue *q, int force)
{
return NULL;
}
+
+static inline int elv_io_group_allow_merge(struct request *rq, struct bio *bio)
+{
+ return 1;
+}
#endif /* CONFIG_ELV_FAIR_QUEUING */
#endif /* _ELV_SCHED_H */
#endif /* CONFIG_BLOCK */
@@ -122,6 +122,10 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
!bio_failfast_driver(bio) != !blk_failfast_driver(rq))
return 0;
+	/* If rq and bio belong to different groups, don't allow merging */
+ if (!elv_io_group_allow_merge(rq, bio))
+ return 0;
+
if (!elv_iosched_allow_merge(rq, bio))
return 0;