From patchwork Thu Sep 24 19:25:11 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Vivek Goyal X-Patchwork-Id: 50014 Received: from hormel.redhat.com (hormel1.redhat.com [209.132.177.33]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n8OJg3QD004779 for ; Thu, 24 Sep 2009 19:42:03 GMT Received: from listman.util.phx.redhat.com (listman.util.phx.redhat.com [10.8.4.110]) by hormel.redhat.com (Postfix) with ESMTP id 69AB261AAB3; Thu, 24 Sep 2009 15:33:15 -0400 (EDT) Received: from int-mx04.intmail.prod.int.phx2.redhat.com (nat-pool.util.phx.redhat.com [10.8.5.200]) by listman.util.phx.redhat.com (8.13.1/8.13.1) with ESMTP id n8OJPZoe006023 for ; Thu, 24 Sep 2009 15:25:35 -0400 Received: from machine.usersys.redhat.com (dhcp-100-19-148.bos.redhat.com [10.16.19.148]) by int-mx04.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with ESMTP id n8OJPZiZ027741; Thu, 24 Sep 2009 15:25:35 -0400 Received: by machine.usersys.redhat.com (Postfix, from userid 10451) id 30032262F2; Thu, 24 Sep 2009 15:25:33 -0400 (EDT) From: Vivek Goyal To: linux-kernel@vger.kernel.org, jens.axboe@oracle.com Date: Thu, 24 Sep 2009 15:25:11 -0400 Message-Id: <1253820332-10246-8-git-send-email-vgoyal@redhat.com> In-Reply-To: <1253820332-10246-1-git-send-email-vgoyal@redhat.com> References: <1253820332-10246-1-git-send-email-vgoyal@redhat.com> X-Scanned-By: MIMEDefang 2.67 on 10.5.11.17 X-loop: dm-devel@redhat.com Cc: dhaval@linux.vnet.ibm.com, peterz@infradead.org, dm-devel@redhat.com, dpshah@google.com, agk@redhat.com, balbir@linux.vnet.ibm.com, paolo.valente@unimore.it, jmarchan@redhat.com, guijianfeng@cn.fujitsu.com, fernando@oss.ntt.co.jp, mikew@google.com, jmoyer@redhat.com, nauman@google.com, mingo@elte.hu, vgoyal@redhat.com, m-ikeda@ds.jp.nec.com, riel@redhat.com, lizf@cn.fujitsu.com, fchecconi@gmail.com, s-uchida@ap.jp.nec.com, containers@lists.linux-foundation.org, akpm@linux-foundation.org, righi.andrea@gmail.com, torvalds@linux-foundation.org Subject: [dm-devel] [PATCH 07/28] io-controller: cgroup related changes for hierarchical group support X-BeenThere: dm-devel@redhat.com X-Mailman-Version: 2.1.5 Precedence: junk Reply-To: device-mapper development List-Id: device-mapper development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: dm-devel-bounces@redhat.com Errors-To: dm-devel-bounces@redhat.com o This patch introduces some of the cgroup related code for io controller. Signed-off-by: Fabio Checconi Signed-off-by: Paolo Valente Signed-off-by: Nauman Rafique Signed-off-by: Gui Jianfeng Signed-off-by: Vivek Goyal Acked-by: Rik van Riel --- block/blk-ioc.c | 3 + block/elevator-fq.c | 169 ++++++++++++++++++++++++++++++++++++++++- block/elevator-fq.h | 14 ++++ include/linux/cgroup_subsys.h | 6 ++ include/linux/iocontext.h | 5 + 5 files changed, 196 insertions(+), 1 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d4ed600..0d56336 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -95,6 +95,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) spin_lock_init(&ret->lock); ret->ioprio_changed = 0; ret->ioprio = 0; +#ifdef CONFIG_GROUP_IOSCHED + ret->cgroup_changed = 0; +#endif ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; diff --git a/block/elevator-fq.c b/block/elevator-fq.c index 0e3d58c..0c060a6 100644 --- a/block/elevator-fq.c +++ b/block/elevator-fq.c @@ -265,7 +265,7 @@ static void entity_served(struct io_entity *entity, unsigned long served, unsigned long charge = queue_charge; for_each_entity(entity) { - entity->vdisktime += elv_delta_fair(queue_charge, entity); + entity->vdisktime += elv_delta_fair(charge, entity); update_min_vdisktime(entity->st); /* Group charge can be different from queue charge */ charge = group_charge; @@ -920,6 +920,173 @@ EXPORT_SYMBOL(elv_io_group_set_async_queue); #ifdef CONFIG_GROUP_IOSCHED +struct io_cgroup io_root_cgroup = { + .weight = IO_WEIGHT_DEFAULT, + .ioprio_class = IOPRIO_CLASS_BE, +}; + +static struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, io_subsys_id), + struct io_cgroup, css); +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct io_cgroup *iocg; \ + u64 ret; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + iocg = cgroup_to_io_cgroup(cgroup); \ + spin_lock_irq(&iocg->lock); \ + ret = iocg->__VAR; \ + spin_unlock_irq(&iocg->lock); \ + \ + cgroup_unlock(); \ + \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int io_cgroup_##__VAR##_write(struct cgroup *cgroup, \ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct io_cgroup *iocg; \ + struct io_group *iog; \ + struct hlist_node *n; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return -EINVAL; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + iocg = cgroup_to_io_cgroup(cgroup); \ + \ + spin_lock_irq(&iocg->lock); \ + iocg->__VAR = (unsigned long)val; \ + hlist_for_each_entry(iog, n, &iocg->group_data, group_node) { \ + iog->entity.__VAR = (unsigned long)val; \ + smp_wmb(); \ + iog->entity.ioprio_changed = 1; \ + } \ + spin_unlock_irq(&iocg->lock); \ + \ + cgroup_unlock(); \ + \ + return 0; \ +} + +STORE_FUNCTION(weight, IO_WEIGHT_MIN, IO_WEIGHT_MAX); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION + +struct cftype io_files[] = { + { + .name = "weight", + .read_u64 = io_cgroup_weight_read, + .write_u64 = io_cgroup_weight_write, + }, + { + .name = "ioprio_class", + .read_u64 = io_cgroup_ioprio_class_read, + .write_u64 = io_cgroup_ioprio_class_write, + }, +}; + +static int iocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, io_files, ARRAY_SIZE(io_files)); +} + +static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys, + struct cgroup *cgroup) +{ + struct io_cgroup *iocg; + + if (cgroup->parent != NULL) { + iocg = kzalloc(sizeof(*iocg), GFP_KERNEL); + if (iocg == NULL) + return ERR_PTR(-ENOMEM); + } else + iocg = &io_root_cgroup; + + spin_lock_init(&iocg->lock); + INIT_HLIST_HEAD(&iocg->group_data); + iocg->weight = IO_WEIGHT_DEFAULT; + iocg->ioprio_class = IOPRIO_CLASS_BE; + + return &iocg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int iocg_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct task_struct *tsk) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too young or + * exiting: if it has still no ioc the ioc can't be shared, + * if the task is exiting the attach will fail anyway, no + * matter what we return here. + */ + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void iocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + + /* Implemented in later patch */ +} + +struct cgroup_subsys io_subsys = { + .name = "io", + .create = iocg_create, + .can_attach = iocg_can_attach, + .attach = iocg_attach, + .destroy = iocg_destroy, + .populate = iocg_populate, + .subsys_id = io_subsys_id, + .use_id = 1, +}; + static void io_free_root_group(struct elevator_queue *e) { struct io_group *iog = e->efqd->root_group; diff --git a/block/elevator-fq.h b/block/elevator-fq.h index 068f240..f343841 100644 --- a/block/elevator-fq.h +++ b/block/elevator-fq.h @@ -13,6 +13,7 @@ #ifdef CONFIG_BLOCK #include +#include #ifndef _ELV_SCHED_H #define _ELV_SCHED_H @@ -98,6 +99,8 @@ struct io_group { struct io_entity entity; atomic_t ref; struct io_sched_data sched_data; + struct hlist_node group_node; + unsigned short iocg_id; /* * async queue for each priority case for RT and BE class. * Used only for cfq. @@ -108,6 +111,17 @@ struct io_group { void *key; }; +struct io_cgroup { + struct cgroup_subsys_state css; + + unsigned int weight; + unsigned short ioprio_class; + + spinlock_t lock; + struct hlist_head group_data; +}; + + #else /* CONFIG_GROUP_IOSCHED */ struct io_group { diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31b..baf544f 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -60,3 +60,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_GROUP_IOSCHED +SUBSYS(io) +#endif + +/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 4da4a75..b343594 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -73,6 +73,11 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_GROUP_IOSCHED + /* If task changes the cgroup, elevator processes it asynchronously */ + unsigned short cgroup_changed; +#endif + /* * For request batching */