Message ID | 4A569FC5.7090801@cn.fujitsu.com (mailing list archive)
State | New, archived
On Fri, Jul 10, 2009 at 09:56:21AM +0800, Gui Jianfeng wrote: > Hi Vivek, > > This patch exports a cgroup based per group request limits interface. > and removes the global one. Now we can use this interface to perform > different request allocation limitation for different groups. > Thanks Gui. Few points come to mind. - You seem to be making this as per cgroup limit on all devices. I guess that different devices in the system can have different settings of q->nr_requests and hence will probably want different per group limit. So we might have to make it per cgroup per device limit. - There does not seem to be any checks for making sure that children cgroups don't have more request descriptors allocated than parent group. - I am re-thinking that what's the advantage of configuring request descriptors also through cgroups. It does bring in additional complexity with it and it should justfiy the advantages. Can you think of some? Until and unless we can come up with some significant advantages, I will prefer to continue to use per group limit through q->nr_group_requests interface instead of cgroup. Once things stablize, we can revisit it and see how this interface can be improved. Thanks Vivek > Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com> > --- > block/blk-core.c | 23 ++++++++++-- > block/blk-settings.c | 1 - > block/blk-sysfs.c | 43 ----------------------- > block/elevator-fq.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++--- > block/elevator-fq.h | 4 ++ > 5 files changed, 111 insertions(+), 54 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 79fe6a9..7010b76 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -722,13 +722,20 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) > static void __freed_request(struct request_queue *q, int sync, > struct request_list *rl) > { > + struct io_group *iog; > + unsigned long nr_group_requests; > + > if (q->rq_data.count[sync] < queue_congestion_off_threshold(q)) > blk_clear_queue_congested(q, sync); > > if (q->rq_data.count[sync] + 1 <= q->nr_requests) > blk_clear_queue_full(q, sync); > > - if (rl->count[sync] + 1 <= q->nr_group_requests) { > + iog = rl_iog(rl); > + > + nr_group_requests = get_group_requests(q, iog); > + > + if (nr_group_requests && rl->count[sync] + 1 <= nr_group_requests) { > if (waitqueue_active(&rl->wait[sync])) > wake_up(&rl->wait[sync]); > } > @@ -828,6 +835,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, > const bool is_sync = rw_is_sync(rw_flags) != 0; > int may_queue, priv; > int sleep_on_global = 0; > + struct io_group *iog; > + unsigned long nr_group_requests; > > may_queue = elv_may_queue(q, rw_flags); > if (may_queue == ELV_MQUEUE_NO) > @@ -843,7 +852,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags, > if (q->rq_data.count[is_sync]+1 >= q->nr_requests) > blk_set_queue_full(q, is_sync); > > - if (rl->count[is_sync]+1 >= q->nr_group_requests) { > + iog = rl_iog(rl); > + > + nr_group_requests = get_group_requests(q, iog); > + > + if (nr_group_requests && > + rl->count[is_sync]+1 >= nr_group_requests) { > ioc = current_io_context(GFP_ATOMIC, q->node); > /* > * The queue request descriptor group will fill after this > @@ -852,7 +866,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, > * This process will be allowed to complete a batch of > * requests, others will be blocked. 
> */ > - if (rl->count[is_sync] <= q->nr_group_requests) > + if (rl->count[is_sync] <= nr_group_requests) > ioc_set_batching(q, ioc); > else { > if (may_queue != ELV_MQUEUE_MUST > @@ -898,7 +912,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, > * from per group request list > */ > > - if (rl->count[is_sync] >= (3 * q->nr_group_requests / 2)) > + if (nr_group_requests && > + rl->count[is_sync] >= (3 * nr_group_requests / 2)) > goto out; > > rl->starved[is_sync] = 0; > diff --git a/block/blk-settings.c b/block/blk-settings.c > index 78b8aec..bd582a7 100644 > --- a/block/blk-settings.c > +++ b/block/blk-settings.c > @@ -148,7 +148,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) > * set defaults > */ > q->nr_requests = BLKDEV_MAX_RQ; > - q->nr_group_requests = BLKDEV_MAX_GROUP_RQ; > > q->make_request_fn = mfn; > blk_queue_dma_alignment(q, 511); > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c > index 92b9f25..706d852 100644 > --- a/block/blk-sysfs.c > +++ b/block/blk-sysfs.c > @@ -78,40 +78,8 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) > return ret; > } > #ifdef CONFIG_GROUP_IOSCHED > -static ssize_t queue_group_requests_show(struct request_queue *q, char *page) > -{ > - return queue_var_show(q->nr_group_requests, (page)); > -} > - > extern void elv_io_group_congestion_threshold(struct request_queue *q, > struct io_group *iog); > - > -static ssize_t > -queue_group_requests_store(struct request_queue *q, const char *page, > - size_t count) > -{ > - struct hlist_node *n; > - struct io_group *iog; > - struct elv_fq_data *efqd; > - unsigned long nr; > - int ret = queue_var_store(&nr, page, count); > - > - if (nr < BLKDEV_MIN_RQ) > - nr = BLKDEV_MIN_RQ; > - > - spin_lock_irq(q->queue_lock); > - > - q->nr_group_requests = nr; > - > - efqd = &q->elevator->efqd; > - > - hlist_for_each_entry(iog, n, &efqd->group_list, elv_data_node) { > - elv_io_group_congestion_threshold(q, iog); > - } > - > - spin_unlock_irq(q->queue_lock); > - return ret; > -} > #endif > > static ssize_t queue_ra_show(struct request_queue *q, char *page) > @@ -278,14 +246,6 @@ static struct queue_sysfs_entry queue_requests_entry = { > .store = queue_requests_store, > }; > > -#ifdef CONFIG_GROUP_IOSCHED > -static struct queue_sysfs_entry queue_group_requests_entry = { > - .attr = {.name = "nr_group_requests", .mode = S_IRUGO | S_IWUSR }, > - .show = queue_group_requests_show, > - .store = queue_group_requests_store, > -}; > -#endif > - > static struct queue_sysfs_entry queue_ra_entry = { > .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, > .show = queue_ra_show, > @@ -360,9 +320,6 @@ static struct queue_sysfs_entry queue_iostats_entry = { > > static struct attribute *default_attrs[] = { > &queue_requests_entry.attr, > -#ifdef CONFIG_GROUP_IOSCHED > - &queue_group_requests_entry.attr, > -#endif > &queue_ra_entry.attr, > &queue_max_hw_sectors_entry.attr, > &queue_max_sectors_entry.attr, > diff --git a/block/elevator-fq.c b/block/elevator-fq.c > index 29392e7..bfb0210 100644 > --- a/block/elevator-fq.c > +++ b/block/elevator-fq.c > @@ -59,6 +59,35 @@ elv_release_ioq(struct elevator_queue *eq, struct io_queue **ioq_ptr); > #define for_each_entity_safe(entity, parent) \ > for (; entity && ({ parent = entity->parent; 1; }); entity = parent) > > +unsigned short get_group_requests(struct request_queue *q, > + struct io_group *iog) > +{ > + struct cgroup_subsys_state *css; > + struct io_cgroup *iocg; > + unsigned long 
nr_group_requests; > + > + if (!iog) > + return q->nr_requests; > + > + rcu_read_lock(); > + > + if (!iog->iocg_id) { > + nr_group_requests = 0; > + goto out; > + } > + > + css = css_lookup(&io_subsys, iog->iocg_id); > + if (!css) { > + nr_group_requests = 0; > + goto out; > + } > + > + iocg = container_of(css, struct io_cgroup, css); > + nr_group_requests = iocg->nr_group_requests; > +out: > + rcu_read_unlock(); > + return nr_group_requests; > +} > > static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd, > int extract); > @@ -1257,14 +1286,17 @@ void elv_io_group_congestion_threshold(struct request_queue *q, > struct io_group *iog) > { > int nr; > + unsigned long nr_group_requests; > > - nr = q->nr_group_requests - (q->nr_group_requests / 8) + 1; > - if (nr > q->nr_group_requests) > - nr = q->nr_group_requests; > + nr_group_requests = get_group_requests(q, iog); > + > + nr = nr_group_requests - (nr_group_requests / 8) + 1; > + if (nr > nr_group_requests) > + nr = nr_group_requests; > iog->nr_congestion_on = nr; > > - nr = q->nr_group_requests - (q->nr_group_requests / 8) > - - (q->nr_group_requests / 16) - 1; > + nr = nr_group_requests - (nr_group_requests / 8) > + - (nr_group_requests / 16) - 1; > if (nr < 1) > nr = 1; > iog->nr_congestion_off = nr; > @@ -1283,6 +1315,7 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync) > { > struct io_group *iog; > int ret = 0; > + unsigned long nr_group_requests; > > rcu_read_lock(); > > @@ -1300,10 +1333,11 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync) > } > > ret = elv_is_iog_congested(q, iog, sync); > + nr_group_requests = get_group_requests(q, iog); > if (ret) > elv_log_iog(&q->elevator->efqd, iog, "iog congested=%d sync=%d" > " rl.count[sync]=%d nr_group_requests=%d", > - ret, sync, iog->rl.count[sync], q->nr_group_requests); > + ret, sync, iog->rl.count[sync], nr_group_requests); > rcu_read_unlock(); > return ret; > } > @@ -1549,6 +1583,48 @@ free_buf: > return ret; > } > > +static u64 io_cgroup_nr_requests_read(struct cgroup *cgroup, > + struct cftype *cftype) > +{ > + struct io_cgroup *iocg; > + u64 ret; > + > + if (!cgroup_lock_live_group(cgroup)) > + return -ENODEV; > + > + iocg = cgroup_to_io_cgroup(cgroup); > + spin_lock_irq(&iocg->lock); > + ret = iocg->nr_group_requests; > + spin_unlock_irq(&iocg->lock); > + > + cgroup_unlock(); > + > + return ret; > +} > + > +static int io_cgroup_nr_requests_write(struct cgroup *cgroup, > + struct cftype *cftype, > + u64 val) > +{ > + struct io_cgroup *iocg; > + > + if (val < BLKDEV_MIN_RQ) > + val = BLKDEV_MIN_RQ; > + > + if (!cgroup_lock_live_group(cgroup)) > + return -ENODEV; > + > + iocg = cgroup_to_io_cgroup(cgroup); > + > + spin_lock_irq(&iocg->lock); > + iocg->nr_group_requests = (unsigned long)val; > + spin_unlock_irq(&iocg->lock); > + > + cgroup_unlock(); > + > + return 0; > +} > + > #define SHOW_FUNCTION(__VAR) \ > static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \ > struct cftype *cftype) \ > @@ -1735,6 +1811,11 @@ static int io_cgroup_disk_dequeue_read(struct cgroup *cgroup, > > struct cftype bfqio_files[] = { > { > + .name = "nr_group_requests", > + .read_u64 = io_cgroup_nr_requests_read, > + .write_u64 = io_cgroup_nr_requests_write, > + }, > + { > .name = "policy", > .read_seq_string = io_cgroup_policy_read, > .write_string = io_cgroup_policy_write, > @@ -1790,6 +1871,7 @@ static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys, > > spin_lock_init(&iocg->lock); > 
INIT_HLIST_HEAD(&iocg->group_data); > + iocg->nr_group_requests = BLKDEV_MAX_GROUP_RQ; > iocg->weight = IO_DEFAULT_GRP_WEIGHT; > iocg->ioprio_class = IO_DEFAULT_GRP_CLASS; > INIT_LIST_HEAD(&iocg->policy_list); > diff --git a/block/elevator-fq.h b/block/elevator-fq.h > index f089a55..df077d0 100644 > --- a/block/elevator-fq.h > +++ b/block/elevator-fq.h > @@ -308,6 +308,7 @@ struct io_cgroup { > unsigned int weight; > unsigned short ioprio_class; > > + unsigned long nr_group_requests; > /* list of io_policy_node */ > struct list_head policy_list; > > @@ -386,6 +387,9 @@ struct elv_fq_data { > unsigned int fairness; > }; > > +extern unsigned short get_group_requests(struct request_queue *q, > + struct io_group *iog); > + > /* Logging facilities. */ > #ifdef CONFIG_DEBUG_GROUP_IOSCHED > #define elv_log_ioq(efqd, ioq, fmt, args...) \ > -- > 1.5.4.rc3 -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
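To make the per-cgroup per-device suggestion above concrete, here is a minimal hypothetical sketch, not part of the posted patch, of one shape such a limit could take. It follows the same list pattern that struct io_cgroup already uses for policy_list; the io_device_rule structure, the device_rules list head and iocg_device_requests() are invented names, and the sketch assumes the struct io_cgroup definition from block/elevator-fq.h extended with that list head. get_group_requests() could then pass the queue's device number down instead of reading the single cgroup-wide value.

----- sketch (hypothetical): per-cgroup per-device limit : from here -----

#include <linux/list.h>
#include <linux/types.h>

/*
 * One rule per (cgroup, device) pair.  A cgroup keeps these on a
 * hypothetical iocg->device_rules list, analogous to the existing
 * policy_list in struct io_cgroup.
 */
struct io_device_rule {
	dev_t			dev;			/* device this rule applies to */
	unsigned long		nr_group_requests;	/* per-group limit on that device */
	struct list_head	node;			/* linked into iocg->device_rules */
};

/*
 * Return the device-specific limit if one is configured, otherwise fall
 * back to the cgroup-wide value.  The caller is assumed to hold whatever
 * protects the list (iocg->lock or RCU).
 */
static unsigned long iocg_device_requests(struct io_cgroup *iocg, dev_t dev)
{
	struct io_device_rule *rule;

	list_for_each_entry(rule, &iocg->device_rules, node)
		if (rule->dev == dev)
			return rule->nr_group_requests;

	return iocg->nr_group_requests;
}

----- sketch (hypothetical): per-cgroup per-device limit : till here -----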
Vivek Goyal wrote, on 07/13/2009 12:03 PM: > On Fri, Jul 10, 2009 at 09:56:21AM +0800, Gui Jianfeng wrote: >> Hi Vivek, >> >> This patch exports a cgroup based per group request limits interface. >> and removes the global one. Now we can use this interface to perform >> different request allocation limitation for different groups. >> > > Thanks Gui. Few points come to mind. > > - You seem to be making this as per cgroup limit on all devices. I guess > that different devices in the system can have different settings of > q->nr_requests and hence will probably want different per group limit. > So we might have to make it per cgroup per device limit. From the viewpoint of implementation, there is a difficulty in my mind to implement per cgroup per device limit arising from that io_group is allocated when associated device is firstly used. I guess Gui chose per cgroup limit on all devices approach because of this, right? > - There does not seem to be any checks for making sure that children > cgroups don't have more request descriptors allocated than parent group. > > - I am re-thinking that what's the advantage of configuring request > descriptors also through cgroups. It does bring in additional complexity > with it and it should justfiy the advantages. Can you think of some? > > Until and unless we can come up with some significant advantages, I will > prefer to continue to use per group limit through q->nr_group_requests > interface instead of cgroup. Once things stablize, we can revisit it and > see how this interface can be improved. I agree. I will try to clarify if per group per device limitation is needed or not (or, if it has the advantage beyond the complexity) through some tests. Tnaks a lot, Muuhh > Thanks > Vivek > >> Signed-off-by: Gui Jianfeng<guijianfeng@cn.fujitsu.com> >> --- >> block/blk-core.c | 23 ++++++++++-- >> block/blk-settings.c | 1 - >> block/blk-sysfs.c | 43 ----------------------- >> block/elevator-fq.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++--- >> block/elevator-fq.h | 4 ++ >> 5 files changed, 111 insertions(+), 54 deletions(-) >> >> diff --git a/block/blk-core.c b/block/blk-core.c >> index 79fe6a9..7010b76 100644 >> --- a/block/blk-core.c >> +++ b/block/blk-core.c >> @@ -722,13 +722,20 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) >> static void __freed_request(struct request_queue *q, int sync, >> struct request_list *rl) >> { >> + struct io_group *iog; >> + unsigned long nr_group_requests; >> + >> if (q->rq_data.count[sync]< queue_congestion_off_threshold(q)) >> blk_clear_queue_congested(q, sync); >> >> if (q->rq_data.count[sync] + 1<= q->nr_requests) >> blk_clear_queue_full(q, sync); >> >> - if (rl->count[sync] + 1<= q->nr_group_requests) { >> + iog = rl_iog(rl); >> + >> + nr_group_requests = get_group_requests(q, iog); >> + >> + if (nr_group_requests&& rl->count[sync] + 1<= nr_group_requests) { >> if (waitqueue_active(&rl->wait[sync])) >> wake_up(&rl->wait[sync]); >> } >> @@ -828,6 +835,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, >> const bool is_sync = rw_is_sync(rw_flags) != 0; >> int may_queue, priv; >> int sleep_on_global = 0; >> + struct io_group *iog; >> + unsigned long nr_group_requests; >> >> may_queue = elv_may_queue(q, rw_flags); >> if (may_queue == ELV_MQUEUE_NO) >> @@ -843,7 +852,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags, >> if (q->rq_data.count[is_sync]+1>= q->nr_requests) >> blk_set_queue_full(q, is_sync); >> >> - if 
(rl->count[is_sync]+1>= q->nr_group_requests) { >> + iog = rl_iog(rl); >> + >> + nr_group_requests = get_group_requests(q, iog); >> + >> + if (nr_group_requests&& >> + rl->count[is_sync]+1>= nr_group_requests) { >> ioc = current_io_context(GFP_ATOMIC, q->node); >> /* >> * The queue request descriptor group will fill after this >> @@ -852,7 +866,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, >> * This process will be allowed to complete a batch of >> * requests, others will be blocked. >> */ >> - if (rl->count[is_sync]<= q->nr_group_requests) >> + if (rl->count[is_sync]<= nr_group_requests) >> ioc_set_batching(q, ioc); >> else { >> if (may_queue != ELV_MQUEUE_MUST >> @@ -898,7 +912,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, >> * from per group request list >> */ >> >> - if (rl->count[is_sync]>= (3 * q->nr_group_requests / 2)) >> + if (nr_group_requests&& >> + rl->count[is_sync]>= (3 * nr_group_requests / 2)) >> goto out; >> >> rl->starved[is_sync] = 0; >> diff --git a/block/blk-settings.c b/block/blk-settings.c >> index 78b8aec..bd582a7 100644 >> --- a/block/blk-settings.c >> +++ b/block/blk-settings.c >> @@ -148,7 +148,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) >> * set defaults >> */ >> q->nr_requests = BLKDEV_MAX_RQ; >> - q->nr_group_requests = BLKDEV_MAX_GROUP_RQ; >> >> q->make_request_fn = mfn; >> blk_queue_dma_alignment(q, 511); >> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c >> index 92b9f25..706d852 100644 >> --- a/block/blk-sysfs.c >> +++ b/block/blk-sysfs.c >> @@ -78,40 +78,8 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) >> return ret; >> } >> #ifdef CONFIG_GROUP_IOSCHED >> -static ssize_t queue_group_requests_show(struct request_queue *q, char *page) >> -{ >> - return queue_var_show(q->nr_group_requests, (page)); >> -} >> - >> extern void elv_io_group_congestion_threshold(struct request_queue *q, >> struct io_group *iog); >> - >> -static ssize_t >> -queue_group_requests_store(struct request_queue *q, const char *page, >> - size_t count) >> -{ >> - struct hlist_node *n; >> - struct io_group *iog; >> - struct elv_fq_data *efqd; >> - unsigned long nr; >> - int ret = queue_var_store(&nr, page, count); >> - >> - if (nr< BLKDEV_MIN_RQ) >> - nr = BLKDEV_MIN_RQ; >> - >> - spin_lock_irq(q->queue_lock); >> - >> - q->nr_group_requests = nr; >> - >> - efqd =&q->elevator->efqd; >> - >> - hlist_for_each_entry(iog, n,&efqd->group_list, elv_data_node) { >> - elv_io_group_congestion_threshold(q, iog); >> - } >> - >> - spin_unlock_irq(q->queue_lock); >> - return ret; >> -} >> #endif >> >> static ssize_t queue_ra_show(struct request_queue *q, char *page) >> @@ -278,14 +246,6 @@ static struct queue_sysfs_entry queue_requests_entry = { >> .store = queue_requests_store, >> }; >> >> -#ifdef CONFIG_GROUP_IOSCHED >> -static struct queue_sysfs_entry queue_group_requests_entry = { >> - .attr = {.name = "nr_group_requests", .mode = S_IRUGO | S_IWUSR }, >> - .show = queue_group_requests_show, >> - .store = queue_group_requests_store, >> -}; >> -#endif >> - >> static struct queue_sysfs_entry queue_ra_entry = { >> .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, >> .show = queue_ra_show, >> @@ -360,9 +320,6 @@ static struct queue_sysfs_entry queue_iostats_entry = { >> >> static struct attribute *default_attrs[] = { >> &queue_requests_entry.attr, >> -#ifdef CONFIG_GROUP_IOSCHED >> - &queue_group_requests_entry.attr, >> -#endif >> &queue_ra_entry.attr, >> 
&queue_max_hw_sectors_entry.attr, >> &queue_max_sectors_entry.attr, >> diff --git a/block/elevator-fq.c b/block/elevator-fq.c >> index 29392e7..bfb0210 100644 >> --- a/block/elevator-fq.c >> +++ b/block/elevator-fq.c >> @@ -59,6 +59,35 @@ elv_release_ioq(struct elevator_queue *eq, struct io_queue **ioq_ptr); >> #define for_each_entity_safe(entity, parent) \ >> for (; entity&& ({ parent = entity->parent; 1; }); entity = parent) >> >> +unsigned short get_group_requests(struct request_queue *q, >> + struct io_group *iog) >> +{ >> + struct cgroup_subsys_state *css; >> + struct io_cgroup *iocg; >> + unsigned long nr_group_requests; >> + >> + if (!iog) >> + return q->nr_requests; >> + >> + rcu_read_lock(); >> + >> + if (!iog->iocg_id) { >> + nr_group_requests = 0; >> + goto out; >> + } >> + >> + css = css_lookup(&io_subsys, iog->iocg_id); >> + if (!css) { >> + nr_group_requests = 0; >> + goto out; >> + } >> + >> + iocg = container_of(css, struct io_cgroup, css); >> + nr_group_requests = iocg->nr_group_requests; >> +out: >> + rcu_read_unlock(); >> + return nr_group_requests; >> +} >> >> static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd, >> int extract); >> @@ -1257,14 +1286,17 @@ void elv_io_group_congestion_threshold(struct request_queue *q, >> struct io_group *iog) >> { >> int nr; >> + unsigned long nr_group_requests; >> >> - nr = q->nr_group_requests - (q->nr_group_requests / 8) + 1; >> - if (nr> q->nr_group_requests) >> - nr = q->nr_group_requests; >> + nr_group_requests = get_group_requests(q, iog); >> + >> + nr = nr_group_requests - (nr_group_requests / 8) + 1; >> + if (nr> nr_group_requests) >> + nr = nr_group_requests; >> iog->nr_congestion_on = nr; >> >> - nr = q->nr_group_requests - (q->nr_group_requests / 8) >> - - (q->nr_group_requests / 16) - 1; >> + nr = nr_group_requests - (nr_group_requests / 8) >> + - (nr_group_requests / 16) - 1; >> if (nr< 1) >> nr = 1; >> iog->nr_congestion_off = nr; >> @@ -1283,6 +1315,7 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync) >> { >> struct io_group *iog; >> int ret = 0; >> + unsigned long nr_group_requests; >> >> rcu_read_lock(); >> >> @@ -1300,10 +1333,11 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync) >> } >> >> ret = elv_is_iog_congested(q, iog, sync); >> + nr_group_requests = get_group_requests(q, iog); >> if (ret) >> elv_log_iog(&q->elevator->efqd, iog, "iog congested=%d sync=%d" >> " rl.count[sync]=%d nr_group_requests=%d", >> - ret, sync, iog->rl.count[sync], q->nr_group_requests); >> + ret, sync, iog->rl.count[sync], nr_group_requests); >> rcu_read_unlock(); >> return ret; >> } >> @@ -1549,6 +1583,48 @@ free_buf: >> return ret; >> } >> >> +static u64 io_cgroup_nr_requests_read(struct cgroup *cgroup, >> + struct cftype *cftype) >> +{ >> + struct io_cgroup *iocg; >> + u64 ret; >> + >> + if (!cgroup_lock_live_group(cgroup)) >> + return -ENODEV; >> + >> + iocg = cgroup_to_io_cgroup(cgroup); >> + spin_lock_irq(&iocg->lock); >> + ret = iocg->nr_group_requests; >> + spin_unlock_irq(&iocg->lock); >> + >> + cgroup_unlock(); >> + >> + return ret; >> +} >> + >> +static int io_cgroup_nr_requests_write(struct cgroup *cgroup, >> + struct cftype *cftype, >> + u64 val) >> +{ >> + struct io_cgroup *iocg; >> + >> + if (val< BLKDEV_MIN_RQ) >> + val = BLKDEV_MIN_RQ; >> + >> + if (!cgroup_lock_live_group(cgroup)) >> + return -ENODEV; >> + >> + iocg = cgroup_to_io_cgroup(cgroup); >> + >> + spin_lock_irq(&iocg->lock); >> + iocg->nr_group_requests = (unsigned long)val; >> 
+ spin_unlock_irq(&iocg->lock); >> + >> + cgroup_unlock(); >> + >> + return 0; >> +} >> + >> #define SHOW_FUNCTION(__VAR) \ >> static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \ >> struct cftype *cftype) \ >> @@ -1735,6 +1811,11 @@ static int io_cgroup_disk_dequeue_read(struct cgroup *cgroup, >> >> struct cftype bfqio_files[] = { >> { >> + .name = "nr_group_requests", >> + .read_u64 = io_cgroup_nr_requests_read, >> + .write_u64 = io_cgroup_nr_requests_write, >> + }, >> + { >> .name = "policy", >> .read_seq_string = io_cgroup_policy_read, >> .write_string = io_cgroup_policy_write, >> @@ -1790,6 +1871,7 @@ static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys, >> >> spin_lock_init(&iocg->lock); >> INIT_HLIST_HEAD(&iocg->group_data); >> + iocg->nr_group_requests = BLKDEV_MAX_GROUP_RQ; >> iocg->weight = IO_DEFAULT_GRP_WEIGHT; >> iocg->ioprio_class = IO_DEFAULT_GRP_CLASS; >> INIT_LIST_HEAD(&iocg->policy_list); >> diff --git a/block/elevator-fq.h b/block/elevator-fq.h >> index f089a55..df077d0 100644 >> --- a/block/elevator-fq.h >> +++ b/block/elevator-fq.h >> @@ -308,6 +308,7 @@ struct io_cgroup { >> unsigned int weight; >> unsigned short ioprio_class; >> >> + unsigned long nr_group_requests; >> /* list of io_policy_node */ >> struct list_head policy_list; >> >> @@ -386,6 +387,9 @@ struct elv_fq_data { >> unsigned int fairness; >> }; >> >> +extern unsigned short get_group_requests(struct request_queue *q, >> + struct io_group *iog); >> + >> /* Logging facilities. */ >> #ifdef CONFIG_DEBUG_GROUP_IOSCHED >> #define elv_log_ioq(efqd, ioq, fmt, args...) \ >> -- >> 1.5.4.rc3 >
Vivek Goyal wrote:
> On Fri, Jul 10, 2009 at 09:56:21AM +0800, Gui Jianfeng wrote:
>> Hi Vivek,
>>
>> This patch exports a cgroup based per group request limits interface.
>> and removes the global one. Now we can use this interface to perform
>> different request allocation limitation for different groups.
>>
>
> Thanks Gui. Few points come to mind.
>
> - You seem to be making this as per cgroup limit on all devices. I guess
> that different devices in the system can have different settings of
> q->nr_requests and hence will probably want different per group limit.
> So we might have to make it per cgroup per device limit.

Yes, per cgroup per device limitation seems more reasonable. I'll see what
I can do.

> - There does not seem to be any checks for making sure that children
> cgroups don't have more request descriptors allocated than parent group.

Do we really need to make it hierarchical? IMHO, maintaining this limitation
for cgroups independently is enough.

> - I am re-thinking that what's the advantage of configuring request
> descriptors also through cgroups. It does bring in additional complexity
> with it and it should justfiy the advantages. Can you think of some?

I'll try, but at least this feature lets us do more accurate limitation. :)

> Until and unless we can come up with some significant advantages, I will
> prefer to continue to use per group limit through q->nr_group_requests
> interface instead of cgroup. Once things stablize, we can revisit it and
> see how this interface can be improved.

I agree.
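On the hierarchy question above, full hierarchical accounting is not the only option; a write-time clamp already keeps a child from being configured above its parent. The following is a hypothetical variant, not part of any posted patch, of io_cgroup_nr_requests_write() from the patch. It reuses only helpers the patch already relies on (cgroup_lock_live_group(), cgroup_to_io_cgroup()) plus the cgroup->parent pointer; it does not account for descriptors that are already allocated, nor for a later reduction of the parent's limit.

----- sketch (hypothetical): clamping a child to its parent's limit : from here -----

static int io_cgroup_nr_requests_write(struct cgroup *cgroup,
				       struct cftype *cftype, u64 val)
{
	struct io_cgroup *iocg, *parent_iocg;

	if (val < BLKDEV_MIN_RQ)
		val = BLKDEV_MIN_RQ;

	if (!cgroup_lock_live_group(cgroup))
		return -ENODEV;

	/*
	 * Write-time check only: never let a child's limit exceed the
	 * value currently configured in its parent.
	 */
	if (cgroup->parent) {
		parent_iocg = cgroup_to_io_cgroup(cgroup->parent);
		if (val > parent_iocg->nr_group_requests)
			val = parent_iocg->nr_group_requests;
	}

	iocg = cgroup_to_io_cgroup(cgroup);
	spin_lock_irq(&iocg->lock);
	iocg->nr_group_requests = (unsigned long)val;
	spin_unlock_irq(&iocg->lock);

	cgroup_unlock();

	return 0;
}

----- sketch (hypothetical): clamping a child to its parent's limit : till here -----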
Munehiro Ikeda wrote:
> Vivek Goyal wrote, on 07/13/2009 12:03 PM:
>> On Fri, Jul 10, 2009 at 09:56:21AM +0800, Gui Jianfeng wrote:
>>> Hi Vivek,
>>>
>>> This patch exports a cgroup based per group request limits interface.
>>> and removes the global one. Now we can use this interface to perform
>>> different request allocation limitation for different groups.
>>>
>>
>> Thanks Gui. Few points come to mind.
>>
>> - You seem to be making this as per cgroup limit on all devices. I guess
>> that different devices in the system can have different settings of
>> q->nr_requests and hence will probably want different per group limit.
>> So we might have to make it per cgroup per device limit.
>
> From the viewpoint of implementation, there is a difficulty in my mind to
> implement per cgroup per device limit arising from that io_group is
> allocated when associated device is firstly used. I guess Gui chose per
> cgroup limit on all devices approach because of this, right?

Yes, I chose this solution from the simplicity point of view; the code will
get complicated if we choose per cgroup per device limits. But it seems per
cgroup per device limits are more reasonable.

>> - There does not seem to be any checks for making sure that children
>> cgroups don't have more request descriptors allocated than parent
>> group.
>>
>> - I am re-thinking that what's the advantage of configuring request
>> descriptors also through cgroups. It does bring in additional complexity
>> with it and it should justfiy the advantages. Can you think of some?
>>
>> Until and unless we can come up with some significant advantages, I will
>> prefer to continue to use per group limit through q->nr_group_requests
>> interface instead of cgroup. Once things stablize, we can revisit it and
>> see how this interface can be improved.
>
> I agree. I will try to clarify if per group per device limitation is
> needed or not (or, if it has the advantage beyond the complexity) through
> some tests.

Great, hope to hear from you soon.
Gui Jianfeng wrote, on 07/14/2009 03:45 AM: > Munehiro Ikeda wrote: >> Vivek Goyal wrote, on 07/13/2009 12:03 PM: >>> On Fri, Jul 10, 2009 at 09:56:21AM +0800, Gui Jianfeng wrote: >>>> Hi Vivek, >>>> >>>> This patch exports a cgroup based per group request limits interface. >>>> and removes the global one. Now we can use this interface to perform >>>> different request allocation limitation for different groups. >>>> >>> Thanks Gui. Few points come to mind. >>> >>> - You seem to be making this as per cgroup limit on all devices. I guess >>> that different devices in the system can have different settings of >>> q->nr_requests and hence will probably want different per group limit. >>> So we might have to make it per cgroup per device limit. >> From the viewpoint of implementation, there is a difficulty in my mind to >> implement per cgroup per device limit arising from that io_group is >> allocated >> when associated device is firstly used. I guess Gui chose per cgroup limit >> on all devices approach because of this, right? > > Yes, I choose this solution from the simplicity point of view, the code will > get complicated if choosing per cgroup per device limit. But it seems per > cgroup per device limits is more reasonable. > >> >>> - There does not seem to be any checks for making sure that children >>> cgroups don't have more request descriptors allocated than parent >>> group. >>> >>> - I am re-thinking that what's the advantage of configuring request >>> descriptors also through cgroups. It does bring in additional >>> complexity >>> with it and it should justfiy the advantages. Can you think of some? >>> >>> Until and unless we can come up with some significant advantages, I >>> will >>> prefer to continue to use per group limit through q->nr_group_requests >>> interface instead of cgroup. Once things stablize, we can revisit >>> it and >>> see how this interface can be improved. >> I agree. I will try to clarify if per group per device limitation is >> needed >> or not (or, if it has the advantage beyond the complexity) through some >> tests. > > Great, hope to hear you soon. Sorry for so late. I tried it, and write the result and my opinion below... Scenario ==================== The possible scenario where per-cgroup nr_requests limitation is beneficial in my mind is that: - Process P1 in cgroup "g1" is running with submitting many requests to a device. The number of the requests in the device queue is almost nr_requests for the device. - After a while, process P2 in cgroup "g2" starts running. P2 also tries to submit requests as many as P1. - Assuming that user wants P2 to grab bandwidth as soon as possible and keep it certain level. In this scenario, I predicted the bandwidth behavior of P2 along with tuning global nr_group_requests like below. - If having nr_group_requests almost same as nr_requests, P1 can allocate requests up to nr_requests and there is no room for P2 at the beginning of its running. As a result of it, P2 has to wait for a while till P1's requests are completed and rising of bandwidth is delayed. - If having nr_group_requests fewer to restrict requests from P1 and make room for P2, the bandwidth of P2 may be lower than the case that P1 can allocate more requests. If the prediction is correct and per-cgroup nr_requests limitation can make the situation better, per-cgroup nr_requests is supposed to be beneficial. 
Verification Conditions ======================== - Kernel: 2.6.31-rc1 + Patches from Vivek on Jul 2, 2009 (IO scheduler based IO controller V6) https://lists.linux-foundation.org/pipermail/containers/2009-July/018948.html + Patches from Gui Jianfeng on Jul 7, 2009 (Bug fixes) https://lists.linux-foundation.org/pipermail/containers/2009-July/019086.html https://lists.linux-foundation.org/pipermail/containers/2009-July/019087.html + Patch from Gui Jianfeng on Jul 9, 2009 (per-cgroup requests limit) https://lists.linux-foundation.org/pipermail/containers/2009-July/019123.html + Patch from me on Jul 16, 2009 (Bug fix) https://lists.linux-foundation.org/pipermail/containers/2009-July/019286.html + 2 local bug fix patches (Not posted yet, I'm posting them in following mails) - All results are measured under nr_requests=500. - Used fio to make I/O. Job file is like below. Used libaio and direct-I/O and tuned iodepth to make rl->count[1] approx 500 always. ----- fio job file : from here ----- [global] size=128m directory=/mnt/b1 runtime=30 time_based write_bw_log bwavgtime=200 rw=randread direct=1 ioengine=libaio iodepth=500 [g1] exec_prerun=./pre.sh /mnt/cgroups/g1 exec_postrun=./log.sh /mnt/cgroups/g1 sdb "_post" [g2] startdelay=10 exec_prerun=./pre.sh /mnt/cgroups/g2 exec_postrun=./log.sh /mnt/cgroups/g2 sdb "_post" ----- fio job file : till here ----- Note: pre.sh and log.sh used in exec_{pre|post}run are to assign processes to expected cgroups and record the conditions. Let me omit the detail of them because they are not fundamental part of this verification. Results ==================== Bandwidth of g2 (=P2) was measured under some conditions. Conditions and bandwidth logs are shown below. Bandwidth logs are shown only the beginning part (from starting of P2 to 3000[ms] after aprox.) because the full logs are so long. Average bandwidth from the beginning of log to ~10[sec] is also calculated. Note1: fio seems to log bandwidth only when actual data transfer occurs (correct me if it's not true). This means that there is no line with BW=0. In there is no data transfer, the time-stamp are simply skipped to record. Note2: Graph picture of the bandwidth logs is attached. Result(1): orange Result(2): green Result(3): brown Result(4): black ---------- Result (1) ---------- * Both of g1 and g2 have nr_group_requests=500 < Conditions > nr_requests = 500 g1/ io.nr_group_requests = 500 io.weight = 500 io.ioprio_class = 2 g2/ io.nr_group_requests = 500 io.weight = 500 io.ioprio_class = 2 < Bandwidth log of g2 > t [ms] bw[KiB/s] 969 4 1170 1126 1374 1084 1576 876 1776 901 1980 1069 2191 1087 2400 1117 2612 1087 2822 1136 ... < Average bandwidth > 1063 [KiB/s] (969~9979[ms]) ---------- Result (2) ---------- * Both of g1 and g2 have nr_group_requests=100 < Conditions > nr_requests = 500 g1/ io.nr_group_requests = 100 io.weight = 500 io.ioprio_class = 2 g2/ io.nr_group_requests = 100 io.weight = 500 io.ioprio_class = 2 < Bandwidth log of g2 > t [ms] bw[KiB/s] 1498 2 1733 892 2096 722 2311 1224 2534 1180 2753 1197 2988 1137 ... < Average bandwidth > 998 [KiB/s] (1498~9898[ms]) ---------- Result (3) ---------- * To set different nr_group_requests on g1 and g2 < Conditions > nr_requests = 500 g1/ io.nr_group_requests = 100 io.weight = 500 io.ioprio_class = 2 g2/ io.nr_group_requests = 500 io.weight = 500 io.ioprio_class = 2 < Bandwidth log of g2 > t [ms] bw[KiB/s] 244 839 451 1133 659 964 877 1038 1088 1125 1294 979 1501 1068 1708 934 1916 1048 2117 1126 2328 1111 2533 1118 2758 1206 2969 990 ... 
< Average bandwidth > 1048 [KiB/s] (244~9906[ms]) ---------- Result (4) ---------- * To make g2/io.ioprio_class as RT < Conditions > nr_requests = 500 g1/ io.nr_group_requests = 500 io.weight = 500 io.ioprio_class = 2 g2/ io.nr_group_requests = 500 io.weight = 500 io.ioprio_class = 1 < Bandwidth log of g2 > t [ms] bw[KiB/s] 476 8 677 2211 878 2221 1080 2486 1281 2241 1481 2109 1681 2334 1882 2129 2082 2211 2283 1915 2489 1778 2690 1915 2891 1997 ... < Average bandwidth > 2132[KiB/s] (476~9954[ms]) Consideration and Conclusion ============================= From result(1), it is observed that it takes 1000~1200[ms] to rise P2 bandwidth. In result(2), where both of g1 and g2 have nr_group_requests=100, the delay gets longer as 1800~2000[ms]. In addition to it, the average bandwidth becomes ~5% lower than result(1). This is supposed that P2 couldn't allocate enough requests. Then, result(3) shows that bandwidth of P2 can rise quickly (~300[ms]) if nr_group_requests can be set per-cgroup. Result(4) shows that the delay can be shortened by setting g2 as RT class, however, the delay is still longer than result(3). I think it is confirmed that "per-cgroup nr_requests limitation is useful in a certain situation". Beyond that, the discussion topic is the benefit pointed out above is eligible for the complication of the implementation. IMHO, I don't think the implementation of per-cgroup request limitation is too complicated to accept. On the other hand I guess it suddenly gets complicated if we try to implement further more, especially hierarchical support. It is also true that I have a feeling that implementation without per-device limitation and hierarchical support is like "unfinished work". So, my opinion so far is that, per-cgroup nr_requests limitation should be merged only if hierarchical support is concluded "unnecessary" for it. If merging it tempts hierarchical support, it shouldn't be. How about your opinion, all? My considerations or verification method might be wrong. Please correct them if any. And if you have any other idea of scenario to verify the effect of per-cgroup nr_requests limitation, please let me know. I'll try it.
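As a reference for the settings used above, the congestion thresholds that elv_io_group_congestion_threshold() in the patch derives from a group's limit come out as follows for the two values tested (100 and 500). This is plain user-space arithmetic that simply mirrors the kernel code; it is shown only to make the effect of lowering nr_group_requests easier to see.

----- reference: congestion thresholds for the tested limits : from here -----

#include <stdio.h>

/* Same arithmetic as elv_io_group_congestion_threshold() in the patch. */
static void thresholds(long nr_group_requests)
{
	long on, off;

	on = nr_group_requests - nr_group_requests / 8 + 1;
	if (on > nr_group_requests)
		on = nr_group_requests;

	off = nr_group_requests - nr_group_requests / 8
		- nr_group_requests / 16 - 1;
	if (off < 1)
		off = 1;

	printf("nr_group_requests=%ld: congestion_on=%ld congestion_off=%ld\n",
	       nr_group_requests, on, off);
}

int main(void)
{
	thresholds(100);	/* prints congestion_on=89,  congestion_off=81  */
	thresholds(500);	/* prints congestion_on=439, congestion_off=406 */
	return 0;
}

----- reference: congestion thresholds for the tested limits : till here -----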
Munehiro Ikeda wrote:
...
> Consideration and Conclusion
> =============================
>
> From result(1), it is observed that it takes 1000~1200[ms] to rise P2
> bandwidth. In result(2), where both of g1 and g2 have
> nr_group_requests=100, the delay gets longer as 1800~2000[ms]. In
> addition to it, the average bandwidth becomes ~5% lower than result(1).
> This is supposed that P2 couldn't allocate enough requests.
> Then, result(3) shows that bandwidth of P2 can rise quickly (~300[ms])
> if nr_group_requests can be set per-cgroup. Result(4) shows that the
> delay can be shortened by setting g2 as RT class, however, the delay is
> still longer than result(3).
>
> I think it is confirmed that "per-cgroup nr_requests limitation is
> useful in a certain situation". Beyond that, the discussion topic is
> the benefit pointed out above is eligible for the complication of the
> implementation. IMHO, I don't think the implementation of per-cgroup
> request limitation is too complicated to accept. On the other hand I
> guess it suddenly gets complicated if we try to implement further more,
> especially hierarchical support. It is also true that I have a feeling
> that implementation without per-device limitation and hierarchical
> support is like "unfinished work".
>
> So, my opinion so far is that, per-cgroup nr_requests limitation should
> be merged only if hierarchical support is concluded "unnecessary" for
> it. If merging it tempts hierarchical support, it shouldn't be.
> How about your opinion, all?

Hi Munehiro-san,

Thanks for the great work. It seems per-cgroup request allocation limits
have value in some cases. IMHO, for the time being, we can just drop
hierarchical support for per-cgroup request allocation limits and see
whether it works well.

> My considerations or verification method might be wrong. Please correct
> them if any. And if you have any other idea of scenario to verify the
> effect of per-cgroup nr_requests limitation, please let me know. I'll
> try it.
On Mon, Aug 03, 2009 at 10:00:42PM -0400, Munehiro Ikeda wrote: > Gui Jianfeng wrote, on 07/14/2009 03:45 AM: >> Munehiro Ikeda wrote: >>> Vivek Goyal wrote, on 07/13/2009 12:03 PM: >>>> On Fri, Jul 10, 2009 at 09:56:21AM +0800, Gui Jianfeng wrote: >>>>> Hi Vivek, >>>>> >>>>> This patch exports a cgroup based per group request limits interface. >>>>> and removes the global one. Now we can use this interface to perform >>>>> different request allocation limitation for different groups. >>>>> >>>> Thanks Gui. Few points come to mind. >>>> >>>> - You seem to be making this as per cgroup limit on all devices. I guess >>>> that different devices in the system can have different settings of >>>> q->nr_requests and hence will probably want different per group limit. >>>> So we might have to make it per cgroup per device limit. >>> From the viewpoint of implementation, there is a difficulty in my mind to >>> implement per cgroup per device limit arising from that io_group is >>> allocated >>> when associated device is firstly used. I guess Gui chose per cgroup limit >>> on all devices approach because of this, right? >> >> Yes, I choose this solution from the simplicity point of view, the code will >> get complicated if choosing per cgroup per device limit. But it seems per >> cgroup per device limits is more reasonable. >> >>> >>>> - There does not seem to be any checks for making sure that children >>>> cgroups don't have more request descriptors allocated than parent >>>> group. >>>> >>>> - I am re-thinking that what's the advantage of configuring request >>>> descriptors also through cgroups. It does bring in additional >>>> complexity >>>> with it and it should justfiy the advantages. Can you think of some? >>>> >>>> Until and unless we can come up with some significant advantages, I >>>> will >>>> prefer to continue to use per group limit through q->nr_group_requests >>>> interface instead of cgroup. Once things stablize, we can revisit >>>> it and >>>> see how this interface can be improved. >>> I agree. I will try to clarify if per group per device limitation is >>> needed >>> or not (or, if it has the advantage beyond the complexity) through some >>> tests. >> >> Great, hope to hear you soon. > > Sorry for so late. I tried it, and write the result and my opinion > below... > > Hi Ikeda, Nice analysis. Few questions/comments inline... > Scenario > ==================== > > The possible scenario where per-cgroup nr_requests limitation is > beneficial in my mind is that: > > - Process P1 in cgroup "g1" is running with submitting many requests > to a device. The number of the requests in the device queue is > almost nr_requests for the device. > > - After a while, process P2 in cgroup "g2" starts running. P2 also > tries to submit requests as many as P1. > > - Assuming that user wants P2 to grab bandwidth as soon as possible > and keep it certain level. > > In this scenario, I predicted the bandwidth behavior of P2 along with > tuning global nr_group_requests like below. > > - If having nr_group_requests almost same as nr_requests, P1 can > allocate requests up to nr_requests and there is no room for P2 at > the beginning of its running. As a result of it, P2 has to wait > for a while till P1's requests are completed and rising of > bandwidth is delayed. > > - If having nr_group_requests fewer to restrict requests from P1 and > make room for P2, the bandwidth of P2 may be lower than the case > that P1 can allocate more requests. 
> > If the prediction is correct and per-cgroup nr_requests limitation can > make the situation better, per-cgroup nr_requests is supposed to be > beneficial. > > > Verification Conditions > ======================== > > - Kernel: > 2.6.31-rc1 > + Patches from Vivek on Jul 2, 2009 > (IO scheduler based IO controller V6) > > https://lists.linux-foundation.org/pipermail/containers/2009-July/018948.html > + Patches from Gui Jianfeng on Jul 7, 2009 (Bug fixes) > > https://lists.linux-foundation.org/pipermail/containers/2009-July/019086.html > > https://lists.linux-foundation.org/pipermail/containers/2009-July/019087.html > + Patch from Gui Jianfeng on Jul 9, 2009 (per-cgroup requests limit) > > https://lists.linux-foundation.org/pipermail/containers/2009-July/019123.html > + Patch from me on Jul 16, 2009 (Bug fix) > > https://lists.linux-foundation.org/pipermail/containers/2009-July/019286.html > + 2 local bug fix patches > (Not posted yet, I'm posting them in following mails) > > - All results are measured under nr_requests=500. > > - Used fio to make I/O. Job file is like below. Used libaio and > direct-I/O and tuned iodepth to make rl->count[1] approx 500 always. > > ----- fio job file : from here ----- > > [global] > size=128m > directory=/mnt/b1 > > runtime=30 > time_based > > write_bw_log > bwavgtime=200 > > rw=randread > direct=1 > ioengine=libaio > iodepth=500 > > [g1] > exec_prerun=./pre.sh /mnt/cgroups/g1 > exec_postrun=./log.sh /mnt/cgroups/g1 sdb "_post" > > [g2] > startdelay=10 > exec_prerun=./pre.sh /mnt/cgroups/g2 > exec_postrun=./log.sh /mnt/cgroups/g2 sdb "_post" > > ----- fio job file : till here ----- > > Note: > pre.sh and log.sh used in exec_{pre|post}run are to assign processes to > expected cgroups and record the conditions. Let me omit the detail of > them because they are not fundamental part of this verification. > > > Results > ==================== > > Bandwidth of g2 (=P2) was measured under some conditions. Conditions > and bandwidth logs are shown below. > Bandwidth logs are shown only the beginning part (from starting of P2 to > 3000[ms] after aprox.) because the full logs are so long. Average > bandwidth from the beginning of log to ~10[sec] is also calculated. > > Note1: > fio seems to log bandwidth only when actual data transfer occurs > (correct me if it's not true). This means that there is no line with > BW=0. In there is no data transfer, the time-stamp are simply skipped > to record. > > Note2: > Graph picture of the bandwidth logs is attached. > Result(1): orange > Result(2): green > Result(3): brown > Result(4): black > > > ---------- Result (1) ---------- > > * Both of g1 and g2 have nr_group_requests=500 > > < Conditions > > nr_requests = 500 > g1/ > io.nr_group_requests = 500 > io.weight = 500 > io.ioprio_class = 2 > g2/ > io.nr_group_requests = 500 > io.weight = 500 > io.ioprio_class = 2 > > < Bandwidth log of g2 > > t [ms] bw[KiB/s] > 969 4 > 1170 1126 > 1374 1084 > 1576 876 > 1776 901 > 1980 1069 > 2191 1087 > 2400 1117 > 2612 1087 > 2822 1136 > ... > > < Average bandwidth > > 1063 [KiB/s] > (969~9979[ms]) > > > ---------- Result (2) ---------- > > * Both of g1 and g2 have nr_group_requests=100 > > < Conditions > > nr_requests = 500 > g1/ > io.nr_group_requests = 100 > io.weight = 500 > io.ioprio_class = 2 > g2/ > io.nr_group_requests = 100 > io.weight = 500 > io.ioprio_class = 2 > > < Bandwidth log of g2 > > t [ms] bw[KiB/s] > 1498 2 > 1733 892 > 2096 722 > 2311 1224 > 2534 1180 > 2753 1197 > 2988 1137 > ... 
> > < Average bandwidth > > 998 [KiB/s] > (1498~9898[ms]) > > > ---------- Result (3) ---------- > > * To set different nr_group_requests on g1 and g2 > > < Conditions > > nr_requests = 500 > g1/ > io.nr_group_requests = 100 > io.weight = 500 > io.ioprio_class = 2 > g2/ > io.nr_group_requests = 500 > io.weight = 500 > io.ioprio_class = 2 > > < Bandwidth log of g2 > > t [ms] bw[KiB/s] > 244 839 > 451 1133 > 659 964 > 877 1038 > 1088 1125 > 1294 979 > 1501 1068 > 1708 934 > 1916 1048 > 2117 1126 > 2328 1111 > 2533 1118 > 2758 1206 > 2969 990 > ... > > < Average bandwidth > > 1048 [KiB/s] > (244~9906[ms]) > > > ---------- Result (4) ---------- > > * To make g2/io.ioprio_class as RT > > < Conditions > > nr_requests = 500 > g1/ > io.nr_group_requests = 500 > io.weight = 500 > io.ioprio_class = 2 > g2/ > io.nr_group_requests = 500 > io.weight = 500 > io.ioprio_class = 1 > > < Bandwidth log of g2 > > t [ms] bw[KiB/s] > 476 8 > 677 2211 > 878 2221 > 1080 2486 > 1281 2241 > 1481 2109 > 1681 2334 > 1882 2129 > 2082 2211 > 2283 1915 > 2489 1778 > 2690 1915 > 2891 1997 > ... > > < Average bandwidth > > 2132[KiB/s] > (476~9954[ms]) > > > Consideration and Conclusion > ============================= > > From result(1), it is observed that it takes 1000~1200[ms] to rise P2 > bandwidth. In result(2), where both of g1 and g2 have > nr_group_requests=100, the delay gets longer as 1800~2000[ms]. Result (2) is surprising in terms of delay. Queue limit is 500 and per group limit is 100. That means that group g2 should have got request descriptor allocated as soon as it sent a request to device. That should also mean that it should start getting serviced soon and BW available should rise soon. I am not sure why reverse is happening. Sounds like a bug somewhere.. > In > addition to it, the average bandwidth becomes ~5% lower than result(1). > This is supposed that P2 couldn't allocate enough requests. > Then, result(3) shows that bandwidth of P2 can rise quickly (~300[ms]) > if nr_group_requests can be set per-cgroup. Result(4) shows that the > delay can be shortened by setting g2 as RT class, however, the delay is > still longer than result(3). > > I think it is confirmed that "per-cgroup nr_requests limitation is > useful in a certain situation". I am still not clear how per cgroup request descriptor limit is useful. Having higher number of descriptor reserved for a cgroup probably should help a bit on queuing hardware where more requests from same queue can be dispatched at one go. May be it can also help a bit in more merging in certain scenarios and gives higher throughput. I think key question here is how relevant the number of request descriptors is when it comes to fairness for the group. I have not experimented but probably a few request descriptors (25-30) probably should be enough to ensure group gets fair amount of disk. That's a different thing that throughput of group might suffer and bandwith allocation among task with-in group will also have adverse effect. > Beyond that, the discussion topic is > the benefit pointed out above is eligible for the complication of the > implementation. IMHO, I don't think the implementation of per-cgroup > request limitation is too complicated to accept. On the other hand I > guess it suddenly gets complicated if we try to implement further more, > especially hierarchical support. It is also true that I have a feeling > that implementation without per-device limitation and hierarchical > support is like "unfinished work". 
> IMHO, configuring request descriptors per cgroup can be a future TODO item depending on how useful people consider it. For the time being, it would be better to keep things simple and code small. In fact currently I am trying to replace BFQ with a CFS style scheduler in io controller patches to reduce the code size as well as its complexity. Thanks Vivek > So, my opinion so far is that, per-cgroup nr_requests limitation should > be merged only if hierarchical support is concluded "unnecessary" for it. > If merging it tempts hierarchical support, it shouldn't be. > How about your opinion, all? > > My considerations or verification method might be wrong. Please correct > them if any. And if you have any other idea of scenario to verify the > effect of per-cgroup nr_requests limitation, please let me know. I'll > try it. > > > > -- > IKEDA, Munehiro > NEC Corporation of America > m-ikeda@ds.jp.nec.com > > -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
different request allocation limitation for different groups. Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com> --- block/blk-core.c | 23 ++++++++++-- block/blk-settings.c | 1 - block/blk-sysfs.c | 43 ----------------------- block/elevator-fq.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++--- block/elevator-fq.h | 4 ++ 5 files changed, 111 insertions(+), 54 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 79fe6a9..7010b76 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -722,13 +722,20 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) static void __freed_request(struct request_queue *q, int sync, struct request_list *rl) { + struct io_group *iog; + unsigned long nr_group_requests; + if (q->rq_data.count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); if (q->rq_data.count[sync] + 1 <= q->nr_requests) blk_clear_queue_full(q, sync); - if (rl->count[sync] + 1 <= q->nr_group_requests) { + iog = rl_iog(rl); + + nr_group_requests = get_group_requests(q, iog); + + if (nr_group_requests && rl->count[sync] + 1 <= nr_group_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); } @@ -828,6 +835,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue, priv; int sleep_on_global = 0; + struct io_group *iog; + unsigned long nr_group_requests; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -843,7 +852,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags, if (q->rq_data.count[is_sync]+1 >= q->nr_requests) blk_set_queue_full(q, is_sync); - if (rl->count[is_sync]+1 >= q->nr_group_requests) { + iog = rl_iog(rl); + + nr_group_requests = get_group_requests(q, iog); + + if (nr_group_requests && + rl->count[is_sync]+1 >= nr_group_requests) { ioc = current_io_context(GFP_ATOMIC, q->node); /* * The queue request descriptor group will fill after this @@ -852,7 +866,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * This process will be allowed to complete a batch of * requests, others will be blocked. 
*/ - if (rl->count[is_sync] <= q->nr_group_requests) + if (rl->count[is_sync] <= nr_group_requests) ioc_set_batching(q, ioc); else { if (may_queue != ELV_MQUEUE_MUST @@ -898,7 +912,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * from per group request list */ - if (rl->count[is_sync] >= (3 * q->nr_group_requests / 2)) + if (nr_group_requests && + rl->count[is_sync] >= (3 * nr_group_requests / 2)) goto out; rl->starved[is_sync] = 0; diff --git a/block/blk-settings.c b/block/blk-settings.c index 78b8aec..bd582a7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -148,7 +148,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) * set defaults */ q->nr_requests = BLKDEV_MAX_RQ; - q->nr_group_requests = BLKDEV_MAX_GROUP_RQ; q->make_request_fn = mfn; blk_queue_dma_alignment(q, 511); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 92b9f25..706d852 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -78,40 +78,8 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) return ret; } #ifdef CONFIG_GROUP_IOSCHED -static ssize_t queue_group_requests_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->nr_group_requests, (page)); -} - extern void elv_io_group_congestion_threshold(struct request_queue *q, struct io_group *iog); - -static ssize_t -queue_group_requests_store(struct request_queue *q, const char *page, - size_t count) -{ - struct hlist_node *n; - struct io_group *iog; - struct elv_fq_data *efqd; - unsigned long nr; - int ret = queue_var_store(&nr, page, count); - - if (nr < BLKDEV_MIN_RQ) - nr = BLKDEV_MIN_RQ; - - spin_lock_irq(q->queue_lock); - - q->nr_group_requests = nr; - - efqd = &q->elevator->efqd; - - hlist_for_each_entry(iog, n, &efqd->group_list, elv_data_node) { - elv_io_group_congestion_threshold(q, iog); - } - - spin_unlock_irq(q->queue_lock); - return ret; -} #endif static ssize_t queue_ra_show(struct request_queue *q, char *page) @@ -278,14 +246,6 @@ static struct queue_sysfs_entry queue_requests_entry = { .store = queue_requests_store, }; -#ifdef CONFIG_GROUP_IOSCHED -static struct queue_sysfs_entry queue_group_requests_entry = { - .attr = {.name = "nr_group_requests", .mode = S_IRUGO | S_IWUSR }, - .show = queue_group_requests_show, - .store = queue_group_requests_store, -}; -#endif - static struct queue_sysfs_entry queue_ra_entry = { .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, .show = queue_ra_show, @@ -360,9 +320,6 @@ static struct queue_sysfs_entry queue_iostats_entry = { static struct attribute *default_attrs[] = { &queue_requests_entry.attr, -#ifdef CONFIG_GROUP_IOSCHED - &queue_group_requests_entry.attr, -#endif &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, diff --git a/block/elevator-fq.c b/block/elevator-fq.c index 29392e7..bfb0210 100644 --- a/block/elevator-fq.c +++ b/block/elevator-fq.c @@ -59,6 +59,35 @@ elv_release_ioq(struct elevator_queue *eq, struct io_queue **ioq_ptr); #define for_each_entity_safe(entity, parent) \ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) +unsigned short get_group_requests(struct request_queue *q, + struct io_group *iog) +{ + struct cgroup_subsys_state *css; + struct io_cgroup *iocg; + unsigned long nr_group_requests; + + if (!iog) + return q->nr_requests; + + rcu_read_lock(); + + if (!iog->iocg_id) { + nr_group_requests = 0; + goto out; + } + + css = css_lookup(&io_subsys, iog->iocg_id); + if (!css) { + nr_group_requests = 0; 
+ goto out; + } + + iocg = container_of(css, struct io_cgroup, css); + nr_group_requests = iocg->nr_group_requests; +out: + rcu_read_unlock(); + return nr_group_requests; +} static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd, int extract); @@ -1257,14 +1286,17 @@ void elv_io_group_congestion_threshold(struct request_queue *q, struct io_group *iog) { int nr; + unsigned long nr_group_requests; - nr = q->nr_group_requests - (q->nr_group_requests / 8) + 1; - if (nr > q->nr_group_requests) - nr = q->nr_group_requests; + nr_group_requests = get_group_requests(q, iog); + + nr = nr_group_requests - (nr_group_requests / 8) + 1; + if (nr > nr_group_requests) + nr = nr_group_requests; iog->nr_congestion_on = nr; - nr = q->nr_group_requests - (q->nr_group_requests / 8) - - (q->nr_group_requests / 16) - 1; + nr = nr_group_requests - (nr_group_requests / 8) + - (nr_group_requests / 16) - 1; if (nr < 1) nr = 1; iog->nr_congestion_off = nr; @@ -1283,6 +1315,7 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync) { struct io_group *iog; int ret = 0; + unsigned long nr_group_requests; rcu_read_lock(); @@ -1300,10 +1333,11 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync) } ret = elv_is_iog_congested(q, iog, sync); + nr_group_requests = get_group_requests(q, iog); if (ret) elv_log_iog(&q->elevator->efqd, iog, "iog congested=%d sync=%d" " rl.count[sync]=%d nr_group_requests=%d", - ret, sync, iog->rl.count[sync], q->nr_group_requests); + ret, sync, iog->rl.count[sync], nr_group_requests); rcu_read_unlock(); return ret; } @@ -1549,6 +1583,48 @@ free_buf: return ret; } +static u64 io_cgroup_nr_requests_read(struct cgroup *cgroup, + struct cftype *cftype) +{ + struct io_cgroup *iocg; + u64 ret; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + spin_lock_irq(&iocg->lock); + ret = iocg->nr_group_requests; + spin_unlock_irq(&iocg->lock); + + cgroup_unlock(); + + return ret; +} + +static int io_cgroup_nr_requests_write(struct cgroup *cgroup, + struct cftype *cftype, + u64 val) +{ + struct io_cgroup *iocg; + + if (val < BLKDEV_MIN_RQ) + val = BLKDEV_MIN_RQ; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + + spin_lock_irq(&iocg->lock); + iocg->nr_group_requests = (unsigned long)val; + spin_unlock_irq(&iocg->lock); + + cgroup_unlock(); + + return 0; +} + #define SHOW_FUNCTION(__VAR) \ static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \ struct cftype *cftype) \ @@ -1735,6 +1811,11 @@ static int io_cgroup_disk_dequeue_read(struct cgroup *cgroup, struct cftype bfqio_files[] = { { + .name = "nr_group_requests", + .read_u64 = io_cgroup_nr_requests_read, + .write_u64 = io_cgroup_nr_requests_write, + }, + { .name = "policy", .read_seq_string = io_cgroup_policy_read, .write_string = io_cgroup_policy_write, @@ -1790,6 +1871,7 @@ static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys, spin_lock_init(&iocg->lock); INIT_HLIST_HEAD(&iocg->group_data); + iocg->nr_group_requests = BLKDEV_MAX_GROUP_RQ; iocg->weight = IO_DEFAULT_GRP_WEIGHT; iocg->ioprio_class = IO_DEFAULT_GRP_CLASS; INIT_LIST_HEAD(&iocg->policy_list); diff --git a/block/elevator-fq.h b/block/elevator-fq.h index f089a55..df077d0 100644 --- a/block/elevator-fq.h +++ b/block/elevator-fq.h @@ -308,6 +308,7 @@ struct io_cgroup { unsigned int weight; unsigned short ioprio_class; + unsigned long nr_group_requests; /* list of io_policy_node */ struct list_head 
policy_list; @@ -386,6 +387,9 @@ struct elv_fq_data { unsigned int fairness; }; +extern unsigned short get_group_requests(struct request_queue *q, + struct io_group *iog); + /* Logging facilities. */ #ifdef CONFIG_DEBUG_GROUP_IOSCHED #define elv_log_ioq(efqd, ioq, fmt, args...) \
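For completeness, a small user-space sketch of driving the io.nr_group_requests file this patch adds. The mount point /mnt/cgroups and the group g1 are assumptions taken from the test setup earlier in the thread, and the value 100 matches the Result (2) condition; adjust both to the local cgroup hierarchy.

----- usage sketch (hypothetical paths): writing io.nr_group_requests : from here -----

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Path assumes the cgroup mount used in the tests above. */
	const char *path = "/mnt/cgroups/g1/io.nr_group_requests";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* 100 is the per-group limit used for Result (2). */
	fprintf(f, "%d\n", 100);
	fclose(f);
	return EXIT_SUCCESS;
}

----- usage sketch (hypothetical paths): writing io.nr_group_requests : till here -----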