@@ -460,20 +460,53 @@ void blk_cleanup_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_cleanup_queue);
-static int blk_init_free_list(struct request_queue *q)
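+/*
+ * Return the request list a bio should allocate its request from: the
+ * per-group list when group io scheduling is enabled, otherwise the
+ * queue's own list.
+ */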
+struct request_list *
+blk_get_request_list(struct request_queue *q, struct bio *bio)
+{
+#ifdef CONFIG_GROUP_IOSCHED
+ /*
+ * Determine which request list the bio will be allocated from. This
+ * depends on which io group the bio belongs to.
+ */
+ return elv_get_request_list_bio(q, bio);
+#else
+ return &q->rq;
+#endif
+}
+
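+/*
+ * Map a request to the request list of the io group it was accounted
+ * against.
+ */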
+static struct request_list *rq_rl(struct request_queue *q, struct request *rq)
+{
+#ifdef CONFIG_GROUP_IOSCHED
+ int priv = rq->cmd_flags & REQ_ELVPRIV;
+
+ return elv_get_request_list_rq(q, rq, priv);
+#else
+ return &q->rq;
+#endif
+}
+
+void blk_init_request_list(struct request_list *rl)
{
- struct request_list *rl = &q->rq;
rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
- rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
- rl->elvpriv = 0;
init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
+}
- rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
- mempool_free_slab, request_cachep, q->node);
+static int blk_init_free_list(struct request_queue *q)
+{
+ /*
+ * In case of group scheduling, the request list is inside the group
+ * and is initialized when the group is instantiated.
+ */
+#ifndef CONFIG_GROUP_IOSCHED
+ blk_init_request_list(&q->rq);
+#endif
+ q->rq_data.rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
+ mempool_alloc_slab, mempool_free_slab,
+ request_cachep, q->node);
- if (!rl->rq_pool)
+ if (!q->rq_data.rq_pool)
return -ENOMEM;
return 0;
@@ -581,6 +614,9 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
q->queue_flags = QUEUE_FLAG_DEFAULT;
q->queue_lock = lock;
+ /* init starved waiter wait queue */
+ init_waitqueue_head(&q->rq_data.starved_wait);
+
/*
* This also sets hw/phys segments, boundary and size
*/
@@ -615,14 +651,14 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
if (rq->cmd_flags & REQ_ELVPRIV)
elv_put_request(q, rq);
- mempool_free(rq, q->rq.rq_pool);
+ mempool_free(rq, q->rq_data.rq_pool);
}
static struct request *
blk_alloc_request(struct request_queue *q, struct bio *bio, int flags, int priv,
gfp_t gfp_mask)
{
- struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
+ struct request *rq = mempool_alloc(q->rq_data.rq_pool, gfp_mask);
if (!rq)
return NULL;
@@ -633,7 +669,7 @@ blk_alloc_request(struct request_queue *q, struct bio *bio, int flags, int priv,
if (priv) {
if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
- mempool_free(rq, q->rq.rq_pool);
+ mempool_free(rq, q->rq_data.rq_pool);
return NULL;
}
rq->cmd_flags |= REQ_ELVPRIV;
@@ -676,18 +712,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
ioc->last_waited = jiffies;
}
-static void __freed_request(struct request_queue *q, int sync)
+static void __freed_request(struct request_queue *q, int sync,
+ struct request_list *rl)
{
- struct request_list *rl = &q->rq;
-
- if (rl->count[sync] < queue_congestion_off_threshold(q))
+ if (q->rq_data.count[sync] < queue_congestion_off_threshold(q))
blk_clear_queue_congested(q, sync);
- if (rl->count[sync] + 1 <= q->nr_requests) {
+ if (q->rq_data.count[sync] + 1 <= q->nr_requests)
+ blk_clear_queue_full(q, sync);
+
+ if (rl->count[sync] + 1 <= q->nr_group_requests) {
if (waitqueue_active(&rl->wait[sync]))
wake_up(&rl->wait[sync]);
-
- blk_clear_queue_full(q, sync);
}
}
@@ -695,63 +731,130 @@ static void __freed_request(struct request_queue *q, int sync)
* A request has just been released. Account for it, update the full and
* congestion status, wake up any waiters. Called under q->queue_lock.
*/
-static void freed_request(struct request_queue *q, int sync, int priv)
+static void freed_request(struct request_queue *q, int sync, int priv,
+ struct request_list *rl)
{
- struct request_list *rl = &q->rq;
+ /*
+ * There is a window during request allocation where a request is
+ * mapped to one group, but by the time a queue for the group is
+ * allocated, the original cgroup/io group may have been deleted and
+ * the io queue ends up allocated in a different group (root)
+ * altogether.
+ *
+ * One solution would be to have the rq take a reference on the io
+ * group, but that seems like overkill for this issue. The only side
+ * effect of this hard-to-hit race is that we may try to decrement
+ * rl->count for a request list which did not allocate that request.
+ * Check for rl->count going below zero and do not decrement it in
+ * that case.
+ */
+
+ if (priv && rl->count[sync] > 0)
+ rl->count[sync]--;
+
+ BUG_ON(!q->rq_data.count[sync]);
+ q->rq_data.count[sync]--;
- rl->count[sync]--;
if (priv)
- rl->elvpriv--;
+ q->rq_data.elvpriv--;
- __freed_request(q, sync);
+ __freed_request(q, sync, rl);
if (unlikely(rl->starved[sync ^ 1]))
- __freed_request(q, sync ^ 1);
+ __freed_request(q, sync ^ 1, rl);
+
+ /* Wake up the starved process on global list, if any */
+ if (unlikely(q->rq_data.starved)) {
+ if (waitqueue_active(&q->rq_data.starved_wait))
+ wake_up(&q->rq_data.starved_wait);
+ q->rq_data.starved--;
+ }
+}
+
+/*
+ * Returns whether one can sleep on this request list or not. There are
+ * cases (elevator switch) where the request list might not have allocated
+ * any request descriptors but we deny request allocation due to global
+ * limits. In that case the task should sleep on the global list, as no
+ * wakeup will take place on this request list.
+ *
+ * Also sets the request list starved flag if there are no requests pending
+ * in the direction of rq.
+ *
+ * Return 1 --> sleep on request list, 0 --> sleep on global list
+ */
+static int can_sleep_on_request_list(struct request_list *rl, int is_sync)
+{
+ if (unlikely(rl->count[is_sync] == 0)) {
+ /*
+ * If there is a request pending in other direction
+ * in same io group, then set the starved flag of
+ * the group request list. Otherwise, we need to
+ * make this process sleep in global starved list
+ * to make sure it will not sleep indefinitely.
+ */
+ if (rl->count[is_sync ^ 1] != 0) {
+ rl->starved[is_sync] = 1;
+ return 1;
+ } else
+ return 0;
+ }
+
+ return 1;
}
/*
* Get a free request, queue_lock must be held.
- * Returns NULL on failure, with queue_lock held.
+ * Returns NULL on failure, with queue_lock held. Also sets the "reason" field
+ * in case of failure. The reason field helps the caller decide whether to
+ * sleep on the per-group list or on the global per-queue list.
+ * reason = 0 sleep on per group list
+ * reason = 1 sleep on global list
+ *
* Returns !NULL on success, with queue_lock *not held*.
*/
static struct request *get_request(struct request_queue *q, int rw_flags,
- struct bio *bio, gfp_t gfp_mask)
+ struct bio *bio, gfp_t gfp_mask,
+ struct request_list *rl, int *reason)
{
struct request *rq = NULL;
- struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
int may_queue, priv;
+ int sleep_on_global = 0;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
goto rq_starved;
- if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
- if (rl->count[is_sync]+1 >= q->nr_requests) {
- ioc = current_io_context(GFP_ATOMIC, q->node);
- /*
- * The queue will fill after this allocation, so set
- * it as full, and mark this process as "batching".
- * This process will be allowed to complete a batch of
- * requests, others will be blocked.
- */
- if (!blk_queue_full(q, is_sync)) {
- ioc_set_batching(q, ioc);
- blk_set_queue_full(q, is_sync);
- } else {
- if (may_queue != ELV_MQUEUE_MUST
- && !ioc_batching(q, ioc)) {
- /*
- * The queue is full and the allocating
- * process is not a "batcher", and not
- * exempted by the IO scheduler
- */
- goto out;
- }
+ if (q->rq_data.count[is_sync]+1 >= queue_congestion_on_threshold(q))
+ blk_set_queue_congested(q, is_sync);
+
+ /* queue full seems redundant now */
+ if (q->rq_data.count[is_sync]+1 >= q->nr_requests)
+ blk_set_queue_full(q, is_sync);
+
+ if (rl->count[is_sync]+1 >= q->nr_group_requests) {
+ ioc = current_io_context(GFP_ATOMIC, q->node);
+ /*
+ * The group's request list will fill after this allocation, so
+ * mark this process as "batching". This process will be allowed
+ * to complete a batch of requests, others will be blocked.
+ */
+ if (rl->count[is_sync] <= q->nr_group_requests)
+ ioc_set_batching(q, ioc);
+ else {
+ if (may_queue != ELV_MQUEUE_MUST
+ && !ioc_batching(q, ioc)) {
+ /*
+ * The queue is full and the allocating
+ * process is not a "batcher", and not
+ * exempted by the IO scheduler
+ */
+ goto out;
}
}
- blk_set_queue_congested(q, is_sync);
}
/*
@@ -759,21 +862,60 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
* limit of requests, otherwise we could have thousands of requests
* allocated with any setting of ->nr_requests
*/
- if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
+
+ if (q->rq_data.count[is_sync] >= (3 * q->nr_requests / 2)) {
+ /*
+ * The queue is too full for allocation. On which list should
+ * the task sleep? Generally it should sleep on its request
+ * list, but if an elevator switch is happening, request
+ * descriptors in that window are allocated from the global
+ * pool and are not accounted against any particular request
+ * list, as the group is going away.
+ *
+ * So it may happen that the request list has no requests
+ * allocated at all, and if the process sleeps on the per-group
+ * request list, it will never be woken up. In that case, make
+ * it sleep on the global starved list.
+ */
+ if (test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)
+ || !can_sleep_on_request_list(rl, is_sync))
+ sleep_on_global = 1;
+ goto out;
+ }
+
+ /*
+ * Allocation of a request is allowed from the queue perspective. Now
+ * check against the per-group request list.
+ */
+
+ if (rl->count[is_sync] >= (3 * q->nr_group_requests / 2))
goto out;
- rl->count[is_sync]++;
rl->starved[is_sync] = 0;
+ q->rq_data.count[is_sync]++;
+
priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ if (priv) {
+ q->rq_data.elvpriv++;
+ /*
+ * Account the request against the request list only if the
+ * request is going to the elevator. During an elevator switch
+ * there is a small window where the group is going away and a
+ * new group will not be allocated until the switch completes.
+ * Until then, instead of slowing down the application, we
+ * continue to allocate requests from the common pool rather
+ * than enforcing the per-group limit.
+ */
+ rl->count[is_sync]++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
spin_unlock_irq(q->queue_lock);
rq = blk_alloc_request(q, bio, rw_flags, priv, gfp_mask);
+
if (unlikely(!rq)) {
/*
* Allocation failed presumably due to memory. Undo anything
@@ -783,7 +925,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
* wait queue, but this is pretty rare.
*/
spin_lock_irq(q->queue_lock);
- freed_request(q, is_sync, priv);
+ freed_request(q, is_sync, priv, rl);
/*
* in the very unlikely event that allocation failed and no
@@ -793,9 +935,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
* rq mempool into READ and WRITE
*/
rq_starved:
- if (unlikely(rl->count[is_sync] == 0))
- rl->starved[is_sync] = 1;
-
+ if (!can_sleep_on_request_list(rl, is_sync))
+ sleep_on_global = 1;
goto out;
}
@@ -810,6 +951,8 @@ rq_starved:
trace_block_getrq(q, bio, rw_flags & 1);
out:
+ if (reason && sleep_on_global)
+ *reason = 1;
return rq;
}
@@ -823,16 +966,39 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
struct bio *bio)
{
const bool is_sync = rw_is_sync(rw_flags) != 0;
+ int sleep_on_global = 0;
struct request *rq;
+ struct request_list *rl = blk_get_request_list(q, bio);
- rq = get_request(q, rw_flags, bio, GFP_NOIO);
+ rq = get_request(q, rw_flags, bio, GFP_NOIO, rl, &sleep_on_global);
while (!rq) {
DEFINE_WAIT(wait);
struct io_context *ioc;
- struct request_list *rl = &q->rq;
- prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
- TASK_UNINTERRUPTIBLE);
+ if (sleep_on_global) {
+ /*
+ * The task failed allocation and needs to wait
+ * and try again. There are no requests pending
+ * from the io group, hence it needs to sleep on
+ * the global wait queue. Most likely the
+ * allocation failed because of memory issues.
+ */
+
+ q->rq_data.starved++;
+ prepare_to_wait_exclusive(&q->rq_data.starved_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ } else {
+ /*
+ * We are about to sleep on a request list and we
+ * drop the queue lock. After waking up, we will
+ * do finish_wait() on the request list, and in
+ * the meantime the group might be gone. Take a
+ * reference to the group now.
+ */
+ prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
+ TASK_UNINTERRUPTIBLE);
+ elv_get_rl_iog(rl);
+ }
trace_block_sleeprq(q, bio, rw_flags & 1);
@@ -850,9 +1016,25 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
ioc_set_batching(q, ioc);
spin_lock_irq(q->queue_lock);
- finish_wait(&rl->wait[is_sync], &wait);
- rq = get_request(q, rw_flags, bio, GFP_NOIO);
+ if (sleep_on_global) {
+ finish_wait(&q->rq_data.starved_wait, &wait);
+ sleep_on_global = 0;
+ } else {
+ /*
+ * We had taken a reference to the rl/iog. Put that now
+ */
+ finish_wait(&rl->wait[is_sync], &wait);
+ elv_put_rl_iog(rl);
+ }
+
+ /*
+ * After the sleep, look up the rl again in case the cgroup the
+ * bio belonged to is gone and the bio is now mapped to the root
+ * group.
+ */
+ rl = blk_get_request_list(q, bio);
+ rq = get_request(q, rw_flags, bio, GFP_NOIO, rl,
+ &sleep_on_global);
};
return rq;
@@ -861,14 +1043,16 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
struct request *rq;
+ struct request_list *rl;
BUG_ON(rw != READ && rw != WRITE);
spin_lock_irq(q->queue_lock);
+ rl = blk_get_request_list(q, NULL);
if (gfp_mask & __GFP_WAIT) {
rq = get_request_wait(q, rw, NULL);
} else {
- rq = get_request(q, rw, NULL, gfp_mask);
+ rq = get_request(q, rw, NULL, gfp_mask, rl, NULL);
if (!rq)
spin_unlock_irq(q->queue_lock);
}
@@ -1085,12 +1269,19 @@ void __blk_put_request(struct request_queue *q, struct request *req)
if (req->cmd_flags & REQ_ALLOCED) {
int is_sync = rq_is_sync(req) != 0;
int priv = req->cmd_flags & REQ_ELVPRIV;
+ struct request_list *rl = rq_rl(q, req);
BUG_ON(!list_empty(&req->queuelist));
BUG_ON(!hlist_unhashed(&req->hash));
+ /*
+ * Call freed_request() before actually freeing the request.
+ * Freeing the request might cause the io queue, and in turn
+ * the io group, to be freed, which means the rl pointer would
+ * no longer be valid.
+ */
+ freed_request(q, is_sync, priv, rl);
blk_free_request(q, req);
- freed_request(q, is_sync, priv);
}
}
EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -149,6 +149,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
* set defaults
*/
q->nr_requests = BLKDEV_MAX_RQ;
+ q->nr_group_requests = BLKDEV_MAX_GROUP_RQ;
q->make_request_fn = mfn;
blk_queue_dma_alignment(q, 511);
@@ -38,42 +38,67 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
- struct request_list *rl = &q->rq;
+ struct request_list *rl;
unsigned long nr;
int ret = queue_var_store(&nr, page, count);
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
spin_lock_irq(q->queue_lock);
+ rl = blk_get_request_list(q, NULL);
q->nr_requests = nr;
blk_queue_congestion_threshold(q);
- if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+ if (q->rq_data.count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
blk_set_queue_congested(q, BLK_RW_SYNC);
- else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+ else if (q->rq_data.count[BLK_RW_SYNC] <
+ queue_congestion_off_threshold(q))
blk_clear_queue_congested(q, BLK_RW_SYNC);
- if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+ if (q->rq_data.count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
blk_set_queue_congested(q, BLK_RW_ASYNC);
- else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+ else if (q->rq_data.count[BLK_RW_ASYNC] <
+ queue_congestion_off_threshold(q))
blk_clear_queue_congested(q, BLK_RW_ASYNC);
- if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+ if (q->rq_data.count[BLK_RW_SYNC] >= q->nr_requests) {
blk_set_queue_full(q, BLK_RW_SYNC);
- } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
+ } else if (q->rq_data.count[BLK_RW_SYNC]+1 <= q->nr_requests) {
blk_clear_queue_full(q, BLK_RW_SYNC);
wake_up(&rl->wait[BLK_RW_SYNC]);
}
- if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+ if (q->rq_data.count[BLK_RW_ASYNC] >= q->nr_requests) {
blk_set_queue_full(q, BLK_RW_ASYNC);
- } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
+ } else if (q->rq_data.count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
blk_clear_queue_full(q, BLK_RW_ASYNC);
wake_up(&rl->wait[BLK_RW_ASYNC]);
}
spin_unlock_irq(q->queue_lock);
return ret;
}
+#ifdef CONFIG_GROUP_IOSCHED
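+/* sysfs show/store for the per-group request descriptor limit */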
+static ssize_t queue_group_requests_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(q->nr_group_requests, (page));
+}
+
+static ssize_t
+queue_group_requests_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ unsigned long nr;
+ int ret = queue_var_store(&nr, page, count);
+
+ if (nr < BLKDEV_MIN_RQ)
+ nr = BLKDEV_MIN_RQ;
+
+ spin_lock_irq(q->queue_lock);
+ q->nr_group_requests = nr;
+ spin_unlock_irq(q->queue_lock);
+ return ret;
+}
+#endif
static ssize_t queue_ra_show(struct request_queue *q, char *page)
{
@@ -240,6 +265,14 @@ static struct queue_sysfs_entry queue_requests_entry = {
.store = queue_requests_store,
};
+#ifdef CONFIG_GROUP_IOSCHED
+static struct queue_sysfs_entry queue_group_requests_entry = {
+ .attr = {.name = "nr_group_requests", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_group_requests_show,
+ .store = queue_group_requests_store,
+};
+#endif
+
static struct queue_sysfs_entry queue_ra_entry = {
.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
.show = queue_ra_show,
@@ -314,6 +347,9 @@ static struct queue_sysfs_entry queue_iostats_entry = {
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
+#ifdef CONFIG_GROUP_IOSCHED
+ &queue_group_requests_entry.attr,
+#endif
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
@@ -393,12 +429,11 @@ static void blk_release_queue(struct kobject *kobj)
{
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
- struct request_list *rl = &q->rq;
blk_sync_queue(q);
- if (rl->rq_pool)
- mempool_destroy(rl->rq_pool);
+ if (q->rq_data.rq_pool)
+ mempool_destroy(q->rq_data.rq_pool);
if (q->queue_tags)
__blk_queue_free_tags(q);
@@ -1245,6 +1245,39 @@ static struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup)
struct io_cgroup, css);
}
+struct request_list *
+elv_get_request_list_bio(struct request_queue *q, struct bio *bio)
+{
+ struct io_group *iog;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ iog = q->elevator->efqd->root_group;
+ else
+ iog = elv_io_get_io_group_bio(q, bio, 1);
+
+ BUG_ON(!iog);
+ return &iog->rl;
+}
+
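+/*
+ * Return the request list a request is accounted against. Requests carrying
+ * elevator private data map to the io group of their io queue; the rest map
+ * to the root group.
+ */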
+struct request_list *
+elv_get_request_list_rq(struct request_queue *q, struct request *rq, int priv)
+{
+ struct io_group *iog;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ return &q->elevator->efqd->root_group->rl;
+
+ BUG_ON(priv && !rq->ioq);
+
+ if (priv)
+ iog = ioq_to_io_group(rq->ioq);
+ else
+ iog = q->elevator->efqd->root_group;
+
+ BUG_ON(!iog);
+ return &iog->rl;
+}
+
/*
* Search the io_group for efqd into the hash table (by now only a list)
* of bgrp. Must be called under rcu_read_lock().
@@ -1601,6 +1634,8 @@ io_group_chain_alloc(struct request_queue *q, void *key, struct cgroup *cgroup)
elv_get_iog(iog);
io_group_path(iog);
+ blk_init_request_list(&iog->rl);
+
if (leaf == NULL) {
leaf = iog;
prev = leaf;
@@ -1830,6 +1865,7 @@ static struct io_group *io_alloc_root_group(struct request_queue *q,
for (i = 0; i < IO_IOPRIO_CLASSES; i++)
iog->sched_data.service_tree[i] = ELV_SERVICE_TREE_INIT;
+ blk_init_request_list(&iog->rl);
spin_lock_irq(&iocg->lock);
rcu_assign_pointer(iog->key, key);
hlist_add_head_rcu(&iog->group_node, &iocg->group_data);
@@ -140,6 +140,9 @@ struct io_group {
/* Single ioq per group, used for noop, deadline, anticipatory */
struct io_queue *ioq;
+
+ /* request list associated with the group */
+ struct request_list rl;
};
struct io_cgroup {
@@ -440,11 +443,31 @@ static inline void elv_get_iog(struct io_group *iog)
atomic_inc(&iog->ref);
}
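+/*
+ * rl_iog() returns the io group embedding a request list; the get/put
+ * helpers take and drop a reference on that group.
+ */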
+static inline struct io_group *rl_iog(struct request_list *rl)
+{
+ return container_of(rl, struct io_group, rl);
+}
+
+static inline void elv_get_rl_iog(struct request_list *rl)
+{
+ elv_get_iog(rl_iog(rl));
+}
+
+static inline void elv_put_rl_iog(struct request_list *rl)
+{
+ elv_put_iog(rl_iog(rl));
+}
+
extern int elv_set_request_ioq(struct request_queue *q, struct request *rq,
struct bio *bio, gfp_t gfp_mask);
extern void elv_reset_request_ioq(struct request_queue *q, struct request *rq);
extern struct io_queue *elv_lookup_ioq_bio(struct request_queue *q,
struct bio *bio);
+struct request_list *
+elv_get_request_list_bio(struct request_queue *q, struct bio *bio);
+
+struct request_list *
+elv_get_request_list_rq(struct request_queue *q, struct request *rq, int priv);
#else /* !GROUP_IOSCHED */
@@ -484,6 +507,9 @@ elv_lookup_ioq_bio(struct request_queue *q, struct bio *bio)
return NULL;
}
+static inline void elv_get_rl_iog(struct request_list *rl) { }
+static inline void elv_put_rl_iog(struct request_list *rl) { }
+
#endif /* GROUP_IOSCHED */
extern ssize_t elv_slice_sync_show(struct elevator_queue *q, char *name);
@@ -594,6 +620,9 @@ static inline struct io_queue *elv_lookup_ioq_bio(struct request_queue *q,
return NULL;
}
+static inline void elv_get_rl_iog(struct request_list *rl) { }
+static inline void elv_put_rl_iog(struct request_list *rl) { }
+
#endif /* CONFIG_ELV_FAIR_QUEUING */
#endif /* _ELV_SCHED_H */
#endif /* CONFIG_BLOCK */
@@ -678,7 +678,7 @@ void elv_quiesce_start(struct request_queue *q)
* make sure we don't have any requests in flight
*/
elv_drain_elevator(q);
- while (q->rq.elvpriv) {
+ while (q->rq_data.elvpriv) {
__blk_run_queue(q);
spin_unlock_irq(q->queue_lock);
msleep(10);
@@ -777,8 +777,9 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
}
if (unplug_it && blk_queue_plugged(q)) {
- int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
- - queue_in_flight(q);
+ int nrq = q->rq_data.count[BLK_RW_SYNC] +
+ q->rq_data.count[BLK_RW_ASYNC] -
+ queue_in_flight(q);
if (nrq >= q->unplug_thresh)
__generic_unplug_device(q);
@@ -32,21 +32,51 @@ struct request;
struct sg_io_hdr;
#define BLKDEV_MIN_RQ 4
+
+#ifdef CONFIG_GROUP_IOSCHED
+#define BLKDEV_MAX_RQ 512 /* Default maximum for queue */
+#define BLKDEV_MAX_GROUP_RQ 128 /* Default maximum per group */
+#else
#define BLKDEV_MAX_RQ 128 /* Default maximum */
+/*
+ * This is equivalent to the case of only one group (the root group) being
+ * present. Let it consume all the request descriptors available on the queue.
+ */
+#define BLKDEV_MAX_GROUP_RQ BLKDEV_MAX_RQ /* Default maximum */
+#endif
struct request;
typedef void (rq_end_io_fn)(struct request *, int);
struct request_list {
/*
- * count[], starved[], and wait[] are indexed by
+ * count[], starved and wait[] are indexed by
* BLK_RW_SYNC/BLK_RW_ASYNC
*/
int count[2];
int starved[2];
+ wait_queue_head_t wait[2];
+};
+
+/*
+ * This data structure keeps track of the request mempool for the queue
+ * and some overall statistics.
+ */
+struct request_data {
+ /*
+ * Per-queue request descriptor count. This is in addition to the
+ * per-cgroup counts.
+ */
+ int count[2];
int elvpriv;
mempool_t *rq_pool;
- wait_queue_head_t wait[2];
+ int starved;
+ /*
+ * Global list for starved tasks. A task will be queued here if
+ * it could not allocate a request descriptor and the associated
+ * group request list does not have any requests pending.
+ */
+ wait_queue_head_t starved_wait;
};
/*
@@ -339,10 +369,17 @@ struct request_queue
struct request *last_merge;
struct elevator_queue *elevator;
+#ifndef CONFIG_GROUP_IOSCHED
/*
* the queue request freelist, one for reads and one for writes
+ * In case of group io scheduling, the request list is per group
+ * and is present in the group data structure instead.
*/
struct request_list rq;
+#endif
+
+ /* Contains the request pool and other data such as starvation state */
+ struct request_data rq_data;
request_fn_proc *request_fn;
make_request_fn *make_request_fn;
@@ -405,6 +442,8 @@ struct request_queue
* queue settings
*/
unsigned long nr_requests; /* Max # of requests */
+ /* Max # of per io group requests */
+ unsigned long nr_group_requests;
unsigned int nr_congestion_on;
unsigned int nr_congestion_off;
unsigned int nr_batching;
@@ -784,6 +823,10 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
struct scsi_ioctl_command __user *);
+extern void blk_init_request_list(struct request_list *rl);
+
+extern struct request_list *blk_get_request_list(struct request_queue *q,
+ struct bio *bio);
/*
* A queue has just exitted congestion. Note this in the global counter of
* congested queues, and wake up anyone who was waiting for requests to be
@@ -397,7 +397,8 @@ TRACE_EVENT(block_unplug_timer,
),
TP_fast_assign(
- __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
+ __entry->nr_rq = q->rq_data.count[READ] +
+ q->rq_data.count[WRITE];
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
),
@@ -416,7 +417,8 @@ TRACE_EVENT(block_unplug_io,
),
TP_fast_assign(
- __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE];
+ __entry->nr_rq = q->rq_data.count[READ] +
+ q->rq_data.count[WRITE];
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
),
@@ -786,7 +786,8 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+ unsigned int pdu = q->rq_data.count[READ] +
+ q->rq_data.count[WRITE];
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
@@ -799,7 +800,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+ unsigned int pdu = q->rq_data.count[READ] +
+ q->rq_data.count[WRITE];
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,