@@ -90,6 +90,27 @@ void blk_queue_congestion_threshold(struct request_queue *q)
q->nr_congestion_off = nr;
}
+#ifdef CONFIG_GROUP_IOSCHED
+int blk_queue_io_group_congested(struct backing_dev_info *bdi, int bdi_bits,
+ struct page *page)
+{
+ int ret = 0;
+ struct request_queue *q = bdi->unplug_io_data;
+
+ if (!q || !q->elevator)
+ return bdi_congested(bdi, bdi_bits);
+
+ /* Do we need to hold queue lock? */
+ if (bdi_bits & (1 << BDI_sync_congested))
+ ret |= elv_page_io_group_congested(q, page, 1);
+
+ if (bdi_bits & (1 << BDI_async_congested))
+ ret |= elv_page_io_group_congested(q, page, 0);
+
+ return ret;
+}
+#endif
+
/**
* blk_get_backing_dev_info - get the address of a queue's backing_dev_info
* @bdev: device
@@ -721,6 +742,8 @@ static void __freed_request(struct request_queue *q, int sync,
if (q->rq_data.count[sync] + 1 <= q->nr_requests)
blk_clear_queue_full(q, sync);
+ elv_freed_request(rl, sync);
+
if (rl->count[sync] + 1 <= q->nr_group_requests) {
if (waitqueue_active(&rl->wait[sync]))
wake_up(&rl->wait[sync]);
@@ -830,6 +853,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
if (q->rq_data.count[is_sync]+1 >= queue_congestion_on_threshold(q))
blk_set_queue_congested(q, is_sync);
+ /* check if the io group will get congested after this allocation */
+ elv_get_request(rl, is_sync);
+
/* queue full seems redundant now */
if (q->rq_data.count[is_sync]+1 >= q->nr_requests)
blk_set_queue_full(q, is_sync);
@@ -83,9 +83,8 @@ static ssize_t queue_group_requests_show(struct request_queue *q, char *page)
return queue_var_show(q->nr_group_requests, (page));
}
-static ssize_t
-queue_group_requests_store(struct request_queue *q, const char *page,
- size_t count)
+static ssize_t queue_group_requests_store(struct request_queue *q,
+ const char *page, size_t count)
{
unsigned long nr;
int ret = queue_var_store(&nr, page, count);
@@ -95,6 +94,7 @@ queue_group_requests_store(struct request_queue *q, const char *page,
spin_lock_irq(q->queue_lock);
q->nr_group_requests = nr;
+ elv_updated_nr_group_requests(q);
spin_unlock_irq(q->queue_lock);
return ret;
}
@@ -1278,6 +1278,139 @@ elv_get_request_list_rq(struct request_queue *q, struct request *rq, int priv)
return &iog->rl;
}
+/* Set io group congestion on and off thresholds */
+void elv_io_group_congestion_threshold(struct request_queue *q,
+ struct io_group *iog)
+{
+ int nr;
+
+ nr = q->nr_group_requests - (q->nr_group_requests / 8) + 1;
+ if (nr > q->nr_group_requests)
+ nr = q->nr_group_requests;
+ iog->nr_congestion_on = nr;
+
+ nr = q->nr_group_requests - (q->nr_group_requests / 8)
+ - (q->nr_group_requests / 16) - 1;
+ if (nr < 1)
+ nr = 1;
+ iog->nr_congestion_off = nr;
+}
+
+void elv_clear_iog_congested(struct io_group *iog, int sync)
+{
+ enum io_group_state bit;
+
+ bit = sync ? IOG_sync_congested : IOG_async_congested;
+ clear_bit(bit, &iog->state);
+ smp_mb__after_clear_bit();
+ congestion_wake_up(sync);
+}
+
+void elv_set_iog_congested(struct io_group *iog, int sync)
+{
+ enum io_group_state bit;
+
+ bit = sync ? IOG_sync_congested : IOG_async_congested;
+ set_bit(bit, &iog->state);
+}
+
+static inline int elv_iog_congested(struct io_group *iog, int iog_bits)
+{
+ return iog->state & iog_bits;
+}
+
+/* Determine whether the io group the page maps to is congested */
+int elv_page_io_group_congested(struct request_queue *q, struct page *page,
+ int sync)
+{
+ struct io_group *iog;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ iog = elv_io_get_io_group(q, page, 0);
+
+ if (!iog) {
+ /*
+ * Either the cgroup was deleted, or this is the first request in
+ * the group and the associated io group object has not been
+ * created yet. Map it to the root group.
+ *
+ * TODO: Fix the case of the group not having been created yet.
+ */
+ iog = q->elevator->efqd->root_group;
+ }
+
+ if (sync)
+ ret = elv_iog_congested(iog, 1 << IOG_sync_congested);
+ else
+ ret = elv_iog_congested(iog, 1 << IOG_async_congested);
+
+ if (ret)
+ elv_log_iog(q->elevator->efqd, iog, "iog congested=%d sync=%d"
+ " rl.count[sync]=%d nr_group_requests=%d",
+ ret, sync, iog->rl.count[sync], q->nr_group_requests);
+ rcu_read_unlock();
+ return ret;
+}
+
+static inline int
+elv_iog_congestion_on_threshold(struct io_group *iog)
+{
+ return iog->nr_congestion_on;
+}
+
+static inline int
+elv_iog_congestion_off_threshold(struct io_group *iog)
+{
+ return iog->nr_congestion_off;
+}
+
+void elv_freed_request(struct request_list *rl, int sync)
+{
+ struct io_group *iog = rl_iog(rl);
+
+ if (iog->rl.count[sync] < elv_iog_congestion_off_threshold(iog))
+ elv_clear_iog_congested(iog, sync);
+}
+
+void elv_get_request(struct request_list *rl, int sync)
+{
+ struct io_group *iog = rl_iog(rl);
+
+ if (iog->rl.count[sync]+1 >= elv_iog_congestion_on_threshold(iog))
+ elv_set_iog_congested(iog, sync);
+}
+
+static void iog_nr_requests_updated(struct io_group *iog)
+{
+ if (iog->rl.count[BLK_RW_SYNC] >= elv_iog_congestion_on_threshold(iog))
+ elv_set_iog_congested(iog, BLK_RW_SYNC);
+ else if (iog->rl.count[BLK_RW_SYNC] <
+ elv_iog_congestion_off_threshold(iog))
+ elv_clear_iog_congested(iog, BLK_RW_SYNC);
+
+ if (iog->rl.count[BLK_RW_ASYNC] >= elv_iog_congestion_on_threshold(iog))
+ elv_set_iog_congested(iog, BLK_RW_ASYNC);
+ else if (iog->rl.count[BLK_RW_ASYNC] <
+ elv_iog_congestion_off_threshold(iog))
+ elv_clear_iog_congested(iog, BLK_RW_ASYNC);
+}
+
+void elv_updated_nr_group_requests(struct request_queue *q)
+{
+ struct elv_fq_data *efqd;
+ struct hlist_node *n;
+ struct io_group *iog;
+
+ efqd = q->elevator->efqd;
+
+ hlist_for_each_entry(iog, n, &efqd->group_list, elv_data_node) {
+ elv_io_group_congestion_threshold(q, iog);
+ iog_nr_requests_updated(iog);
+ }
+}
+
/*
* Search the io_group for efqd into the hash table (by now only a list)
* of bgrp. Must be called under rcu_read_lock().
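[Illustration, not part of the patch] elv_io_group_congestion_threshold() above gives each io group a hysteresis band analogous to the per-queue one set up by blk_queue_congestion_threshold(): a group is flagged congested in elv_get_request() once an allocation pushes its count up to nr_congestion_on, and the flag is cleared in elv_freed_request() only after the count falls below nr_congestion_off. A standalone sketch of the arithmetic, with 128 used purely as an example value for nr_group_requests:

#include <stdio.h>

/* Mirrors the threshold arithmetic in elv_io_group_congestion_threshold(). */
static void show_group_thresholds(unsigned int nr_group_requests)
{
	int nr, on, off;

	nr = nr_group_requests - (nr_group_requests / 8) + 1;
	if (nr > (int)nr_group_requests)
		nr = nr_group_requests;
	on = nr;

	nr = nr_group_requests - (nr_group_requests / 8)
		- (nr_group_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	off = nr;

	printf("nr_group_requests=%u -> congestion_on=%d congestion_off=%d\n",
	       nr_group_requests, on, off);
}

int main(void)
{
	show_group_thresholds(128);	/* -> congestion_on=113, congestion_off=103 */
	show_group_thresholds(4);	/* on is clamped to 4, off works out to 3 */
	return 0;
}

With a group limit of 128, the group is marked congested when its allocated count reaches 113 and is only cleared again once the count drops below 103, so writers do not oscillate around a single threshold.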
@@ -1635,6 +1768,7 @@ io_group_chain_alloc(struct request_queue *q, void *key, struct cgroup *cgroup)
io_group_path(iog);
blk_init_request_list(&iog->rl);
+ elv_io_group_congestion_threshold(q, iog);
if (leaf == NULL) {
leaf = iog;
@@ -1866,6 +2000,7 @@ static struct io_group *io_alloc_root_group(struct request_queue *q,
iog->sched_data.service_tree[i] = ELV_SERVICE_TREE_INIT;
blk_init_request_list(&iog->rl);
+ elv_io_group_congestion_threshold(q, iog);
spin_lock_irq(&iocg->lock);
rcu_assign_pointer(iog->key, key);
hlist_add_head_rcu(&iog->group_node, &iocg->group_data);
@@ -106,6 +106,13 @@ struct io_queue {
};
#ifdef CONFIG_GROUP_IOSCHED /* CONFIG_GROUP_IOSCHED */
+
+enum io_group_state {
+ IOG_async_congested, /* The async queue of group is getting full */
+ IOG_sync_congested, /* The sync queue of group is getting full */
+ IOG_unused, /* Available bits start here */
+};
+
struct io_group {
struct io_entity entity;
atomic_t ref;
@@ -141,6 +148,11 @@ struct io_group {
/* Single ioq per group, used for noop, deadline, anticipatory */
struct io_queue *ioq;
+ /* io group congestion on and off threshold for request descriptors */
+ unsigned int nr_congestion_on;
+ unsigned int nr_congestion_off;
+
+ unsigned long state;
/* request list associated with the group */
struct request_list rl;
};
@@ -468,6 +480,11 @@ elv_get_request_list_bio(struct request_queue *q, struct bio *bio);
struct request_list *
elv_get_request_list_rq(struct request_queue *q, struct request *rq, int priv);
+extern int elv_page_io_group_congested(struct request_queue *q,
+ struct page *page, int sync);
+extern void elv_freed_request(struct request_list *rl, int sync);
+extern void elv_get_request(struct request_list *rl, int sync);
+extern void elv_updated_nr_group_requests(struct request_queue *q);
#else /* !GROUP_IOSCHED */
@@ -506,9 +523,11 @@ elv_lookup_ioq_bio(struct request_queue *q, struct bio *bio)
{
return NULL;
}
-
static inline void elv_get_rl_iog(struct request_list *rl) { }
static inline void elv_put_rl_iog(struct request_list *rl) { }
+static inline void elv_updated_nr_group_requests(struct request_queue *q) { }
+static inline void elv_freed_request(struct request_list *rl, int sync) { }
+static inline void elv_get_request(struct request_list *rl, int sync) { }
#endif /* GROUP_IOSCHED */
@@ -622,6 +641,9 @@ static inline struct io_queue *elv_lookup_ioq_bio(struct request_queue *q,
static inline void elv_get_rl_iog(struct request_list *rl) { }
static inline void elv_put_rl_iog(struct request_list *rl) { }
+static inline void elv_updated_nr_group_requests(struct request_queue *q) { }
+static inline void elv_freed_request(struct request_list *rl, int sync) { }
+static inline void elv_get_request(struct request_list *rl, int sync) { }
#endif /* CONFIG_ELV_FAIR_QUEUING */
#endif /* _ELV_SCHED_H */
@@ -1185,7 +1185,8 @@ int dm_table_resume_targets(struct dm_table *t)
return 0;
}
-int dm_table_any_congested(struct dm_table *t, int bdi_bits)
+int dm_table_any_congested(struct dm_table *t, int bdi_bits, struct page *page,
+ int group)
{
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
@@ -1195,9 +1196,11 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
char b[BDEVNAME_SIZE];
- if (likely(q))
- r |= bdi_congested(&q->backing_dev_info, bdi_bits);
- else
+ if (likely(q)) {
+ struct backing_dev_info *bdi = &q->backing_dev_info;
+ r |= group ? bdi_congested_group(bdi, bdi_bits, page)
+ : bdi_congested(bdi, bdi_bits);
+ } else
DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
dm_device_name(t->md),
bdevname(dd->dm_dev.bdev, b));
@@ -1613,7 +1613,8 @@ static void dm_unplug_all(struct request_queue *q)
}
}
-static int dm_any_congested(void *congested_data, int bdi_bits)
+static int dm_any_congested(void *congested_data, int bdi_bits,
+ struct page *page, int group)
{
int r = bdi_bits;
struct mapped_device *md = congested_data;
@@ -1630,8 +1631,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
r = md->queue->backing_dev_info.state &
bdi_bits;
else
- r = dm_table_any_congested(map, bdi_bits);
-
+ r = dm_table_any_congested(map, bdi_bits, page,
+ group);
dm_table_put(map);
}
}
@@ -57,7 +57,8 @@ struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
-int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+int dm_table_any_congested(struct dm_table *t, int bdi_bits, struct page *page,
+ int group);
int dm_table_any_busy_target(struct dm_table *t);
int dm_table_set_type(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
@@ -102,7 +102,7 @@ static void linear_unplug(struct request_queue *q)
rcu_read_unlock();
}
-static int linear_congested(void *data, int bits)
+static int linear_congested(void *data, int bits, struct page *page, int group)
{
mddev_t *mddev = data;
linear_conf_t *conf;
@@ -113,7 +113,10 @@ static int linear_congested(void *data, int bits)
for (i = 0; i < mddev->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ struct backing_dev_info *bdi = &q->backing_dev_info;
+
+ ret |= group ? bdi_congested_group(bdi, bits, page) :
+ bdi_congested(bdi, bits);
}
rcu_read_unlock();
@@ -192,7 +192,8 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
seq_printf (seq, "]");
}
-static int multipath_congested(void *data, int bits)
+static int multipath_congested(void *data, int bits, struct page *page,
+ int group)
{
mddev_t *mddev = data;
multipath_conf_t *conf = mddev->private;
@@ -203,8 +204,10 @@ static int multipath_congested(void *data, int bits)
mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
+ struct backing_dev_info *bdi = &q->backing_dev_info;
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= group ? bdi_congested_group(bdi, bits, page)
+ : bdi_congested(bdi, bits);
/* Just like multipath_map, we just check the
* first available device
*/
@@ -37,7 +37,7 @@ static void raid0_unplug(struct request_queue *q)
}
}
-static int raid0_congested(void *data, int bits)
+static int raid0_congested(void *data, int bits, struct page *page, int group)
{
mddev_t *mddev = data;
raid0_conf_t *conf = mddev->private;
@@ -46,8 +46,10 @@ static int raid0_congested(void *data, int bits)
for (i = 0; i < mddev->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
+ struct backing_dev_info *bdi = &q->backing_dev_info;
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= group ? bdi_congested_group(bdi, bits, page)
+ : bdi_congested(bdi, bits);
}
return ret;
}
@@ -570,7 +570,7 @@ static void raid1_unplug(struct request_queue *q)
md_wakeup_thread(mddev->thread);
}
-static int raid1_congested(void *data, int bits)
+static int raid1_congested(void *data, int bits, struct page *page, int group)
{
mddev_t *mddev = data;
conf_t *conf = mddev->private;
@@ -581,14 +581,17 @@ static int raid1_congested(void *data, int bits)
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
+ struct backing_dev_info *bdi = &q->backing_dev_info;
/* Note the '|| 1' - when read_balance prefers
* non-congested targets, it can be removed
*/
if ((bits & (1<<BDI_async_congested)) || 1)
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= group ? bdi_congested_group(bdi, bits,
+ page) : bdi_congested(bdi, bits);
else
- ret &= bdi_congested(&q->backing_dev_info, bits);
+ ret &= group ? bdi_congested_group(bdi, bits,
+ page) : bdi_congested(bdi, bits);
}
}
rcu_read_unlock();
@@ -625,7 +625,7 @@ static void raid10_unplug(struct request_queue *q)
md_wakeup_thread(mddev->thread);
}
-static int raid10_congested(void *data, int bits)
+static int raid10_congested(void *data, int bits, struct page *page, int group)
{
mddev_t *mddev = data;
conf_t *conf = mddev->private;
@@ -636,8 +636,10 @@ static int raid10_congested(void *data, int bits)
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
+ struct backing_dev_info *bdi = &q->backing_dev_info;
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= group ? bdi_congested_group(bdi, bits, page)
+ : bdi_congested(bdi, bits);
}
}
rcu_read_unlock();
@@ -3323,7 +3323,7 @@ static void raid5_unplug_device(struct request_queue *q)
unplug_slaves(mddev);
}
-static int raid5_congested(void *data, int bits)
+static int raid5_congested(void *data, int bits, struct page *page, int group)
{
mddev_t *mddev = data;
raid5_conf_t *conf = mddev->private;
@@ -455,7 +455,7 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
}
wbc->nr_to_write -= ret;
- if (wbc->nonblocking && bdi_write_congested(bdi))
+ if (wbc->nonblocking && bdi_or_group_write_congested(bdi, page))
wbc->encountered_congestion = 1;
_leave(" = 0");
@@ -491,6 +491,12 @@ static int afs_writepages_region(struct address_space *mapping,
return 0;
}
+ if (wbc->nonblocking && bdi_write_congested_group(bdi, page)) {
+ wbc->encountered_congestion = 1;
+ page_cache_release(page);
+ break;
+ }
+
/* at this point we hold neither mapping->tree_lock nor lock on
* the page itself: the page may be truncated or invalidated
* (changing page->mapping to NULL), or even swizzled back from
@@ -1249,7 +1249,8 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
return root;
}
-static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+static int btrfs_congested_fn(void *congested_data, int bdi_bits,
+ struct page *page, int group)
{
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
int ret = 0;
@@ -1260,7 +1261,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
if (!device->bdev)
continue;
bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi && bdi_congested(bdi, bdi_bits)) {
+ if (bdi && (group ? bdi_congested_group(bdi, bdi_bits, page) :
+ bdi_congested(bdi, bdi_bits))) {
ret = 1;
break;
}
@@ -2368,6 +2368,18 @@ retry:
unsigned i;
scanned = 1;
+
+ /*
+ * If the io group these pages will go into is congested, bail out.
+ */
+ if (wbc->nonblocking
+ && bdi_write_congested_group(bdi, pvec.pages[0])) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ pagevec_release(&pvec);
+ break;
+ }
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -165,6 +165,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
+ struct page *page = NULL;
bdi = blk_get_backing_dev_info(device->bdev);
fs_info = device->dev_root->fs_info;
@@ -276,8 +277,11 @@ loop_lock:
* is now congested. Back off and let other work structs
* run instead
*/
- if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
- fs_info->fs_devices->open_devices > 1) {
+ if (pending)
+ page = bio_iovec_idx(pending, 0)->bv_page;
+
+ if (pending && bdi_or_group_write_congested(bdi, page) &&
+ batch_run > 32 && fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
ioc = current->io_context;
@@ -1470,6 +1470,17 @@ retry:
n_iov = 0;
bytes_to_write = 0;
+ /*
+ * If the io group these pages will go into is congested, bail out.
+ */
+ if (wbc->nonblocking &&
+ bdi_write_congested_group(bdi, pvec.pages[0])) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ pagevec_release(&pvec);
+ break;
+ }
+
for (i = 0; i < nr_pages; i++) {
page = pvec.pages[i];
/*
@@ -179,7 +179,7 @@ static void ext2_preread_inode(struct inode *inode)
struct backing_dev_info *bdi;
bdi = inode->i_mapping->backing_dev_info;
- if (bdi_read_congested(bdi))
+ if (bdi_or_group_read_congested(bdi, NULL))
return;
if (bdi_write_congested(bdi))
return;
@@ -371,6 +371,18 @@ retry:
PAGECACHE_TAG_DIRTY,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
scanned = 1;
+
+ /*
+ * If the io group these pages belong to is congested, bail out.
+ */
+ if (wbc->nonblocking
+ && bdi_write_congested_group(bdi, pvec.pages[0])) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ pagevec_release(&pvec);
+ break;
+ }
+
ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
if (ret)
done = 1;
@@ -266,8 +266,9 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
{
struct bio *bio = wi->bio;
int err;
+ struct page *page = bio_iovec_idx(bio, 0)->bv_page;
- if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
+ if (wi->nbio > 0 && bdi_or_group_write_congested(wi->bdi, page)) {
wait_for_completion(&wi->bio_event);
wi->nbio--;
if (unlikely(atomic_read(&wi->err))) {
@@ -891,7 +891,7 @@ xfs_convert_page(
bdi = inode->i_mapping->backing_dev_info;
wbc->nr_to_write--;
- if (bdi_write_congested(bdi)) {
+ if (bdi_or_group_write_congested(bdi, page)) {
wbc->encountered_congestion = 1;
done = 1;
} else if (wbc->nr_to_write <= 0) {
@@ -714,7 +714,7 @@ xfs_buf_readahead(
struct backing_dev_info *bdi;
bdi = target->bt_mapping->backing_dev_info;
- if (bdi_read_congested(bdi))
+ if (bdi_or_group_read_congested(bdi, NULL))
return;
flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
@@ -29,7 +29,7 @@ enum bdi_state {
BDI_unused, /* Available bits start here */
};
-typedef int (congested_fn)(void *, int);
+typedef int (congested_fn)(void *, int, struct page *, int);
enum bdi_stat_item {
BDI_RECLAIMABLE,
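To make the widened congested_fn signature concrete, here is a minimal callback for a hypothetical stacking driver. Only bdi_congested_group() and bdi_congested() come from this patch; struct example_device and its lower_bdev member are invented for illustration, and the body just repeats the pattern used by the dm/md conversions above.

static int example_stacked_congested(void *congested_data, int bdi_bits,
				     struct page *page, int group)
{
	struct example_device *edev = congested_data;	/* hypothetical driver state */
	struct backing_dev_info *bdi =
		&bdev_get_queue(edev->lower_bdev)->backing_dev_info;

	/*
	 * When the caller asks about a particular io group (group != 0),
	 * forward the page so the lower queue can look that group up;
	 * otherwise fall back to the plain per-bdi congestion bits.
	 */
	return group ? bdi_congested_group(bdi, bdi_bits, page)
		     : bdi_congested(bdi, bdi_bits);
}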
@@ -209,7 +209,7 @@ int writeback_in_progress(struct backing_dev_info *bdi);
static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
{
if (bdi->congested_fn)
- return bdi->congested_fn(bdi->congested_data, bdi_bits);
+ return bdi->congested_fn(bdi->congested_data, bdi_bits, NULL, 0);
return (bdi->state & bdi_bits);
}
@@ -229,6 +229,63 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
(1 << BDI_async_congested));
}
+#ifdef CONFIG_GROUP_IOSCHED
+extern int bdi_congested_group(struct backing_dev_info *bdi, int bdi_bits,
+ struct page *page);
+
+extern int bdi_read_congested_group(struct backing_dev_info *bdi,
+ struct page *page);
+
+extern int bdi_or_group_read_congested(struct backing_dev_info *bdi,
+ struct page *page);
+
+extern int bdi_write_congested_group(struct backing_dev_info *bdi,
+ struct page *page);
+
+extern int bdi_or_group_write_congested(struct backing_dev_info *bdi,
+ struct page *page);
+
+extern int bdi_rw_congested_group(struct backing_dev_info *bdi,
+ struct page *page);
+#else /* CONFIG_GROUP_IOSCHED */
+static inline int bdi_congested_group(struct backing_dev_info *bdi,
+ int bdi_bits, struct page *page)
+{
+ return bdi_congested(bdi, bdi_bits);
+}
+
+static inline int bdi_read_congested_group(struct backing_dev_info *bdi,
+ struct page *page)
+{
+ return bdi_read_congested(bdi);
+}
+
+static inline int bdi_or_group_read_congested(struct backing_dev_info *bdi,
+ struct page *page)
+{
+ return bdi_read_congested(bdi);
+}
+
+static inline int bdi_write_congested_group(struct backing_dev_info *bdi,
+ struct page *page)
+{
+ return bdi_write_congested(bdi);
+}
+
+static inline int bdi_or_group_write_congested(struct backing_dev_info *bdi,
+ struct page *page)
+{
+ return bdi_write_congested(bdi);
+}
+
+static inline int bdi_rw_congested_group(struct backing_dev_info *bdi,
+ struct page *page)
+{
+ return bdi_rw_congested(bdi);
+}
+
+#endif /* CONFIG_GROUP_IOSCHED */
+
enum {
BLK_RW_ASYNC = 0,
BLK_RW_SYNC = 1,
@@ -237,7 +294,7 @@ enum {
void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
void set_bdi_congested(struct backing_dev_info *bdi, int sync);
long congestion_wait(int sync, long timeout);
-
+extern void congestion_wake_up(int sync);
static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
{
@@ -846,6 +846,11 @@ static inline void blk_set_queue_congested(struct request_queue *q, int sync)
set_bdi_congested(&q->backing_dev_info, sync);
}
+#ifdef CONFIG_GROUP_IOSCHED
+extern int blk_queue_io_group_congested(struct backing_dev_info *bdi,
+ int bdi_bits, struct page *page);
+#endif
+
extern void blk_start_queue(struct request_queue *q);
extern void blk_stop_queue(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
+#include "../block/elevator-fq.h"
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
@@ -283,16 +284,22 @@ static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+void congestion_wake_up(int sync)
+{
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
enum bdi_state bit;
- wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
clear_bit(bit, &bdi->state);
smp_mb__after_clear_bit();
- if (waitqueue_active(wqh))
- wake_up(wqh);
+ congestion_wake_up(sync);
}
EXPORT_SYMBOL(clear_bdi_congested);
@@ -327,3 +334,64 @@ long congestion_wait(int sync, long timeout)
}
EXPORT_SYMBOL(congestion_wait);
+/*
+ * With group IO scheduling, request descriptors are maintained per io group
+ * per queue, so the generic notion of whether a queue is congested is no
+ * longer accurate: the queue as a whole may not be congested while the io
+ * group the request will go into is.
+ *
+ * Hence, to get an accurate picture of the congestion level, query the
+ * congestion status of the io group on that queue. Callers pass in the page,
+ * which is used to determine the io group the page belongs to and hence
+ * that group's congestion status.
+ *
+ * If no page is passed, the io group is determined from the current task's
+ * context.
+ */
+#ifdef CONFIG_GROUP_IOSCHED
+int bdi_congested_group(struct backing_dev_info *bdi, int bdi_bits,
+ struct page *page)
+{
+ if (bdi->congested_fn)
+ return bdi->congested_fn(bdi->congested_data, bdi_bits, page, 1);
+
+ return blk_queue_io_group_congested(bdi, bdi_bits, page);
+}
+EXPORT_SYMBOL(bdi_congested_group);
+
+int bdi_read_congested_group(struct backing_dev_info *bdi, struct page *page)
+{
+ return bdi_congested_group(bdi, 1 << BDI_sync_congested, page);
+}
+EXPORT_SYMBOL(bdi_read_congested_group);
+
+/* Checks if either bdi or associated group is read congested */
+int bdi_or_group_read_congested(struct backing_dev_info *bdi,
+ struct page *page)
+{
+ return bdi_read_congested(bdi) || bdi_read_congested_group(bdi, page);
+}
+EXPORT_SYMBOL(bdi_or_group_read_congested);
+
+int bdi_write_congested_group(struct backing_dev_info *bdi, struct page *page)
+{
+ return bdi_congested_group(bdi, 1 << BDI_async_congested, page);
+}
+EXPORT_SYMBOL(bdi_write_congested_group);
+
+/* Checks if either bdi or associated group is write congested */
+int bdi_or_group_write_congested(struct backing_dev_info *bdi,
+ struct page *page)
+{
+ return bdi_write_congested(bdi) || bdi_write_congested_group(bdi, page);
+}
+EXPORT_SYMBOL(bdi_or_group_write_congested);
+
+int bdi_rw_congested_group(struct backing_dev_info *bdi, struct page *page)
+{
+ return bdi_congested_group(bdi, (1 << BDI_sync_congested) |
+ (1 << BDI_async_congested), page);
+}
+EXPORT_SYMBOL(bdi_rw_congested_group);
+
+#endif /* CONFIG_GROUP_IOSCHED */
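As a usage sketch (again not part of the patch), a filesystem writeback loop would typically combine its nonblocking check with the new group-aware helper roughly as below. The helper name fs_skip_congested_group() is hypothetical; the pattern is the same one the writepages hunks in this series use, with pvec.pages[0] as the representative page.

static int fs_skip_congested_group(struct writeback_control *wbc,
				   struct backing_dev_info *bdi,
				   struct page *page)
{
	if (!wbc->nonblocking)
		return 0;

	/*
	 * The page decides which io group the writeback IO will be charged
	 * to; skip this batch if that group has run out of request
	 * descriptors, even though the device as a whole may be uncongested.
	 */
	if (bdi_write_congested_group(bdi, page)) {
		wbc->encountered_congestion = 1;
		return 1;
	}
	return 0;
}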
@@ -985,6 +985,17 @@ retry:
if (nr_pages == 0)
break;
+ /*
+ * If the io group these pages will go into is congested, bail out.
+ */
+ if (wbc->nonblocking
+ && bdi_write_congested_group(bdi, pvec.pages[0])) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ pagevec_release(&pvec);
+ break;
+ }
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -542,7 +542,7 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (bdi_read_congested(mapping->backing_dev_info))
+ if (bdi_or_group_read_congested(mapping->backing_dev_info, NULL))
return;
/* do read-ahead */