@@ -340,54 +340,6 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
-static void bio_alloc_rescue(struct work_struct *work)
-{
- struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
- struct bio *bio;
-
- while (1) {
- spin_lock(&bs->rescue_lock);
- bio = bio_list_pop(&bs->rescue_list);
- spin_unlock(&bs->rescue_lock);
-
- if (!bio)
- break;
-
- generic_make_request(bio);
- }
-}
-
-static void punt_bios_to_rescuer(struct bio_set *bs)
-{
- struct bio_list punt, nopunt;
- struct bio *bio;
-
- /*
- * In order to guarantee forward progress we must punt only bios that
- * were allocated from this bio_set; otherwise, if there was a bio on
- * there for a stacking driver higher up in the stack, processing it
- * could require allocating bios from this bio_set, and doing that from
- * our own rescuer would be bad.
- *
- * Since bio lists are singly linked, pop them all instead of trying to
- * remove from the middle of the list:
- */
-
- bio_list_init(&punt);
- bio_list_init(&nopunt);
-
- while ((bio = bio_list_pop(current->bio_list)))
- bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
-
- *current->bio_list = nopunt;
-
- spin_lock(&bs->rescue_lock);
- bio_list_merge(&bs->rescue_list, &punt);
- spin_unlock(&bs->rescue_lock);
-
- queue_work(bs->rescue_workqueue, &bs->rescue_work);
-}
-
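
The two functions removed above are the old per-bio_set rescuer: punt_bios_to_rescuer() drained current->bio_list and re-queued only the bios allocated from the bio_set in question, because bio_list is singly linked and only supports popping from the head. A minimal userspace sketch of that pop-and-partition pattern, using plain C stand-ins for struct bio and struct bio_list rather than the kernel types (ordering simplified; the kernel's bio_list_add() appends at the tail):

    #include <stdio.h>
    #include <stdlib.h>

    struct node {                 /* stand-in for struct bio */
        int pool_id;              /* stand-in for bio->bi_pool */
        struct node *next;
    };

    struct list {                 /* stand-in for struct bio_list: pop from head only */
        struct node *head;
    };

    static void list_push(struct list *l, struct node *n)
    {
        n->next = l->head;
        l->head = n;
    }

    static struct node *list_pop(struct list *l)
    {
        struct node *n = l->head;

        if (n)
            l->head = n->next;
        return n;
    }

    /*
     * Drain @cur completely: entries from @pool_id go to @punt (for the
     * rescuer), everything else is kept on @cur.
     */
    static void punt_from_pool(struct list *cur, struct list *punt, int pool_id)
    {
        struct list nopunt = { NULL };
        struct node *n;

        while ((n = list_pop(cur)))
            list_push(n->pool_id == pool_id ? punt : &nopunt, n);

        *cur = nopunt;
    }

    int main(void)
    {
        struct list cur = { NULL }, rescue = { NULL };
        int pools[] = { 1, 2, 1, 3 };

        for (int i = 0; i < 4; i++) {
            struct node *n = malloc(sizeof(*n));

            n->pool_id = pools[i];
            list_push(&cur, n);
        }

        punt_from_pool(&cur, &rescue, 1);

        for (struct node *n = rescue.head; n; n = n->next)
            printf("punted a node from pool %d\n", n->pool_id);
        return 0;
    }
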
/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
@@ -425,17 +377,20 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
*/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
- gfp_t saved_gfp = gfp_mask;
unsigned front_pad;
unsigned inline_vecs;
struct bio_vec *bvl = NULL;
struct bio *bio;
void *p;
- if (!bs) {
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
+ WARN(current->bio_list &&
+ !current->bio_list->q->rescue_workqueue,
+ "allocating bio beneath generic_make_request() without rescuer");
+ if (nr_iovecs > UIO_MAXIOV)
+ return NULL;
+
+ if (!bs) {
p = kmalloc(sizeof(struct bio) +
nr_iovecs * sizeof(struct bio_vec),
gfp_mask);
@@ -445,37 +400,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
/* should not use nobvec bioset for nr_iovecs > 0 */
if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0))
return NULL;
- /*
- * generic_make_request() converts recursion to iteration; this
- * means if we're running beneath it, any bios we allocate and
- * submit will not be submitted (and thus freed) until after we
- * return.
- *
- * This exposes us to a potential deadlock if we allocate
- * multiple bios from the same bio_set() while running
- * underneath generic_make_request(). If we were to allocate
- * multiple bios (say a stacking block driver that was splitting
- * bios), we would deadlock if we exhausted the mempool's
- * reserve.
- *
- * We solve this, and guarantee forward progress, with a rescuer
- * workqueue per bio_set. If we go to allocate and there are
- * bios on current->bio_list, we first try the allocation
- * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
- * bios we would be blocking to the rescuer workqueue before
- * we retry with the original gfp_flags.
- */
-
- if (current->bio_list && !bio_list_empty(current->bio_list))
- gfp_mask &= ~__GFP_DIRECT_RECLAIM;
p = mempool_alloc(bs->bio_pool, gfp_mask);
- if (!p && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- p = mempool_alloc(bs->bio_pool, gfp_mask);
- }
-
front_pad = bs->front_pad;
inline_vecs = BIO_INLINE_VECS;
}
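
The hunk above also drops the old two-step allocation: when bios were already queued on current->bio_list, bio_alloc_bioset() first tried mempool_alloc() with __GFP_DIRECT_RECLAIM masked off, and only after punting those bios did it retry with the caller's original gfp_mask. A rough, runnable model of that try-nonblocking/unblock/retry shape in plain C (the pool and punt helpers below are illustrative stand-ins, not kernel API):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Stand-ins: a pool that only succeeds when blocking is allowed (to force
     * the fallback path), and a no-op "unblock" step standing in for the old
     * punt_bios_to_rescuer().
     */
    static void *pool_alloc(bool may_block)
    {
        return may_block ? malloc(64) : NULL;
    }

    static void unblock_pending_work(void)
    {
        printf("punting queued work to a rescuer thread\n");
    }

    /*
     * Shape of the allocation path this patch removes: if work is queued
     * behind us, first try without blocking; on failure, unblock that work,
     * then retry with blocking allowed.
     */
    static void *alloc_with_rescue(bool work_queued_behind_us)
    {
        void *p = pool_alloc(!work_queued_behind_us);

        if (!p && work_queued_behind_us) {
            unblock_pending_work();
            p = pool_alloc(true);
        }
        return p;
    }

    int main(void)
    {
        void *p = alloc_with_rescue(true);

        printf("allocation %s\n", p ? "succeeded after punt" : "failed");
        free(p);
        return 0;
    }
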
@@ -490,12 +416,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
unsigned long idx = 0;
bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
- if (!bvl && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
- }
-
if (unlikely(!bvl))
goto err_free;
@@ -1892,9 +1812,6 @@ mempool_t *biovec_create_pool(int pool_entries)
void bioset_free(struct bio_set *bs)
{
- if (bs->rescue_workqueue)
- destroy_workqueue(bs->rescue_workqueue);
-
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
@@ -1921,10 +1838,6 @@ static struct bio_set *__bioset_create(unsigned int pool_size,
bs->front_pad = front_pad;
- spin_lock_init(&bs->rescue_lock);
- bio_list_init(&bs->rescue_list);
- INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
-
bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
if (!bs->bio_slab) {
kfree(bs);
@@ -1941,10 +1854,6 @@ static struct bio_set *__bioset_create(unsigned int pool_size,
goto bad;
}
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
-
return bs;
bad:
bioset_free(bs);
@@ -49,6 +49,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
DEFINE_IDA(blk_queue_ida);
+static void bio_rescue_work(struct work_struct *);
+
/*
* For the allocated request tables
*/
@@ -643,9 +645,9 @@ void blk_exit_rl(struct request_list *rl)
mempool_destroy(rl->rq_pool);
}
-struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
+struct request_queue *blk_alloc_queue(gfp_t gfp_mask, int flags)
{
- return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
+ return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, flags);
}
EXPORT_SYMBOL(blk_alloc_queue);
@@ -690,7 +692,7 @@ static void blk_rq_timed_out_timer(unsigned long data)
kblockd_schedule_work(&q->timeout_work);
}
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, int flags)
{
struct request_queue *q;
int err;
@@ -760,11 +762,23 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
goto fail_bdi;
+ spin_lock_init(&q->rescue_lock);
+ bio_list_init(&q->rescue_list);
+ INIT_WORK(&q->rescue_work, bio_rescue_work);
+
+ if (!(flags & BLK_QUEUE_NO_RESCUER)) {
+ q->rescue_workqueue = alloc_workqueue("rescue", WQ_MEM_RECLAIM, 0);
+ if (!q->rescue_workqueue)
+ goto fail_ref;
+ }
+
if (blkcg_init_queue(q))
- goto fail_ref;
+ goto fail_rescue;
return q;
+fail_rescue:
+ if (q->rescue_workqueue)
+ destroy_workqueue(q->rescue_workqueue);
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
fail_bdi:
@@ -823,7 +837,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
struct request_queue *uninit_q, *q;
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id,
+ BLK_QUEUE_NO_RESCUER);
if (!uninit_q)
return NULL;
@@ -1977,7 +1992,7 @@ generic_make_request_checks(struct bio *bio)
*/
blk_qc_t generic_make_request(struct bio *bio)
{
- struct bio_list bio_list_on_stack;
+ struct bio_plug_list bio_list_on_stack;
blk_qc_t ret = BLK_QC_T_NONE;
if (!generic_make_request_checks(bio))
@@ -1994,7 +2009,9 @@ blk_qc_t generic_make_request(struct bio *bio)
* should be added at the tail
*/
if (current->bio_list) {
- bio_list_add(current->bio_list, bio);
+ WARN(!current->bio_list->q->rescue_workqueue,
+ "submitting bio beneath generic_make_request() without rescuer");
+ bio_list_add(&current->bio_list->bios, bio);
goto out;
}
@@ -2013,19 +2030,23 @@ blk_qc_t generic_make_request(struct bio *bio)
* bio_list, and call into ->make_request() again.
*/
BUG_ON(bio->bi_next);
- bio_list_init(&bio_list_on_stack);
+ bio_list_init(&bio_list_on_stack.bios);
current->bio_list = &bio_list_on_stack;
+
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ current->bio_list->q = q;
+
if (likely(blk_queue_enter(q, false) == 0)) {
ret = q->make_request_fn(q, bio);
blk_queue_exit(q);
- bio = bio_list_pop(current->bio_list);
+ bio = bio_list_pop(&current->bio_list->bios);
} else {
- struct bio *bio_next = bio_list_pop(current->bio_list);
+ struct bio *bio_next =
+ bio_list_pop(&current->bio_list->bios);
bio_io_error(bio);
bio = bio_next;
@@ -2038,6 +2059,34 @@ blk_qc_t generic_make_request(struct bio *bio)
}
EXPORT_SYMBOL(generic_make_request);
+static void bio_rescue_work(struct work_struct *work)
+{
+ struct request_queue *q =
+ container_of(work, struct request_queue, rescue_work);
+ struct bio *bio;
+
+ while (1) {
+ spin_lock(&q->rescue_lock);
+ bio = bio_list_pop(&q->rescue_list);
+ spin_unlock(&q->rescue_lock);
+
+ if (!bio)
+ break;
+
+ generic_make_request(bio);
+ }
+}
+
+void blk_punt_blocked_bios(struct bio_plug_list *list)
+{
+ spin_lock(&list->q->rescue_lock);
+ bio_list_merge(&list->q->rescue_list, &list->bios);
+ bio_list_init(&list->bios);
+ spin_unlock(&list->q->rescue_lock);
+
+ queue_work(list->q->rescue_workqueue, &list->q->rescue_work);
+}
+
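
The two functions added above are the replacement rescuer: blocked bios now accumulate on the queue's rescue_list, and bio_rescue_work() resubmits them from workqueue context. The plugging they rescue comes from generic_make_request() turning recursion into iteration, which is easy to model outside the kernel: while one submission is active, nested submissions are appended to a list owned by the outermost call and processed in a loop. A minimal sketch in plain C (list order simplified; the kernel's bio_list appends at the tail):

    #include <stdio.h>
    #include <stdlib.h>

    struct item {
        int depth;
        struct item *next;
    };

    static void submit(struct item *it);

    /* Per-"task" pointer standing in for current->bio_list. */
    static struct item **active_list;

    static struct item *new_item(int depth)
    {
        struct item *it = calloc(1, sizeof(*it));

        it->depth = depth;
        return it;
    }

    /*
     * Stand-in for a stacking driver's ->make_request_fn(): each item splits
     * into one child until depth 3 and "recursively" submits the child.
     */
    static void process(struct item *it)
    {
        printf("processing item at depth %d\n", it->depth);
        if (it->depth < 3)
            submit(new_item(it->depth + 1));
        free(it);
    }

    /* Stand-in for generic_make_request(): recursion becomes iteration. */
    static void submit(struct item *it)
    {
        struct item *list = NULL;

        if (active_list) {
            /* Nested call: queue on the outermost caller's list. */
            it->next = *active_list;
            *active_list = it;
            return;
        }

        active_list = &list;    /* outermost call owns the on-stack list */
        do {
            process(it);
            it = list;          /* pop the next queued item, if any */
            if (it)
                list = it->next;
        } while (it);
        active_list = NULL;
    }

    int main(void)
    {
        submit(new_item(0));
        return 0;
    }
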
/**
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
@@ -2043,7 +2043,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
struct request_queue *uninit_q, *q;
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+ uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node,
+ BLK_QUEUE_NO_RESCUER);
if (!uninit_q)
return ERR_PTR(-ENOMEM);
@@ -821,6 +821,8 @@ static void blk_release_queue(struct kobject *kobj)
blk_trace_shutdown(q);
+ if (q->rescue_workqueue)
+ destroy_workqueue(q->rescue_workqueue);
if (q->bio_split)
bioset_free(q->bio_split);
@@ -449,7 +449,7 @@ static struct brd_device *brd_alloc(int i)
spin_lock_init(&brd->brd_lock);
INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
- brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
+ brd->brd_queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!brd->brd_queue)
goto out_free_dev;
@@ -2810,7 +2810,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_init_set_defaults(device);
- q = blk_alloc_queue(GFP_KERNEL);
+ q = blk_alloc_queue(GFP_KERNEL, 0);
if (!q)
goto out_no_q;
device->rq_queue = q;
@@ -734,7 +734,8 @@ static int null_add_dev(void)
goto out_cleanup_tags;
}
} else if (queue_mode == NULL_Q_BIO) {
- nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+ nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node,
+ BLK_QUEUE_NO_RESCUER);
if (!nullb->q) {
rv = -ENOMEM;
goto out_cleanup_queues;
@@ -2737,7 +2737,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
strcpy(disk->disk_name, pd->name);
disk->devnode = pktcdvd_devnode;
disk->private_data = pd;
- disk->queue = blk_alloc_queue(GFP_KERNEL);
+ disk->queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!disk->queue)
goto out_mem2;
@@ -746,7 +746,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
ps3vram_cache_init(dev);
ps3vram_proc_init(dev);
- queue = blk_alloc_queue(GFP_KERNEL);
+ queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!queue) {
dev_err(&dev->core, "blk_alloc_queue failed\n");
error = -ENOMEM;
@@ -266,7 +266,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
return -ENOMEM;
}
- card->queue = blk_alloc_queue(GFP_KERNEL);
+ card->queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!card->queue) {
dev_err(CARD_TO_DEV(card), "Failed queue alloc\n");
unregister_blkdev(card->major, DRIVER_NAME);
@@ -890,7 +890,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
card->bio = NULL;
card->biotail = &card->bio;
- card->queue = blk_alloc_queue(GFP_KERNEL);
+ card->queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!card->queue)
goto failed_alloc;
@@ -1245,7 +1245,7 @@ static int zram_add(void)
init_rwsem(&zram->init_lock);
- queue = blk_alloc_queue(GFP_KERNEL);
+ queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!queue) {
pr_err("Error allocating disk queue for device %d\n",
device_id);
@@ -233,7 +233,7 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
goto err_reserve;
}
- tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
+ tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, 0);
if (!tqueue)
goto err_dev;
blk_queue_make_request(tqueue, tt->make_rq);
@@ -800,7 +800,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
d->disk->fops = &bcache_ops;
d->disk->private_data = d;
- q = blk_alloc_queue(GFP_KERNEL);
+ q = blk_alloc_queue(GFP_KERNEL, 0);
if (!q)
return -ENOMEM;
@@ -1490,7 +1490,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
+ md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, 0);
if (!md->queue)
goto bad;
@@ -5061,7 +5061,7 @@ static int md_alloc(dev_t dev, char *name)
}
error = -ENOMEM;
- mddev->queue = blk_alloc_queue(GFP_KERNEL);
+ mddev->queue = blk_alloc_queue(GFP_KERNEL, 0);
if (!mddev->queue)
goto abort;
mddev->queue->queuedata = mddev;
@@ -264,7 +264,7 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk));
available_disk_size = internal_nlba * nsblk_sector_size(nsblk);
- q = blk_alloc_queue(GFP_KERNEL);
+ q = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!q)
return -ENOMEM;
if (devm_add_action_or_reset(dev, nd_blk_release_queue, q))
@@ -1232,7 +1232,7 @@ static int btt_blk_init(struct btt *btt)
struct nd_namespace_common *ndns = nd_btt->ndns;
/* create a new disk and request queue for btt */
- btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
+ btt->btt_queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!btt->btt_queue)
return -ENOMEM;
@@ -280,7 +280,8 @@ static int pmem_attach_disk(struct device *dev,
return -EBUSY;
}
- q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
+ q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev),
+ BLK_QUEUE_NO_RESCUER);
if (!q)
return -ENOMEM;
@@ -612,7 +612,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
}
dev_info->gd->major = dcssblk_major;
dev_info->gd->fops = &dcssblk_devops;
- dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL);
+ dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
dev_info->gd->queue = dev_info->dcssblk_queue;
dev_info->gd->private_data = dev_info;
blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
@@ -342,7 +342,7 @@ static int __init xpram_setup_blkdev(void)
xpram_disks[i] = alloc_disk(1);
if (!xpram_disks[i])
goto out;
- xpram_queues[i] = blk_alloc_queue(GFP_KERNEL);
+ xpram_queues[i] = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
if (!xpram_queues[i]) {
put_disk(xpram_disks[i]);
goto out;
@@ -656,6 +656,13 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
return bio;
}
+struct bio_plug_list {
+ struct bio_list bios;
+ struct request_queue *q;
+};
+
+void blk_punt_blocked_bios(struct bio_plug_list *);
+
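
bio_plug_list simply pairs the plugged bio list with the request_queue that the outermost generic_make_request() is iterating over; that is what lets bio_alloc_bioset(), sched_submit_work(), and blk_punt_blocked_bios() reach the right rescuer through current->bio_list alone. The check that sched_submit_work() performs before a task blocks can be modelled in a few lines of plain C (illustrative stand-in types, not the kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct fake_queue {             /* stand-in for struct request_queue */
        const char *name;
        bool has_rescuer;           /* stand-in for q->rescue_workqueue */
    };

    struct fake_list {              /* stand-in for struct bio_list */
        int pending;
    };

    struct fake_plug_list {         /* stand-in for struct bio_plug_list */
        struct fake_list bios;
        struct fake_queue *q;
    };

    /* Per-task pointer standing in for current->bio_list. */
    static struct fake_plug_list *task_bio_list;

    /* The condition sched_submit_work() checks before a task blocks. */
    static bool should_punt(void)
    {
        return task_bio_list &&
               task_bio_list->bios.pending &&
               task_bio_list->q->has_rescuer;
    }

    int main(void)
    {
        struct fake_queue q = { "dm-0", true };
        struct fake_plug_list on_stack = { { 2 }, &q };

        task_bio_list = &on_stack;
        printf("punt to %s's rescuer? %s\n", q.name,
               should_punt() ? "yes" : "no");
        return 0;
    }
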
/*
* Increment chain count for the bio. Make sure the CHAIN flag update
* is visible before the raised count.
@@ -685,15 +692,6 @@ struct bio_set {
mempool_t *bio_integrity_pool;
mempool_t *bvec_integrity_pool;
#endif
-
- /*
- * Deadlock avoidance for stacking block drivers: see comments in
- * bio_alloc_bioset() for details
- */
- spinlock_t rescue_lock;
- struct bio_list rescue_list;
- struct work_struct rescue_work;
- struct workqueue_struct *rescue_workqueue;
};
struct biovec_slab {
@@ -570,6 +570,16 @@ struct request_queue {
struct bio_set *bio_split;
bool mq_sysfs_init_done;
+
+ /*
+ * Deadlock avoidance for stacking block drivers: generic_make_request()
+ * converts recursion to iteration to avoid stack overflow, so bios
+ * queued on current->bio_list can be blocked behind the bio being
+ * processed; blk_punt_blocked_bios() hands them to this rescuer
+ * workqueue to guarantee forward progress:
+ */
+ spinlock_t rescue_lock;
+ struct bio_list rescue_list;
+ struct work_struct rescue_work;
+ struct workqueue_struct *rescue_workqueue;
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
@@ -1192,9 +1202,11 @@ extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatte
extern void blk_dump_rq_flags(struct request *, char *);
extern long nr_blockdev_pages(void);
+#define BLK_QUEUE_NO_RESCUER 1
+
bool __must_check blk_get_queue(struct request_queue *);
-struct request_queue *blk_alloc_queue(gfp_t);
-struct request_queue *blk_alloc_queue_node(gfp_t, int);
+struct request_queue *blk_alloc_queue(gfp_t, int);
+struct request_queue *blk_alloc_queue_node(gfp_t, int, int);
extern void blk_put_queue(struct request_queue *);
extern void blk_set_queue_dying(struct request_queue *);
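
With BLK_QUEUE_NO_RESCUER defined here, the call sites updated in this patch fall into two camps: queues whose make_request_fn may itself allocate and submit bios from a bio_set (dm, md, bcache, drbd, the lightnvm target) pass 0 and get a rescuer workqueue, while request-based queues and simple bio-based drivers pass BLK_QUEUE_NO_RESCUER and skip it. Two call sites quoted from the hunks above:

    /* stacking driver (dm): may allocate bios beneath generic_make_request() */
    md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, 0);

    /* simple bio-based driver (brd): never does, so no rescuer is needed */
    brd->brd_queue = blk_alloc_queue(GFP_KERNEL, BLK_QUEUE_NO_RESCUER);
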
@@ -1797,7 +1797,7 @@ struct task_struct {
void *journal_info;
/* stacked block device info */
- struct bio_list *bio_list;
+ struct bio_plug_list *bio_list;
#ifdef CONFIG_BLOCK
/* stack plugging */
@@ -3440,6 +3440,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
{
if (!tsk->state || tsk_is_pi_blocked(tsk))
return;
+
+ if (tsk->bio_list &&
+ !bio_list_empty(&tsk->bio_list->bios) &&
+ tsk->bio_list->q->rescue_workqueue)
+ blk_punt_blocked_bios(tsk->bio_list);
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.