@@ -54,6 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
@@ -88,6 +89,8 @@ static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
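+ /* pending punted bios pin @blkg (see blkg_async_bio_workfn), so none may remain here */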
+ WARN_ON(!bio_list_empty(&blkg->async_bios));
+
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
if (blkg->parent)
@@ -113,6 +116,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+ async_bio_work);
+ struct bio_list bios = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ /* as long as there are pending bios, @blkg can't go away */
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_merge(&bios, &blkg->async_bios);
+ bio_list_init(&blkg->async_bios);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
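+ /* submit outside the lock: submit_bio() may sleep, and new punts can queue meanwhile */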
+ while ((bio = bio_list_pop(&bios)))
+ submit_bio(bio);
+}
+
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
@@ -141,6 +161,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
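+ /* set up the punted-bio path, see __blkcg_punt_bio_submit() */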
+ spin_lock_init(&blkg->async_bio_lock);
+ bio_list_init(&blkg->async_bios);
+ INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;
for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1527,6 +1550,25 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+
+ /* consume the flag first so the reissue from blkg_async_bio_workfn() doesn't punt again */
+ bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+ /* never bounce for the root cgroup */
+ if (!blkg->parent)
+ return false;
+
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_add(&blkg->async_bios, bio);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
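+ /* queue_work() is a no-op while the work item is still pending, so repeated punts coalesce */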
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+ return true;
+}
+
/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
@@ -1727,5 +1769,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}
+static int __init blkcg_init(void)
+{
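+ /* punted bios may be part of memory-reclaim writeback, hence WQ_MEM_RECLAIM */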
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!blkcg_punt_bio_wq)
+ return -ENOMEM;
+ return 0;
+}
+subsys_initcall(blkcg_init);
+
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
@@ -1127,6 +1127,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*/
blk_qc_t submit_bio(struct bio *bio)
{
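+ /*
+ * REQ_CGROUP_PUNT bios are queued on a per-blkcg work item and
+ * reissued from there, so report no queue cookie for them.
+ */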
+ if (blkcg_punt_bio_submit(bio))
+ return BLK_QC_T_NONE;
+
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
@@ -132,13 +132,17 @@ struct blkcg_gq {
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
- struct rcu_head rcu_head;
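+ /* bios punted via REQ_CGROUP_PUNT, submitted from async_bio_work */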
+ spinlock_t async_bio_lock;
+ struct bio_list async_bios;
+ struct work_struct async_bio_work;
atomic_t use_delay;
atomic64_t delay_nsec;
atomic64_t delay_start;
u64 last_delay;
int last_use;
+
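+ /* only used by the RCU-deferred release path */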
+ struct rcu_head rcu_head;
};
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
struct bio *bio) { return false; }
#endif
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
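+/* cheap inline check: only take the out-of-line call when REQ_CGROUP_PUNT is set */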
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+ if (bio->bi_opf & REQ_CGROUP_PUNT)
+ return __blkcg_punt_bio_submit(bio);
+ else
+ return false;
+}
static inline void blkcg_bio_issue_init(struct bio *bio)
{
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline bool blkcg_bio_issue_check(struct request_queue *q,
struct bio *bio) { return true; }
@@ -311,6 +311,14 @@ enum req_flag_bits {
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
__REQ_NOWAIT, /* Don't wait if request will block */
+ /*
+ * When a shared kthread needs to issue a bio for a cgroup, doing
+ * so synchronously can lead to priority inversions as the kthread
+ * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes
+ * submit_bio() punt the actual issuing to a dedicated per-blkcg
+ * work item to avoid such priority inversions.
+ */
+ __REQ_CGROUP_PUNT,
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)
+
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
#define REQ_HIPRI (1ULL << __REQ_HIPRI)
@@ -78,6 +78,8 @@ struct writeback_control {
*/
unsigned no_cgroup_owner:1;
+ unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *wb; /* wb this writeback is issued under */
struct inode *inode; /* inode being written out */
@@ -94,12 +96,17 @@ struct writeback_control {
static inline int wbc_to_write_flags(struct writeback_control *wbc)
{
+ int flags = 0;
+
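+ /* punting combines with the sync/background hints computed below */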
+ if (wbc->punt_to_cgroup)
+ flags = REQ_CGROUP_PUNT;
+
if (wbc->sync_mode == WB_SYNC_ALL)
- return REQ_SYNC;
+ flags |= REQ_SYNC;
else if (wbc->for_kupdate || wbc->for_background)
- return REQ_BACKGROUND;
+ flags |= REQ_BACKGROUND;
- return 0;
+ return flags;
}
static inline struct cgroup_subsys_state *