diff mbox series

[v2,2/2] blk-cgroup: Optimize blkcg_rstat_flush()

Message ID 20220601165324.60892-2-longman@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v2,1/2] blk-cgroup: Correctly free percpu iostat_cpu in blkg on error exit | expand

Commit Message

Waiman Long June 1, 2022, 4:53 p.m. UTC
For a system with many CPUs and block devices, the time to do
blkcg_rstat_flush() from cgroup_rstat_flush() can be rather long. It
can be especially problematic as interrupt is disabled during the flush.
It was reported that it might take seconds to complete in some extreme
cases leading to hard lockup messages.

As it is likely that not all the percpu blkg_iostat_set's has been
updated since the last flush, those stale blkg_iostat_set's don't need
to be flushed in this case. This patch optimizes blkcg_rstat_flush()
by keeping a lockless list of recently updated blkg_iostat_set's in a
newly added percpu blkcg->lhead pointer.

The blkg_iostat_set is added to the lockless list on the update side
in blk_cgroup_bio_start(). It is removed from the lockless list when
flushed in blkcg_rstat_flush(). Due to racing, it is possible that
blk_iostat_set's in the lockless list may have no new iostat's to
be flushed.

A blkg_iostat_set can determine if it is in a lockless list by checking
the content of its lnode.next pointer which will be non-NULL when in
a lockless list. This requires the presence of a special llist_last
sentinel to be put at the end of the lockless list.

When booting up an instrumented test kernel with this patch on a
2-socket 96-thread system with cgroup v2, out of the 2051 calls to
cgroup_rstat_flush() after bootup, 1788 of the calls were exited
immediately because of empty lockless list. After an all-cpu kernel
build, the ratio became 6295424/6340513. That was more than 99%.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 block/blk-cgroup.c | 74 ++++++++++++++++++++++++++++++++++++++++------
 block/blk-cgroup.h |  9 ++++++
 2 files changed, 74 insertions(+), 9 deletions(-)

Comments

Tejun Heo June 1, 2022, 5:48 p.m. UTC | #1
Hello,

On Wed, Jun 01, 2022 at 12:53:24PM -0400, Waiman Long wrote:
> +static struct llist_node llist_last;	/* Last sentinel node of llist */

Can you please add comment explaining why we need the special sentinel and
empty helper?

> +static inline bool blkcg_llist_empty(struct llist_head *lhead)
> +{
> +	return lhead->first == &llist_last;
> +}
> +
> +static inline void init_blkcg_llists(struct blkcg *blkcg)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		per_cpu_ptr(blkcg->lhead, cpu)->first = &llist_last;
> +}
> +
> +static inline struct llist_node *
> +fetch_delete_blkcg_llist(struct llist_head *lhead)
> +{
> +	return xchg(&lhead->first, &llist_last);
> +}
> +
> +/*
> + * The retrieved blkg_iostat_set is immediately marked as not in the
> + * lockless list by clearing its node->next pointer. It could be put
> + * back into the list by a parallel update before the iostat's are
> + * finally flushed. So being in the list doesn't always mean it has new
> + * iostat's to be flushed.
> + */

Isn't the above true for any sort of mechanism which tracking pending state?
You gotta clear the pending state before consuming so that you don't miss
the events which happen while data is being consumed.

> +#define blkcg_llist_for_each_entry_safe(pos, node, nxt)			\
> +	for (; (node != &llist_last) &&					\
> +	       (pos = llist_entry(node, struct blkg_iostat_set, lnode),	\
> +		nxt = node->next, node->next = NULL, true);		\
> +		node = nxt)
> +
>  /**
>   * blkcg_css - find the current css
>   *
...
> @@ -852,17 +888,26 @@ static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
>  static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  {
>  	struct blkcg *blkcg = css_to_blkcg(css);
> -	struct blkcg_gq *blkg;
> +	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> +	struct llist_node *lnode, *lnext;
> +	struct blkg_iostat_set *bisc;
>  
>  	/* Root-level stats are sourced from system-wide IO stats */
>  	if (!cgroup_parent(css->cgroup))
>  		return;
>  
> -	rcu_read_lock();
> +	if (blkcg_llist_empty(lhead))
> +		return;
>  
> -	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
> +	lnode = fetch_delete_blkcg_llist(lhead);
> +
> +	/*
> +	 * No RCU protection is needed as it is assumed that blkg_iostat_set's
> +	 * in the percpu lockless list won't go away until the flush is done.
> +	 */

Can you please elaborate on why this is safe?

> +	blkcg_llist_for_each_entry_safe(bisc, lnode, lnext) {
> +		struct blkcg_gq *blkg = bisc->blkg;
>  		struct blkcg_gq *parent = blkg->parent;
> -		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
>  		struct blkg_iostat cur, delta;
>  		unsigned long flags;
>  		unsigned int seq;

Overall, looks fantastic to me. Thanks a lot for working on it.
Waiman Long June 1, 2022, 6:15 p.m. UTC | #2
On 6/1/22 13:48, Tejun Heo wrote:
> Hello,
>
> On Wed, Jun 01, 2022 at 12:53:24PM -0400, Waiman Long wrote:
>> +static struct llist_node llist_last;	/* Last sentinel node of llist */
> Can you please add comment explaining why we need the special sentinel and
> empty helper?

It was mentioned in the commit log, but I will add a comment to repeat 
that. It is because lnode.next is used as a flag to indicate its 
presence in the lockless list. By default, the first one that go into 
the lockless list will have a NULL value in its next pointer. So I have 
to put a sentinel node that to make sure that the next pointer is always 
non-NULL.


>
>> +static inline bool blkcg_llist_empty(struct llist_head *lhead)
>> +{
>> +	return lhead->first == &llist_last;
>> +}
>> +
>> +static inline void init_blkcg_llists(struct blkcg *blkcg)
>> +{
>> +	int cpu;
>> +
>> +	for_each_possible_cpu(cpu)
>> +		per_cpu_ptr(blkcg->lhead, cpu)->first = &llist_last;
>> +}
>> +
>> +static inline struct llist_node *
>> +fetch_delete_blkcg_llist(struct llist_head *lhead)
>> +{
>> +	return xchg(&lhead->first, &llist_last);
>> +}
>> +
>> +/*
>> + * The retrieved blkg_iostat_set is immediately marked as not in the
>> + * lockless list by clearing its node->next pointer. It could be put
>> + * back into the list by a parallel update before the iostat's are
>> + * finally flushed. So being in the list doesn't always mean it has new
>> + * iostat's to be flushed.
>> + */
> Isn't the above true for any sort of mechanism which tracking pending state?
> You gotta clear the pending state before consuming so that you don't miss
> the events which happen while data is being consumed.

That is true. I was about thinking what race conditions can happen with 
these changes. The above comment is for the race that can happen which 
is benign. I am remove it if you think it is necessary.

>
>> +#define blkcg_llist_for_each_entry_safe(pos, node, nxt)			\
>> +	for (; (node != &llist_last) &&					\
>> +	       (pos = llist_entry(node, struct blkg_iostat_set, lnode),	\
>> +		nxt = node->next, node->next = NULL, true);		\
>> +		node = nxt)
>> +
>>   /**
>>    * blkcg_css - find the current css
>>    *
> ...
>> @@ -852,17 +888,26 @@ static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
>>   static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>>   {
>>   	struct blkcg *blkcg = css_to_blkcg(css);
>> -	struct blkcg_gq *blkg;
>> +	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
>> +	struct llist_node *lnode, *lnext;
>> +	struct blkg_iostat_set *bisc;
>>   
>>   	/* Root-level stats are sourced from system-wide IO stats */
>>   	if (!cgroup_parent(css->cgroup))
>>   		return;
>>   
>> -	rcu_read_lock();
>> +	if (blkcg_llist_empty(lhead))
>> +		return;
>>   
>> -	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
>> +	lnode = fetch_delete_blkcg_llist(lhead);
>> +
>> +	/*
>> +	 * No RCU protection is needed as it is assumed that blkg_iostat_set's
>> +	 * in the percpu lockless list won't go away until the flush is done.
>> +	 */
> Can you please elaborate on why this is safe?

You are right that the comment is probably not quite right. I will put 
the rcu_read_lock/unlock() back in. However, we don't have a rcu 
iterator for the lockless list. On the other hand, blkcg_rstat_flush() 
is now called with irq disabled. So rcu_read_lock() is not technically 
needed.

Will send out a v3 soon.

Thanks,
Longman

>
>> +	blkcg_llist_for_each_entry_safe(bisc, lnode, lnext) {
>> +		struct blkcg_gq *blkg = bisc->blkg;
>>   		struct blkcg_gq *parent = blkg->parent;
>> -		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
>>   		struct blkg_iostat cur, delta;
>>   		unsigned long flags;
>>   		unsigned int seq;
> Overall, looks fantastic to me. Thanks a lot for working on it.
>
Tejun Heo June 1, 2022, 6:35 p.m. UTC | #3
Hello,

On Wed, Jun 01, 2022 at 02:15:46PM -0400, Waiman Long wrote:
> It was mentioned in the commit log, but I will add a comment to repeat that.
> It is because lnode.next is used as a flag to indicate its presence in the
> lockless list. By default, the first one that go into the lockless list will
> have a NULL value in its next pointer. So I have to put a sentinel node that
> to make sure that the next pointer is always non-NULL.

Oh yeah, I noticed that in the commit log, but I think it really warrants an
inline comment.

> > > + * The retrieved blkg_iostat_set is immediately marked as not in the
> > > + * lockless list by clearing its node->next pointer. It could be put
> > > + * back into the list by a parallel update before the iostat's are
> > > + * finally flushed. So being in the list doesn't always mean it has new
> > > + * iostat's to be flushed.
> > > + */
> > Isn't the above true for any sort of mechanism which tracking pending state?
> > You gotta clear the pending state before consuming so that you don't miss
> > the events which happen while data is being consumed.
> 
> That is true. I was about thinking what race conditions can happen with
> these changes. The above comment is for the race that can happen which is
> benign. I am remove it if you think it is necessary.

I don't have too strong an opinion. It just felt a bit disproportionate for
it to be sticking out like that. Maybe toning it down a little bit would
help?

> > > +	/*
> > > +	 * No RCU protection is needed as it is assumed that blkg_iostat_set's
> > > +	 * in the percpu lockless list won't go away until the flush is done.
> > > +	 */
> > Can you please elaborate on why this is safe?
> 
> You are right that the comment is probably not quite right. I will put the
> rcu_read_lock/unlock() back in. However, we don't have a rcu iterator for
> the lockless list. On the other hand, blkcg_rstat_flush() is now called with
> irq disabled. So rcu_read_lock() is not technically needed.

Maybe we just need an rcu_read_lock_held() - does that cover irq being
disabled? I'm not sure what the rules are since the different rcu variants
got merged. Anyways, the right thing to do would be asserting and
documenting that the section is RCU protected.

As for llist not having rcu iterators. The llists aren't RCU protected or
assigned. What's RCU protected is the lifetime of the elements. That said,
we'd need an rmb after fetching llist_head to guarantee that the flusher
sees all the updates which took place before the node got added to the
llist, right?

Can you also add an explanation on how the pending llist is synchronized
against blkg destructions?

Thanks.
Waiman Long June 1, 2022, 6:52 p.m. UTC | #4
On 6/1/22 14:35, Tejun Heo wrote:
> Hello,
>
> On Wed, Jun 01, 2022 at 02:15:46PM -0400, Waiman Long wrote:
>> It was mentioned in the commit log, but I will add a comment to repeat that.
>> It is because lnode.next is used as a flag to indicate its presence in the
>> lockless list. By default, the first one that go into the lockless list will
>> have a NULL value in its next pointer. So I have to put a sentinel node that
>> to make sure that the next pointer is always non-NULL.
> Oh yeah, I noticed that in the commit log, but I think it really warrants an
> inline comment.
>
>>>> + * The retrieved blkg_iostat_set is immediately marked as not in the
>>>> + * lockless list by clearing its node->next pointer. It could be put
>>>> + * back into the list by a parallel update before the iostat's are
>>>> + * finally flushed. So being in the list doesn't always mean it has new
>>>> + * iostat's to be flushed.
>>>> + */
>>> Isn't the above true for any sort of mechanism which tracking pending state?
>>> You gotta clear the pending state before consuming so that you don't miss
>>> the events which happen while data is being consumed.
>> That is true. I was about thinking what race conditions can happen with
>> these changes. The above comment is for the race that can happen which is
>> benign. I am remove it if you think it is necessary.
> I don't have too strong an opinion. It just felt a bit disproportionate for
> it to be sticking out like that. Maybe toning it down a little bit would
> help?

Will do.


>>>> +	/*
>>>> +	 * No RCU protection is needed as it is assumed that blkg_iostat_set's
>>>> +	 * in the percpu lockless list won't go away until the flush is done.
>>>> +	 */
>>> Can you please elaborate on why this is safe?
>> You are right that the comment is probably not quite right. I will put the
>> rcu_read_lock/unlock() back in. However, we don't have a rcu iterator for
>> the lockless list. On the other hand, blkcg_rstat_flush() is now called with
>> irq disabled. So rcu_read_lock() is not technically needed.
> Maybe we just need an rcu_read_lock_held() - does that cover irq being
> disabled? I'm not sure what the rules are since the different rcu variants
> got merged. Anyways, the right thing to do would be asserting and
> documenting that the section is RCU protected.

I will leave rcu_read_lock() in for now. We can worry about the proper 
way to remove it or document it later on.


>
> As for llist not having rcu iterators. The llists aren't RCU protected or
> assigned. What's RCU protected is the lifetime of the elements. That said,
> we'd need an rmb after fetching llist_head to guarantee that the flusher
> sees all the updates which took place before the node got added to the
> llist, right?

Fetching of llist head is done by an atomic xchg(). So it has all the 
necessary barrier.

Iterating the nodes of the llist and clearing them are not atomic. That 
is the reason I put a comment previously about a possible race. However 
that race is benign. Making it atomic does not eliminate the race as the 
iostat update data themselves are synchronized separately with sequence 
lock.

> Can you also add an explanation on how the pending llist is synchronized
> against blkg destructions?

Sure. I will need to think about that and put a proper comment there.

Cheers,
Longman
Waiman Long June 1, 2022, 9:25 p.m. UTC | #5
On 6/1/22 14:52, Waiman Long wrote:
> On 6/1/22 14:35, Tejun Heo wrote:
>
>> Can you also add an explanation on how the pending llist is synchronized
>> against blkg destructions?
>
> Sure. I will need to think about that and put a proper comment there.

I think the best way to protect against blkg destruction is to get a 
percpu reference when put into lockless list and put it back when removed.

BTW, when I ran a test that continuously create and destroy containers, 
the total number of blkcg's kept on increasing. There are some freeing 
of blkcg's but no freeing of blkg's at all. Maybe we have a similar 
dying blkcg's problem here. I will take a further look at that when I 
have time.

Cheers,
Longman
Tejun Heo June 1, 2022, 9:28 p.m. UTC | #6
On Wed, Jun 01, 2022 at 05:25:53PM -0400, Waiman Long wrote:
> I think the best way to protect against blkg destruction is to get a percpu
> reference when put into lockless list and put it back when removed.
> 
> BTW, when I ran a test that continuously create and destroy containers, the
> total number of blkcg's kept on increasing. There are some freeing of
> blkcg's but no freeing of blkg's at all. Maybe we have a similar dying
> blkcg's problem here. I will take a further look at that when I have time.

They get pinned by per-cgroup writebacks which gets pinned by lingering page
cache and other remaining accounted memory areas, so I think they can hang
around if there's no memory pressure. But, yeah, it'd be great to verify
that they actually go away under memory pressure.

Thanks.
Waiman Long June 1, 2022, 9:32 p.m. UTC | #7
On 6/1/22 17:28, Tejun Heo wrote:
> On Wed, Jun 01, 2022 at 05:25:53PM -0400, Waiman Long wrote:
>> I think the best way to protect against blkg destruction is to get a percpu
>> reference when put into lockless list and put it back when removed.
>>
>> BTW, when I ran a test that continuously create and destroy containers, the
>> total number of blkcg's kept on increasing. There are some freeing of
>> blkcg's but no freeing of blkg's at all. Maybe we have a similar dying
>> blkcg's problem here. I will take a further look at that when I have time.
> They get pinned by per-cgroup writebacks which gets pinned by lingering page
> cache and other remaining accounted memory areas, so I think they can hang
> around if there's no memory pressure. But, yeah, it'd be great to verify
> that they actually go away under memory pressure.
>
> Thanks.
>
Thanks for the explanation. It makes sense to me.

Cheers,
Longman
kernel test robot June 2, 2022, 1:52 a.m. UTC | #8
Hi Waiman,

I love your patch! Perhaps something to improve:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master next-20220601]
[cannot apply to v5.18]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Waiman-Long/blk-cgroup-Correctly-free-percpu-iostat_cpu-in-blkg-on-error-exit/20220602-005557
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: x86_64-randconfig-a005 (https://download.01.org/0day-ci/archive/20220602/202206020948.yBhTYYDS-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project c825abd6b0198fb088d9752f556a70705bc99dfd)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/7f5ef1493e681454e71c11b2547638b94665a78f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Waiman-Long/blk-cgroup-Correctly-free-percpu-iostat_cpu-in-blkg-on-error-exit/20220602-005557
        git checkout 7f5ef1493e681454e71c11b2547638b94665a78f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> block/blk-cgroup.c:1239:6: warning: variable 'ret' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
           if (!blkcg->lhead)
               ^~~~~~~~~~~~~
   block/blk-cgroup.c:1290:9: note: uninitialized use occurs here
           return ret;
                  ^~~
   block/blk-cgroup.c:1239:2: note: remove the 'if' if its condition is always false
           if (!blkcg->lhead)
           ^~~~~~~~~~~~~~~~~~
   block/blk-cgroup.c:1223:33: note: initialize the variable 'ret' to silence this warning
           struct cgroup_subsys_state *ret;
                                          ^
                                           = NULL
   1 warning generated.


vim +1239 block/blk-cgroup.c

  1218	
  1219	static struct cgroup_subsys_state *
  1220	blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
  1221	{
  1222		struct blkcg *blkcg;
  1223		struct cgroup_subsys_state *ret;
  1224		int i;
  1225	
  1226		mutex_lock(&blkcg_pol_mutex);
  1227	
  1228		if (!parent_css) {
  1229			blkcg = &blkcg_root;
  1230		} else {
  1231			blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
  1232			if (!blkcg) {
  1233				ret = ERR_PTR(-ENOMEM);
  1234				goto unlock;
  1235			}
  1236		}
  1237	
  1238		blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
> 1239		if (!blkcg->lhead)
  1240			goto free_blkcg;
  1241		init_blkcg_llists(blkcg);
  1242	
  1243		for (i = 0; i < BLKCG_MAX_POLS ; i++) {
  1244			struct blkcg_policy *pol = blkcg_policy[i];
  1245			struct blkcg_policy_data *cpd;
  1246	
  1247			/*
  1248			 * If the policy hasn't been attached yet, wait for it
  1249			 * to be attached before doing anything else. Otherwise,
  1250			 * check if the policy requires any specific per-cgroup
  1251			 * data: if it does, allocate and initialize it.
  1252			 */
  1253			if (!pol || !pol->cpd_alloc_fn)
  1254				continue;
  1255	
  1256			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
  1257			if (!cpd) {
  1258				ret = ERR_PTR(-ENOMEM);
  1259				goto free_pd_blkcg;
  1260			}
  1261			blkcg->cpd[i] = cpd;
  1262			cpd->blkcg = blkcg;
  1263			cpd->plid = i;
  1264			if (pol->cpd_init_fn)
  1265				pol->cpd_init_fn(cpd);
  1266		}
  1267	
  1268		spin_lock_init(&blkcg->lock);
  1269		refcount_set(&blkcg->online_pin, 1);
  1270		INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
  1271		INIT_HLIST_HEAD(&blkcg->blkg_list);
  1272	#ifdef CONFIG_CGROUP_WRITEBACK
  1273		INIT_LIST_HEAD(&blkcg->cgwb_list);
  1274	#endif
  1275		list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
  1276	
  1277		mutex_unlock(&blkcg_pol_mutex);
  1278		return &blkcg->css;
  1279	
  1280	free_pd_blkcg:
  1281		for (i--; i >= 0; i--)
  1282			if (blkcg->cpd[i])
  1283				blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
  1284		free_percpu(blkcg->lhead);
  1285	free_blkcg:
  1286		if (blkcg != &blkcg_root)
  1287			kfree(blkcg);
  1288	unlock:
  1289		mutex_unlock(&blkcg_pol_mutex);
  1290		return ret;
  1291	}
  1292
diff mbox series

Patch

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index acd9b0aa8dc8..fcdac724970c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,6 +59,40 @@  static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE  64
 
+static struct llist_node llist_last;	/* Last sentinel node of llist */
+
+static inline bool blkcg_llist_empty(struct llist_head *lhead)
+{
+	return lhead->first == &llist_last;
+}
+
+static inline void init_blkcg_llists(struct blkcg *blkcg)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(blkcg->lhead, cpu)->first = &llist_last;
+}
+
+static inline struct llist_node *
+fetch_delete_blkcg_llist(struct llist_head *lhead)
+{
+	return xchg(&lhead->first, &llist_last);
+}
+
+/*
+ * The retrieved blkg_iostat_set is immediately marked as not in the
+ * lockless list by clearing its node->next pointer. It could be put
+ * back into the list by a parallel update before the iostat's are
+ * finally flushed. So being in the list doesn't always mean it has new
+ * iostat's to be flushed.
+ */
+#define blkcg_llist_for_each_entry_safe(pos, node, nxt)			\
+	for (; (node != &llist_last) &&					\
+	       (pos = llist_entry(node, struct blkg_iostat_set, lnode),	\
+		nxt = node->next, node->next = NULL, true);		\
+		node = nxt)
+
 /**
  * blkcg_css - find the current css
  *
@@ -236,8 +270,10 @@  static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	blkg->blkcg = blkcg;
 
 	u64_stats_init(&blkg->iostat.sync);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+	}
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
@@ -852,17 +888,26 @@  static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
-	struct blkcg_gq *blkg;
+	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+	struct llist_node *lnode, *lnext;
+	struct blkg_iostat_set *bisc;
 
 	/* Root-level stats are sourced from system-wide IO stats */
 	if (!cgroup_parent(css->cgroup))
 		return;
 
-	rcu_read_lock();
+	if (blkcg_llist_empty(lhead))
+		return;
 
-	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+	lnode = fetch_delete_blkcg_llist(lhead);
+
+	/*
+	 * No RCU protection is needed as it is assumed that blkg_iostat_set's
+	 * in the percpu lockless list won't go away until the flush is done.
+	 */
+	blkcg_llist_for_each_entry_safe(bisc, lnode, lnext) {
+		struct blkcg_gq *blkg = bisc->blkg;
 		struct blkcg_gq *parent = blkg->parent;
-		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
 		struct blkg_iostat cur, delta;
 		unsigned long flags;
 		unsigned int seq;
@@ -891,8 +936,6 @@  static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 			u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
 		}
 	}
-
-	rcu_read_unlock();
 }
 
 /*
@@ -1192,6 +1235,11 @@  blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		}
 	}
 
+	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+	if (!blkcg->lhead)
+		goto free_blkcg;
+	init_blkcg_llists(blkcg);
+
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -1233,7 +1281,8 @@  blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	for (i--; i >= 0; i--)
 		if (blkcg->cpd[i])
 			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+	free_percpu(blkcg->lhead);
+free_blkcg:
 	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 unlock:
@@ -1997,6 +2046,7 @@  static int blk_cgroup_io_type(struct bio *bio)
 
 void blk_cgroup_bio_start(struct bio *bio)
 {
+	struct blkcg *blkcg = bio->bi_blkg->blkcg;
 	int rwd = blk_cgroup_io_type(bio), cpu;
 	struct blkg_iostat_set *bis;
 	unsigned long flags;
@@ -2015,9 +2065,15 @@  void blk_cgroup_bio_start(struct bio *bio)
 	}
 	bis->cur.ios[rwd]++;
 
+	if (!bis->lnode.next) {
+		struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+
+		llist_add(&bis->lnode, lhead);
+	}
+
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+		cgroup_rstat_updated(blkcg->css.cgroup, cpu);
 	put_cpu();
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index d4de0a35e066..65ac38e0708a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -18,6 +18,7 @@ 
 #include <linux/cgroup.h>
 #include <linux/kthread.h>
 #include <linux/blk-mq.h>
+#include <linux/llist.h>
 
 struct blkcg_gq;
 struct blkg_policy_data;
@@ -43,8 +44,10 @@  struct blkg_iostat {
 
 struct blkg_iostat_set {
 	struct u64_stats_sync		sync;
+	struct llist_node		lnode;
 	struct blkg_iostat		cur;
 	struct blkg_iostat		last;
+	struct blkcg_gq		       *blkg;
 };
 
 /* association between a blk cgroup and a request queue */
@@ -97,6 +100,12 @@  struct blkcg {
 	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
 
 	struct list_head		all_blkcgs_node;
+
+	/*
+	 * List of updated percpu blkg_iostat_set's since the last flush.
+	 */
+	struct llist_head __percpu	*lhead;
+
 #ifdef CONFIG_BLK_CGROUP_FC_APPID
 	char                            fc_app_id[FC_APPID_LEN];
 #endif