diff mbox series

[5/6] blk-cgroup: reimplement basic IO stats using cgroup rstat

Message ID 20191107191804.3735303-6-tj@kernel.org (mailing list archive)
State New, archived
Headers show
Series [1/6] bfq-iosched: relocate bfqg_*rwstat*() helpers | expand

Commit Message

Tejun Heo Nov. 7, 2019, 7:18 p.m. UTC
blk-cgroup has been using blkg_rwstat to track basic IO stats.
Unfortunately, reading recursive stats scales badly as it involves
walking all descendants.  On systems with a huge number of cgroups
(dead or alive), this can lead to substantial CPU cost when reading IO
stats.

This patch reimplements basic IO stats using cgroup rstat which uses
more memory but makes recursive stat reading O(# descendants which
have been active since last reading) instead of O(# descendants).

* blk-cgroup core no longer uses sync/async stats.  Introduce new stat
  enums - BLKG_IOSTAT_{READ|WRITE|DISCARD}.

* Add blkg_iostat[_set] which encapsulates byte and io stats, last
  values for propagation delta calculation and u64_stats_sync for
  correctness on 32bit archs.

* Update the new percpu stat counters directly and implement
  blkcg_rstat_flush() to implement propagation.

* blkg_print_stat() can now bring the stats up to date by calling
  cgroup_rstat_flush() and print them instead of directly summing up
  all descendants.

* It now allocates 96 bytes per cpu.  It used to be 40 bytes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Dan Schatzberg <dschatzberg@fb.com>
Cc: Daniel Xu <dlxu@fb.com>
---
 block/blk-cgroup.c         | 124 +++++++++++++++++++++++++++++--------
 include/linux/blk-cgroup.h |  48 ++++++++++++--
 2 files changed, 142 insertions(+), 30 deletions(-)

Comments

Faiz Abbas Nov. 13, 2019, 11:13 a.m. UTC | #1
Hi,

On 08/11/19 12:48 AM, Tejun Heo wrote:
> blk-cgroup has been using blkg_rwstat to track basic IO stats.
> Unfortunately, reading recursive stats scales badly as it involves
> walking all descendants.  On systems with a huge number of cgroups
> (dead or alive), this can lead to substantial CPU cost when reading IO
> stats.
> 
> This patch reimplements basic IO stats using cgroup rstat which uses
> more memory but makes recursive stat reading O(# descendants which
> have been active since last reading) instead of O(# descendants).
> 
> * blk-cgroup core no longer uses sync/async stats.  Introduce new stat
>   enums - BLKG_IOSTAT_{READ|WRITE|DISCARD}.
> 
> * Add blkg_iostat[_set] which encapsulates byte and io stats, last
>   values for propagation delta calculation and u64_stats_sync for
>   correctness on 32bit archs.
> 
> * Update the new percpu stat counters directly and implement
>   blkcg_rstat_flush() to implement propagation.
> 
> * blkg_print_stat() can now bring the stats up to date by calling
>   cgroup_rstat_flush() and print them instead of directly summing up
>   all descendants.
> 
> * It now allocates 96 bytes per cpu.  It used to be 40 bytes.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Dan Schatzberg <dschatzberg@fb.com>
> Cc: Daniel Xu <dlxu@fb.com>
> ---

I bisected a kernel oops to this patch on linux-next. Any idea why
this is happening? Here is the log:

[   32.033025] 8<--- cut here ---
[   32.036136] Unable to handle kernel paging request at virtual address 2e83803c
[   32.043637] pgd = 75330198
[   32.046360] [2e83803c] *pgd=00000000
[   32.050008] Internal error: Oops: 5 [#1] SMP ARM
[   32.054647] Modules linked in:
[   32.057724] CPU: 0 PID: 780 Comm: (systemd) Tainted: G        W
  5.4.0-rc7-next-20191113 #172
[   32.066893] Hardware name: Generic AM33XX (Flattened Device Tree)
[   32.073026] PC is at cgroup_rstat_updated+0x30/0xe8
[   32.077939] LR is at generic_make_request_checks+0x3d4/0x748
[   32.083621] pc : [<c01e6f50>]    lr : [<c04af820>]    psr: a0040013
[   32.089912] sp : ed9b3b78  ip : 2e838000  fp : ed826c00
[   32.095156] r10: 00001000  r9 : 00000000  r8 : ff7ff428
[   32.100402] r7 : c0d05148  r6 : c0d0554c  r5 : c0c8b9ec  r4 : edb26180
[   32.106954] r3 : 2e838000  r2 : 2e838000  r1 : 00000000  r0 : eda32000
[   32.113510] Flags: NzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM
Segment none
[   32.120674] Control: 10c5387d  Table: adac0019  DAC: 00000051
[   32.126444] Process (systemd) (pid: 780, stack limit = 0x5087843c)
[   32.132648] Stack: (0xed9b3b78 to 0xed9b4000)
[   32.137022] 3b60:
  edb26180 eee19550
[   32.145237] 3b80: 2e838000 c0d05148 ff7ff428 c04af820 00000004
00000800 0074e7f8 00000000
[   32.153452] 3ba0: a0040093 c08d1798 00000000 80040093 00002000
00000008 00000007 edb8168c
[   32.161667] 3bc0: 00000000 00000000 ffffe000 71b97da9 00000022
edb26180 c0d05148 00000008
[   32.169882] 3be0: c0d05148 00000001 00000000 edb26180 00000000
c04b0ad8 00000000 00000000
[   32.178097] 3c00: edb81a00 ed826c00 ed826cc4 71b97da9 c0de2c7c
edb26180 c0d05148 00000008
[   32.186312] 3c20: 00000001 00000001 00000000 0005fcfd 00000000
c04b0de0 c0de2c88 edb81600
[   32.194526] 3c40: ed826800 0005fcfd 00000000 c04ce968 00001000
c0d05148 edb26180 efd29a84
[   32.202741] 3c60: 00000000 00000000 0005fcfd 71b97da9 ed9b3c7b
00001000 00000001 00000001
[   32.210956] 3c80: 00000001 00000001 00000000 0005fcfd 00000000
c039bdb0 20040013 00000001
[   32.219170] 3ca0: 00000001 00000000 0005fcfd 00000000 ed9b3cc0
00000001 efd29a84 00000000
[   32.227385] 3cc0: 00000000 ed9b3e04 edb26180 ec8421b0 00000001
ec842100 0000000c ec8422b8
[   32.235600] 3ce0: 0005fcfd 00000000 00000fff 00000000 ee2a7b40
00080000 00000000 00112cca
[   32.243814] 3d00: ec8422bc c02983e0 0005fcfd 00000000 00000000
00000001 00000000 00000008
[   32.252028] 3d20: 0005fcfd 00000000 00000000 eef82400 00000010
00000000 00000004 ed9b3e88
[   32.260242] 3d40: 00000000 ed9b3d68 00000000 00000003 00000000
c0d05148 60040013 c01837f4
[   32.268457] 3d60: 00000000 71b97da9 00000000 00000001 00000001
c03783bc ec8422b8 ed9b3e04
[   32.276671] 3d80: ed9b3e04 00000001 ec8422bc c0378404 00000001
00000000 ec8421b0 c0255360
[   32.284886] 3da0: eeee0000 ed9d2180 ed9b3da8 ed9b3da8 ed9b3db0
ed9b3db0 00000000 71b97da9
[   32.293101] 3dc0: 00000000 00000001 00000001 00000000 00000003
ed9b3e04 00000000 00112cca
[   32.301316] 3de0: ec8422bc c025563c 00112cca 00000000 00000000
00000001 ec8422b8 ed9d2180
[   32.309531] 3e00: ed9b3dfc ed9b3e04 ed9b3e04 71b97da9 ec8422b8
ed9d21e8 ed9d2180 ec8422b8
[   32.317746] 3e20: 00000000 00000001 ffffffff 00000000 ed9d2180
c0255b8c 00000003 00000001
[   32.325961] 3e40: ec8421b0 ed9b3f00 00000000 00000000 ec8422b8
c024b73c 00000001 beba6ca0
[   32.334175] 3e60: c0d05148 00000000 00000000 beba6ca0 ed9b3ee8
ed9d2180 00000051 00000000
[   32.342389] 3e80: ed9d21e8 00000001 ffffffff 00000fff 000081a4
00000001 000003e8 000003e8
[   32.350604] 3ea0: 00000000 00000000 00000000 71b97da9 000000d2
ed9d2180 c0d05148 00000000
[   32.358819] 3ec0: 00000000 ed9b3f78 00001000 00000000 00000000
c02bdb6c 00001000 00020000
[   32.367033] 3ee0: 0058b9c8 00001000 00000004 00000000 00001000
ed9b3ee0 00000001 00000000
[   32.375248] 3f00: ed9d2180 00000000 00000000 00000000 00000000
00000000 00000000 00000000
[   32.383463] 3f20: 00000000 00000000 00000000 71b97da9 0058b9c8
00000001 00001000 ed9b3f78
[   32.391678] 3f40: ed9d2180 00000000 00000000 c02bdc78 00000000
eda3ce1c eda3cc00 ed9d2180
[   32.399893] 3f60: ed9d2180 c0d05148 0058b9c8 00001000 ed9b2000
c02bdf68 00000000 00000000
[   32.408107] 3f80: 000005e8 71b97da9 005868d8 b6c02f41 000005e8
00000003 c0101204 00000003
[   32.416322] 3fa0: 00000000 c01011e0 005868d8 b6c02f41 00000007
0058b9c8 00001000 00000000
[   32.424537] 3fc0: 005868d8 b6c02f41 000005e8 00000003 0000000a
beba6e88 00000000 00000000
[   32.432753] 3fe0: 00000000 beba6d24 b6c037e1 b6c3e4b8 40040030
00000007 00000000 00000000
[   32.440982] [<c01e6f50>] (cgroup_rstat_updated) from [<c04af820>] (generic_make_request_checks+0x3d4/0x748)
[   32.450770] [<c04af820>] (generic_make_request_checks) from [<c04b0ad8>] (generic_make_request+0x1c/0x2e4)
[   32.460468] [<c04b0ad8>] (generic_make_request) from [<c04b0de0>] (submit_bio+0x40/0x1b4)
[   32.468686] [<c04b0de0>] (submit_bio) from [<c039bdb0>] (ext4_mpage_readpages+0x704/0x904)
[   32.476995] [<c039bdb0>] (ext4_mpage_readpages) from [<c0378404>] (ext4_readpages+0x48/0x50)
[   32.485481] [<c0378404>] (ext4_readpages) from [<c0255360>] (read_pages+0x50/0x154)
[   32.493175] [<c0255360>] (read_pages) from [<c025563c>] (__do_page_cache_readahead+0x1d8/0x1f8)
[   32.501914] [<c025563c>] (__do_page_cache_readahead) from [<c0255b8c>] (page_cache_sync_readahead+0xa0/0xf4)
[   32.511799] [<c0255b8c>] (page_cache_sync_readahead) from [<c024b73c>] (generic_file_read_iter+0x75c/0xc40)
[   32.521594] [<c024b73c>] (generic_file_read_iter) from [<c02bdb6c>] (__vfs_read+0x138/0x1bc)
[   32.530073] [<c02bdb6c>] (__vfs_read) from [<c02bdc78>] (vfs_read+0x88/0x114)
[   32.537241] [<c02bdc78>] (vfs_read) from [<c02bdf68>] (ksys_read+0x54/0xd0)
[   32.544237] [<c02bdf68>] (ksys_read) from [<c01011e0>] (__sys_trace_return+0x0/0x20)
[   32.552010] Exception stack(0xed9b3fa8 to 0xed9b3ff0)
[   32.557085] 3fa0:                   005868d8 b6c02f41 00000007
0058b9c8 00001000 00000000
[   32.565300] 3fc0: 005868d8 b6c02f41 000005e8 00000003 0000000a
beba6e88 00000000 00000000
[   32.573512] 3fe0: 00000000 beba6d24 b6c037e1 b6c3e4b8
[   32.578591] Code: ee073fba e7962101 e5903168 e0823003 (e593303c)
[   32.584889] ---[ end trace 08d6b7172e3ff29b ]---
[   32.797983] 8<--- cut here ---
[   32.801090] Unable to handle kernel paging request at virtual address 2e83803c
[   32.808421] pgd = f285aa90
[   32.811140] [2e83803c] *pgd=00000000
[   32.814739] Internal error: Oops: 5 [#2] SMP ARM
[   32.819378] Modules linked in:
[   32.822453] CPU: 0 PID: 527 Comm: login Tainted: G      D W
5.4.0-rc7-next-20191113 #172
[   32.831273] Hardware name: Generic AM33XX (Flattened Device Tree)
[   32.837406] PC is at cgroup_rstat_updated+0x30/0xe8
[   32.842320] LR is at generic_make_request_checks+0x3d4/0x748
[   32.848002] pc : [<c01e6f50>]    lr : [<c04af820>]    psr: a0070013
[   32.854292] sp : edbdfb78  ip : 2e838000  fp : eda49c00
[   32.859537] r10: 00001000  r9 : 00000000  r8 : ff7fff60
[   32.864782] r7 : c0d05148  r6 : c0d0554c  r5 : c0c8b9ec  r4 : edd8c6c0
[   32.871335] r3 : 2e838000  r2 : 2e838000  r1 : 00000000  r0 : ed9dec00
[   32.877891] Flags: NzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM
Segment none
[   32.885056] Control: 10c5387d  Table: adb40019  DAC: 00000051
[   32.890826] Process login (pid: 527, stack limit = 0x1deade48)
[   32.896681] Stack: (0xedbdfb78 to 0xedbe0000)
[   32.901056] fb60:
  edd8c6c0 eee19550
[   32.909271] fb80: 2e838000 c0d05148 ff7fff60 c04af820 c0d0554c
c01023dc 0074e7f8 00000000
[   32.917487] fba0: 0000000a ffff979f 00400100 71b97da9 ee81ba00
ffffe000 00000000 c0d0554c
[   32.925702] fbc0: c0c90e7c 00000000 c0d0554c 71b97da9 00000001
edd8c6c0 c0d05148 00000008
[   32.933916] fbe0: c0d05148 00000001 00000000 edd8c6c0 00000000
c04b0ad8 00000000 c0101aec
[   32.942130] fc00: 00000000 00000000 00001000 71b97da9 edd8c6c0
edd8c6c0 c0d05148 00000008
[   32.950345] fc20: 00000001 00000001 00000000 0005fba9 00000000
c04b0de0 c04a8f24 c04a8340
[   32.958560] fc40: 20070013 ffffffff 00000051 bf000000 00001000
c0d05148 edd8c6c0 efd47fac
[   32.966775] fc60: 00000000 00000000 0005fba9 71b97da9 edbdfc7b
00001000 00000001 00000001
[   32.974990] fc80: 00000001 00000001 00000000 0005fba9 00000000
c039bdb0 20070013 00000001
[   32.983204] fca0: 00000001 00000000 0005fba9 00000000 edbdfcc0
00000001 efd47fac 00000000
[   32.991418] fcc0: 00000000 edbdfe04 edd8c6c0 ec85dd70 00000001
ec85dcc0 0000000c ec85de78
[   32.999633] fce0: 0005fba9 00000000 00000fff 00000000 ee2a7b40
00080000 00000000 00112cca
[   33.007848] fd00: ec85de7c c02983e0 0005fba9 00000000 00000000
00000001 00000000 00000008
[   33.016061] fd20: 0005fba9 00000000 00000000 eef82400 00000010
00000000 00000004 edbdfe88
[   33.024276] fd40: 00000000 edbdfd68 00000000 00000003 00000000
c0d05148 60070013 c01837f4
[   33.032491] fd60: 00000000 71b97da9 00000000 00000001 00000001
c03783bc ec85de78 edbdfe04
[   33.040705] fd80: edbdfe04 00000001 ec85de7c c0378404 00000001
00000000 ec85dd70 c0255360
[   33.048919] fda0: eeee0000 ed952d80 edbdfda8 edbdfda8 edbdfdb0
edbdfdb0 00000000 71b97da9
[   33.057134] fdc0: 00000000 00000001 00000001 00000000 00000003
edbdfe04 00000000 00112cca
[   33.065348] fde0: ec85de7c c025563c 00112cca 00000000 00000000
00000001 ec85de78 ed952d80
[   33.073563] fe00: edbdfdfc edbdfe04 edbdfe04 71b97da9 ec85de78
ed952de8 ed952d80 ec85de78
[   33.081777] fe20: 00000000 00000001 ffffffff 00000000 ed952d80
c0255b8c 00000003 00000001
[   33.089992] fe40: ec85dd70 edbdff00 00000000 00000000 ec85de78
c024b73c 00000001 00000041
[   33.098206] fe60: ffffe000 00000000 00000000 00000000 edbdfee8
ed952d80 00000000 00000000
[   33.106422] fe80: ed952de8 00000001 ffffffff 00000fff edbdfe8c
71b97da9 000003e8 00000004
[   33.114637] fea0: edbdff70 c0d05148 00000001 71b97da9 edbde000
ed952d80 c0d05148 00000000
[   33.122852] fec0: 00000000 edbdff78 000003e8 00000000 00000000
c02bdb6c 000003e8 00020000
[   33.131066] fee0: 000365a8 000003e8 00000004 00000000 000003e8
edbdfee0 00000001 00000000
[   33.139280] ff00: ed952d80 00000000 00000000 00000000 00000000
00000000 00000000 00000000
[   33.147495] ff20: 00000000 00000000 00000000 71b97da9 000365a8
00000001 000003e8 edbdff78
[   33.155709] ff40: ed952d80 00000000 00000000 c02bdc78 00000000
edd7721c edd77000 ed952d80
[   33.163923] ff60: ed952d80 c0d05148 000365a8 000003e8 edbde000
c02bdf68 00000000 00000000
[   33.172137] ff80: 00000000 71b97da9 000003e8 be9bb7ac 00000000
00000003 c0101204 00000003
[   33.180353] ffa0: 00000000 c01011e0 000003e8 be9bb7ac 00000004
000365a8 000003e8 00000000
[   33.188567] ffc0: 000003e8 be9bb7ac 00000000 00000003 00000004
000365a8 b6d74d64 00000000
[   33.196782] ffe0: 00000000 be9bb704 b6f7516c b6ef64b8 60070030
00000004 00000000 00000000
[   33.205010] [<c01e6f50>] (cgroup_rstat_updated) from [<c04af820>] (generic_make_request_checks+0x3d4/0x748)
[   33.214800] [<c04af820>] (generic_make_request_checks) from [<c04b0ad8>] (generic_make_request+0x1c/0x2e4)
[   33.224495] [<c04b0ad8>] (generic_make_request) from [<c04b0de0>] (submit_bio+0x40/0x1b4)
[   33.232714] [<c04b0de0>] (submit_bio) from [<c039bdb0>] (ext4_mpage_readpages+0x704/0x904)
[   33.241023] [<c039bdb0>] (ext4_mpage_readpages) from [<c0378404>] (ext4_readpages+0x48/0x50)
[   33.249509] [<c0378404>] (ext4_readpages) from [<c0255360>] (read_pages+0x50/0x154)
[   33.257203] [<c0255360>] (read_pages) from [<c025563c>] (__do_page_cache_readahead+0x1d8/0x1f8)
[   33.265943] [<c025563c>] (__do_page_cache_readahead) from [<c0255b8c>] (page_cache_sync_readahead+0xa0/0xf4)
[   33.275826] [<c0255b8c>] (page_cache_sync_readahead) from [<c024b73c>] (generic_file_read_iter+0x75c/0xc40)
[   33.285621] [<c024b73c>] (generic_file_read_iter) from [<c02bdb6c>] (__vfs_read+0x138/0x1bc)
[   33.294099] [<c02bdb6c>] (__vfs_read) from [<c02bdc78>] (vfs_read+0x88/0x114)
[   33.301268] [<c02bdc78>] (vfs_read) from [<c02bdf68>] (ksys_read+0x54/0xd0)
[   33.308264] [<c02bdf68>] (ksys_read) from [<c01011e0>] (__sys_trace_return+0x0/0x20)
[   33.316038] Exception stack(0xedbdffa8 to 0xedbdfff0)
[   33.321112] ffa0:                   000003e8 be9bb7ac 00000004
000365a8 000003e8 00000000
[   33.329327] ffc0: 000003e8 be9bb7ac 00000000 00000003 00000004
000365a8 b6d74d64 00000000
[   33.337540] ffe0: 00000000 be9bb704 b6f7516c b6ef64b8
[   33.342619] Code: ee073fba e7962101 e5903168 e0823003 (e593303c)
[   33.348850] ---[ end trace 08d6b7172e3ff29c ]---

Thanks,
Faiz
Tejun Heo Nov. 13, 2019, 4:35 p.m. UTC | #2
Hello,

Can you please see whether the following patch fixes the issue?

Thanks.

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 48a66738143d..19394c77ed99 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -626,7 +626,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 		bis->cur.ios[rwd]++;
 
 		u64_stats_update_end(&bis->sync);
-		cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
+		if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+			cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
 		put_cpu();
 	}
Ionela Voinescu Nov. 14, 2019, 12:17 p.m. UTC | #3
Hi Tejun,

On 13/11/2019 16:35, Tejun Heo wrote:
> Hello,
> 
> Can you please see whether the following patch fixes the issue?
> 

This patch does fix the issue for me.


Thanks,
Ionela.

> Thanks.
> 
> diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
> index 48a66738143d..19394c77ed99 100644
> --- a/include/linux/blk-cgroup.h
> +++ b/include/linux/blk-cgroup.h
> @@ -626,7 +626,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
>  		bis->cur.ios[rwd]++;
>  
>  		u64_stats_update_end(&bis->sync);
> -		cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
> +		if (cgroup_subsys_on_dfl(io_cgrp_subsys))
> +			cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
>  		put_cpu();
>  	}
>  
>
diff mbox series

Patch

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7e93377e320..b3429be62057 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -80,8 +80,7 @@  static void blkg_free(struct blkcg_gq *blkg)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
-	blkg_rwstat_exit(&blkg->stat_ios);
-	blkg_rwstat_exit(&blkg->stat_bytes);
+	free_percpu(blkg->iostat_cpu);
 	percpu_ref_exit(&blkg->refcnt);
 	kfree(blkg);
 }
@@ -146,7 +145,7 @@  static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 				   gfp_t gfp_mask)
 {
 	struct blkcg_gq *blkg;
-	int i;
+	int i, cpu;
 
 	/* alloc and init base part */
 	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
@@ -156,8 +155,8 @@  static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
 		goto err_free;
 
-	if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
-	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
+	if (!blkg->iostat_cpu)
 		goto err_free;
 
 	blkg->q = q;
@@ -167,6 +166,10 @@  static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 	blkg->blkcg = blkcg;
 
+	u64_stats_init(&blkg->iostat.sync);
+	for_each_possible_cpu(cpu)
+		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkg_policy_data *pd;
@@ -393,7 +396,6 @@  struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct blkcg *blkcg = blkg->blkcg;
-	struct blkcg_gq *parent = blkg->parent;
 	int i;
 
 	lockdep_assert_held(&blkg->q->queue_lock);
@@ -410,11 +412,6 @@  static void blkg_destroy(struct blkcg_gq *blkg)
 			pol->pd_offline_fn(blkg->pd[i]);
 	}
 
-	if (parent) {
-		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
-		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
-	}
-
 	blkg->online = false;
 
 	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -464,7 +461,7 @@  static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
-	int i;
+	int i, cpu;
 
 	mutex_lock(&blkcg_pol_mutex);
 	spin_lock_irq(&blkcg->lock);
@@ -475,8 +472,12 @@  static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 	 * anyway.  If you get hit by a race, retry.
 	 */
 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
-		blkg_rwstat_reset(&blkg->stat_bytes);
-		blkg_rwstat_reset(&blkg->stat_ios);
+		for_each_possible_cpu(cpu) {
+			struct blkg_iostat_set *bis =
+				per_cpu_ptr(blkg->iostat_cpu, cpu);
+			memset(bis, 0, sizeof(*bis));
+		}
+		memset(&blkg->iostat, 0, sizeof(blkg->iostat));
 
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
@@ -840,16 +841,18 @@  static int blkcg_print_stat(struct seq_file *sf, void *v)
 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
 	struct blkcg_gq *blkg;
 
+	cgroup_rstat_flush(blkcg->css.cgroup);
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct blkg_iostat_set *bis = &blkg->iostat;
 		const char *dname;
 		char *buf;
-		struct blkg_rwstat_sample rwstat;
 		u64 rbytes, wbytes, rios, wios, dbytes, dios;
 		size_t size = seq_get_buf(sf, &buf), off = 0;
 		int i;
 		bool has_stats = false;
+		unsigned seq;
 
 		spin_lock_irq(&blkg->q->queue_lock);
 
@@ -868,17 +871,16 @@  static int blkcg_print_stat(struct seq_file *sf, void *v)
 		 */
 		off += scnprintf(buf+off, size-off, "%s ", dname);
 
-		blkg_rwstat_recursive_sum(blkg, NULL,
-				offsetof(struct blkcg_gq, stat_bytes), &rwstat);
-		rbytes = rwstat.cnt[BLKG_RWSTAT_READ];
-		wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE];
-		dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD];
+		do {
+			seq = u64_stats_fetch_begin(&bis->sync);
 
-		blkg_rwstat_recursive_sum(blkg, NULL,
-					offsetof(struct blkcg_gq, stat_ios), &rwstat);
-		rios = rwstat.cnt[BLKG_RWSTAT_READ];
-		wios = rwstat.cnt[BLKG_RWSTAT_WRITE];
-		dios = rwstat.cnt[BLKG_RWSTAT_DISCARD];
+			rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
+			wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
+			dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
+			rios = bis->cur.ios[BLKG_IOSTAT_READ];
+			wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
+			dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
+		} while (u64_stats_fetch_retry(&bis->sync, seq));
 
 		if (rbytes || wbytes || rios || wios) {
 			has_stats = true;
@@ -1214,6 +1216,77 @@  static int blkcg_can_attach(struct cgroup_taskset *tset)
 	return ret;
 }
 
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+	int i;
+
+	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+		dst->bytes[i] = src->bytes[i];
+		dst->ios[i] = src->ios[i];
+	}
+}
+
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+	int i;
+
+	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+		dst->bytes[i] += src->bytes[i];
+		dst->ios[i] += src->ios[i];
+	}
+}
+
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+	int i;
+
+	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+		dst->bytes[i] -= src->bytes[i];
+		dst->ios[i] -= src->ios[i];
+	}
+}
+
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct blkcg_gq *parent = blkg->parent;
+		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
+		struct blkg_iostat cur, delta;
+		unsigned seq;
+
+		/* fetch the current per-cpu values */
+		do {
+			seq = u64_stats_fetch_begin(&bisc->sync);
+			blkg_iostat_set(&cur, &bisc->cur);
+		} while (u64_stats_fetch_retry(&bisc->sync, seq));
+
+		/* propagate percpu delta to global */
+		u64_stats_update_begin(&blkg->iostat.sync);
+		blkg_iostat_set(&delta, &cur);
+		blkg_iostat_sub(&delta, &bisc->last);
+		blkg_iostat_add(&blkg->iostat.cur, &delta);
+		blkg_iostat_add(&bisc->last, &delta);
+		u64_stats_update_end(&blkg->iostat.sync);
+
+		/* propagate global delta to parent */
+		if (parent) {
+			u64_stats_update_begin(&parent->iostat.sync);
+			blkg_iostat_set(&delta, &blkg->iostat.cur);
+			blkg_iostat_sub(&delta, &blkg->iostat.last);
+			blkg_iostat_add(&parent->iostat.cur, &delta);
+			blkg_iostat_add(&blkg->iostat.last, &delta);
+			u64_stats_update_end(&parent->iostat.sync);
+		}
+	}
+
+	rcu_read_unlock();
+}
+
 static void blkcg_bind(struct cgroup_subsys_state *root_css)
 {
 	int i;
@@ -1246,6 +1319,7 @@  struct cgroup_subsys io_cgrp_subsys = {
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
+	.css_rstat_flush = blkcg_rstat_flush,
 	.bind = blkcg_bind,
 	.dfl_cftypes = blkcg_files,
 	.legacy_cftypes = blkcg_legacy_files,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 914ce55fa8c2..867ab391e409 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -15,7 +15,9 @@ 
  */
 
 #include <linux/cgroup.h>
+#include <linux/percpu.h>
 #include <linux/percpu_counter.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
@@ -31,6 +33,14 @@ 
 
 #ifdef CONFIG_BLK_CGROUP
 
+enum blkg_iostat_type {
+	BLKG_IOSTAT_READ,
+	BLKG_IOSTAT_WRITE,
+	BLKG_IOSTAT_DISCARD,
+
+	BLKG_IOSTAT_NR,
+};
+
 enum blkg_rwstat_type {
 	BLKG_RWSTAT_READ,
 	BLKG_RWSTAT_WRITE,
@@ -61,6 +71,17 @@  struct blkcg {
 #endif
 };
 
+struct blkg_iostat {
+	u64				bytes[BLKG_IOSTAT_NR];
+	u64				ios[BLKG_IOSTAT_NR];
+};
+
+struct blkg_iostat_set {
+	struct u64_stats_sync		sync;
+	struct blkg_iostat		cur;
+	struct blkg_iostat		last;
+};
+
 /*
  * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
  * recursive.  Used to carry stats of dead children.
@@ -127,8 +148,8 @@  struct blkcg_gq {
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;
 
-	struct blkg_rwstat		stat_bytes;
-	struct blkg_rwstat		stat_ios;
+	struct blkg_iostat_set __percpu	*iostat_cpu;
+	struct blkg_iostat_set		iostat;
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
@@ -740,15 +761,32 @@  static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	throtl = blk_throtl_bio(q, blkg, bio);
 
 	if (!throtl) {
+		struct blkg_iostat_set *bis;
+		int rwd, cpu;
+
+		if (op_is_discard(bio->bi_opf))
+			rwd = BLKG_IOSTAT_DISCARD;
+		else if (op_is_write(bio->bi_opf))
+			rwd = BLKG_IOSTAT_WRITE;
+		else
+			rwd = BLKG_IOSTAT_READ;
+
+		cpu = get_cpu();
+		bis = per_cpu_ptr(blkg->iostat_cpu, cpu);
+		u64_stats_update_begin(&bis->sync);
+
 		/*
 		 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
 		 * is a split bio and we would have already accounted for the
 		 * size of the bio.
 		 */
 		if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
-			blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
-					bio->bi_iter.bi_size);
-		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
+			bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+		bis->cur.ios[rwd]++;
+
+		u64_stats_update_end(&bis->sync);
+		cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
+		put_cpu();
 	}
 
 	blkcg_bio_issue_init(bio);