diff mbox series

[V2] blk-cgroup: Flush stats before releasing blkcg_gq

Message ID 20230524035150.727407-1-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series [V2] blk-cgroup: Flush stats before releasing blkcg_gq | expand

Commit Message

Ming Lei May 24, 2023, 3:51 a.m. UTC
As noted by Michal, the blkg_iostat_set's in the lockless list hold
reference to blkg's to protect against their removal. Those blkg's
hold reference to blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn() which
is called when the blkcg reference count reaches 0. This circular
dependency will prevent blkcg and some blkgs from being freed after
they are made offline.

It is less a problem if the cgroup to be destroyed also has other
controllers like memory that will call cgroup_rstat_flush() which will
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed leaking more
and more memory over time.

To prevent this potential memory leak:

- flush blkcg per-cpu stats list in __blkg_release(), when no new stat
can be added

- don't grab bio->bi_blkg reference when adding the stats into blkcg's
per-cpu stat list since all stats are guaranteed to be consumed before
releasing blkg instance, and grabbing blkg reference for stats was the
most fragile part of original patch

Based on Waiman's patch:

https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/

Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: Waiman Long <longman@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: mkoutny@suse.com
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
V2:
	- remove kernel/cgroup change, and call blkcg_rstat_flush()
	to flush stat directly

 block/blk-cgroup.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

Comments

Waiman Long May 24, 2023, 4:19 a.m. UTC | #1
On 5/23/23 23:51, Ming Lei wrote:
> As noted by Michal, the blkg_iostat_set's in the lockless list hold
> reference to blkg's to protect against their removal. Those blkg's
> hold reference to blkcg. When a cgroup is being destroyed,
> cgroup_rstat_flush() is only called at css_release_work_fn() which
> is called when the blkcg reference count reaches 0. This circular
> dependency will prevent blkcg and some blkgs from being freed after
> they are made offline.
>
> It is less a problem if the cgroup to be destroyed also has other
> controllers like memory that will call cgroup_rstat_flush() which will
> clean up the reference count. If block is the only controller that uses
> rstat, these offline blkcg and blkgs may never be freed leaking more
> and more memory over time.
>
> To prevent this potential memory leak:
>
> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> can be added
>
> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> per-cpu stat list since all stats are guaranteed to be consumed before
> releasing blkg instance, and grabbing blkg reference for stats was the
> most fragile part of original patch
>
> Based on Waiman's patch:
>
> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
>
> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> Cc: Waiman Long <longman@redhat.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: mkoutny@suse.com
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> V2:
> 	- remove kernel/cgroup change, and call blkcg_rstat_flush()
> 	to flush stat directly
>
>   block/blk-cgroup.c | 29 +++++++++++++++++++++--------
>   1 file changed, 21 insertions(+), 8 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 0ce64dd73cfe..ed0eb8896972 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -34,6 +34,8 @@
>   #include "blk-ioprio.h"
>   #include "blk-throttle.h"
>   
> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
> +
>   /*
>    * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
>    * blkcg_pol_register_mutex nests outside of it and synchronizes entire
> @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
>   static void __blkg_release(struct rcu_head *rcu)
>   {
>   	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
> +	struct blkcg *blkcg = blkg->blkcg;
> +	int cpu;
>   
>   #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>   	WARN_ON(!bio_list_empty(&blkg->async_bios));
>   #endif
> +	/*
> +	 * Flush all the non-empty percpu lockless lists before releasing
> +	 * us, given these stat belongs to us.
> +	 *
> +	 * cgroup locks aren't needed here since __blkcg_rstat_flush just
> +	 * propagates delta into blkg parent, which is live now.
> +	 */
> +	for_each_possible_cpu(cpu)
> +		__blkcg_rstat_flush(blkcg, cpu);
>   
>   	/* release the blkcg and parent blkg refs this blkg has been holding */
>   	css_put(&blkg->blkcg->css);
> @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
>   	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>   }
>   
> -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
>   {
> -	struct blkcg *blkcg = css_to_blkcg(css);
>   	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
>   	struct llist_node *lnode;
>   	struct blkg_iostat_set *bisc, *next_bisc;
>   
> -	/* Root-level stats are sourced from system-wide IO stats */
> -	if (!cgroup_parent(css->cgroup))
> -		return;
> -
>   	rcu_read_lock();
>   
>   	lnode = llist_del_all(lhead);
> @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>   		if (parent && parent->parent)
>   			blkcg_iostat_update(parent, &blkg->iostat.cur,
>   					    &blkg->iostat.last);
> -		percpu_ref_put(&blkg->refcnt);
>   	}
>   
>   out:
>   	rcu_read_unlock();
>   }
>   
> +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> +{
> +	/* Root-level stats are sourced from system-wide IO stats */
> +	if (cgroup_parent(css->cgroup))
> +		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
> +}
> +

I think it may not safe to call __blkcg_rstat_flus() directly without 
taking the cgroup_rstat_cpu_lock. That is why I added a helper to 
kernel/cgroup/rstat.c in my patch to meet the locking requirement.

Cheers,
Longman
Ming Lei May 24, 2023, 4:26 a.m. UTC | #2
On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
> On 5/23/23 23:51, Ming Lei wrote:
> > As noted by Michal, the blkg_iostat_set's in the lockless list hold
> > reference to blkg's to protect against their removal. Those blkg's
> > hold reference to blkcg. When a cgroup is being destroyed,
> > cgroup_rstat_flush() is only called at css_release_work_fn() which
> > is called when the blkcg reference count reaches 0. This circular
> > dependency will prevent blkcg and some blkgs from being freed after
> > they are made offline.
> > 
> > It is less a problem if the cgroup to be destroyed also has other
> > controllers like memory that will call cgroup_rstat_flush() which will
> > clean up the reference count. If block is the only controller that uses
> > rstat, these offline blkcg and blkgs may never be freed leaking more
> > and more memory over time.
> > 
> > To prevent this potential memory leak:
> > 
> > - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> > can be added
> > 
> > - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> > per-cpu stat list since all stats are guaranteed to be consumed before
> > releasing blkg instance, and grabbing blkg reference for stats was the
> > most fragile part of original patch
> > 
> > Based on Waiman's patch:
> > 
> > https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> > 
> > Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> > Cc: Waiman Long <longman@redhat.com>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: mkoutny@suse.com
> > Cc: Yosry Ahmed <yosryahmed@google.com>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > V2:
> > 	- remove kernel/cgroup change, and call blkcg_rstat_flush()
> > 	to flush stat directly
> > 
> >   block/blk-cgroup.c | 29 +++++++++++++++++++++--------
> >   1 file changed, 21 insertions(+), 8 deletions(-)
> > 
> > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> > index 0ce64dd73cfe..ed0eb8896972 100644
> > --- a/block/blk-cgroup.c
> > +++ b/block/blk-cgroup.c
> > @@ -34,6 +34,8 @@
> >   #include "blk-ioprio.h"
> >   #include "blk-throttle.h"
> > +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
> > +
> >   /*
> >    * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
> >    * blkcg_pol_register_mutex nests outside of it and synchronizes entire
> > @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
> >   static void __blkg_release(struct rcu_head *rcu)
> >   {
> >   	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
> > +	struct blkcg *blkcg = blkg->blkcg;
> > +	int cpu;
> >   #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> >   	WARN_ON(!bio_list_empty(&blkg->async_bios));
> >   #endif
> > +	/*
> > +	 * Flush all the non-empty percpu lockless lists before releasing
> > +	 * us, given these stat belongs to us.
> > +	 *
> > +	 * cgroup locks aren't needed here since __blkcg_rstat_flush just
> > +	 * propagates delta into blkg parent, which is live now.
> > +	 */
> > +	for_each_possible_cpu(cpu)
> > +		__blkcg_rstat_flush(blkcg, cpu);
> >   	/* release the blkcg and parent blkg refs this blkg has been holding */
> >   	css_put(&blkg->blkcg->css);
> > @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
> >   	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> >   }
> > -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> > +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
> >   {
> > -	struct blkcg *blkcg = css_to_blkcg(css);
> >   	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> >   	struct llist_node *lnode;
> >   	struct blkg_iostat_set *bisc, *next_bisc;
> > -	/* Root-level stats are sourced from system-wide IO stats */
> > -	if (!cgroup_parent(css->cgroup))
> > -		return;
> > -
> >   	rcu_read_lock();
> >   	lnode = llist_del_all(lhead);
> > @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> >   		if (parent && parent->parent)
> >   			blkcg_iostat_update(parent, &blkg->iostat.cur,
> >   					    &blkg->iostat.last);
> > -		percpu_ref_put(&blkg->refcnt);
> >   	}
> >   out:
> >   	rcu_read_unlock();
> >   }
> > +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> > +{
> > +	/* Root-level stats are sourced from system-wide IO stats */
> > +	if (cgroup_parent(css->cgroup))
> > +		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
> > +}
> > +
> 
> I think it may not safe to call __blkcg_rstat_flus() directly without taking
> the cgroup_rstat_cpu_lock. That is why I added a helper to
> kernel/cgroup/rstat.c in my patch to meet the locking requirement.

All stats are removed from llist_del_all(), and the local list is
iterated, then each blkg & its parent is touched in __blkcg_rstat_flus(), so
can you explain it a bit why cgroup locks are needed? For protecting
what?

Thanks,
Ming
Waiman Long May 24, 2023, 3:43 p.m. UTC | #3
On 5/24/23 00:26, Ming Lei wrote:
> On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
>> On 5/23/23 23:51, Ming Lei wrote:
>>> As noted by Michal, the blkg_iostat_set's in the lockless list hold
>>> reference to blkg's to protect against their removal. Those blkg's
>>> hold reference to blkcg. When a cgroup is being destroyed,
>>> cgroup_rstat_flush() is only called at css_release_work_fn() which
>>> is called when the blkcg reference count reaches 0. This circular
>>> dependency will prevent blkcg and some blkgs from being freed after
>>> they are made offline.
>>>
>>> It is less a problem if the cgroup to be destroyed also has other
>>> controllers like memory that will call cgroup_rstat_flush() which will
>>> clean up the reference count. If block is the only controller that uses
>>> rstat, these offline blkcg and blkgs may never be freed leaking more
>>> and more memory over time.
>>>
>>> To prevent this potential memory leak:
>>>
>>> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
>>> can be added
>>>
>>> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
>>> per-cpu stat list since all stats are guaranteed to be consumed before
>>> releasing blkg instance, and grabbing blkg reference for stats was the
>>> most fragile part of original patch
>>>
>>> Based on Waiman's patch:
>>>
>>> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
>>>
>>> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
>>> Cc: Waiman Long <longman@redhat.com>
>>> Cc: Tejun Heo <tj@kernel.org>
>>> Cc: mkoutny@suse.com
>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>> ---
>>> V2:
>>> 	- remove kernel/cgroup change, and call blkcg_rstat_flush()
>>> 	to flush stat directly
>>>
>>>    block/blk-cgroup.c | 29 +++++++++++++++++++++--------
>>>    1 file changed, 21 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>> index 0ce64dd73cfe..ed0eb8896972 100644
>>> --- a/block/blk-cgroup.c
>>> +++ b/block/blk-cgroup.c
>>> @@ -34,6 +34,8 @@
>>>    #include "blk-ioprio.h"
>>>    #include "blk-throttle.h"
>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
>>> +
>>>    /*
>>>     * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
>>>     * blkcg_pol_register_mutex nests outside of it and synchronizes entire
>>> @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
>>>    static void __blkg_release(struct rcu_head *rcu)
>>>    {
>>>    	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
>>> +	struct blkcg *blkcg = blkg->blkcg;
>>> +	int cpu;
>>>    #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>>>    	WARN_ON(!bio_list_empty(&blkg->async_bios));
>>>    #endif
>>> +	/*
>>> +	 * Flush all the non-empty percpu lockless lists before releasing
>>> +	 * us, given these stat belongs to us.
>>> +	 *
>>> +	 * cgroup locks aren't needed here since __blkcg_rstat_flush just
>>> +	 * propagates delta into blkg parent, which is live now.
>>> +	 */
>>> +	for_each_possible_cpu(cpu)
>>> +		__blkcg_rstat_flush(blkcg, cpu);
>>>    	/* release the blkcg and parent blkg refs this blkg has been holding */
>>>    	css_put(&blkg->blkcg->css);
>>> @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
>>>    	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>>>    }
>>> -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
>>>    {
>>> -	struct blkcg *blkcg = css_to_blkcg(css);
>>>    	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
>>>    	struct llist_node *lnode;
>>>    	struct blkg_iostat_set *bisc, *next_bisc;
>>> -	/* Root-level stats are sourced from system-wide IO stats */
>>> -	if (!cgroup_parent(css->cgroup))
>>> -		return;
>>> -
>>>    	rcu_read_lock();
>>>    	lnode = llist_del_all(lhead);
>>> @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>>>    		if (parent && parent->parent)
>>>    			blkcg_iostat_update(parent, &blkg->iostat.cur,
>>>    					    &blkg->iostat.last);
>>> -		percpu_ref_put(&blkg->refcnt);
>>>    	}
>>>    out:
>>>    	rcu_read_unlock();
>>>    }
>>> +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>>> +{
>>> +	/* Root-level stats are sourced from system-wide IO stats */
>>> +	if (cgroup_parent(css->cgroup))
>>> +		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
>>> +}
>>> +
>> I think it may not safe to call __blkcg_rstat_flus() directly without taking
>> the cgroup_rstat_cpu_lock. That is why I added a helper to
>> kernel/cgroup/rstat.c in my patch to meet the locking requirement.
> All stats are removed from llist_del_all(), and the local list is
> iterated, then each blkg & its parent is touched in __blkcg_rstat_flus(), so
> can you explain it a bit why cgroup locks are needed? For protecting
> what?

You are right. The llist_del_all() call in blkcg_rstat_flush() is 
atomic, so it is safe for concurrent execution which is what the 
cgroup_rstat_cpu_lock protects against. That may not be the case for 
rstat callbacks of other controllers. So I will suggest you to add a 
comment to clarify that point. Other than that, you patch looks good to me.

Reviewed: Waiman Long <longman@redhat.com>
Waiman Long May 24, 2023, 5:28 p.m. UTC | #4
On 5/24/23 11:43, Waiman Long wrote:
> On 5/24/23 00:26, Ming Lei wrote:
>> On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
>>> On 5/23/23 23:51, Ming Lei wrote:
>>>> As noted by Michal, the blkg_iostat_set's in the lockless list hold
>>>> reference to blkg's to protect against their removal. Those blkg's
>>>> hold reference to blkcg. When a cgroup is being destroyed,
>>>> cgroup_rstat_flush() is only called at css_release_work_fn() which
>>>> is called when the blkcg reference count reaches 0. This circular
>>>> dependency will prevent blkcg and some blkgs from being freed after
>>>> they are made offline.
>>>>
>>>> It is less a problem if the cgroup to be destroyed also has other
>>>> controllers like memory that will call cgroup_rstat_flush() which will
>>>> clean up the reference count. If block is the only controller that 
>>>> uses
>>>> rstat, these offline blkcg and blkgs may never be freed leaking more
>>>> and more memory over time.
>>>>
>>>> To prevent this potential memory leak:
>>>>
>>>> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
>>>> can be added
>>>>
>>>> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
>>>> per-cpu stat list since all stats are guaranteed to be consumed before
>>>> releasing blkg instance, and grabbing blkg reference for stats was the
>>>> most fragile part of original patch
>>>>
>>>> Based on Waiman's patch:
>>>>
>>>> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/ 
>>>>
>>>>
>>>> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
>>>> Cc: Waiman Long <longman@redhat.com>
>>>> Cc: Tejun Heo <tj@kernel.org>
>>>> Cc: mkoutny@suse.com
>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>>> ---
>>>> V2:
>>>>     - remove kernel/cgroup change, and call blkcg_rstat_flush()
>>>>     to flush stat directly
>>>>
>>>>    block/blk-cgroup.c | 29 +++++++++++++++++++++--------
>>>>    1 file changed, 21 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>> index 0ce64dd73cfe..ed0eb8896972 100644
>>>> --- a/block/blk-cgroup.c
>>>> +++ b/block/blk-cgroup.c
>>>> @@ -34,6 +34,8 @@
>>>>    #include "blk-ioprio.h"
>>>>    #include "blk-throttle.h"
>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
>>>> +
>>>>    /*
>>>>     * blkcg_pol_mutex protects blkcg_policy[] and policy 
>>>> [de]activation.
>>>>     * blkcg_pol_register_mutex nests outside of it and synchronizes 
>>>> entire
>>>> @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
>>>>    static void __blkg_release(struct rcu_head *rcu)
>>>>    {
>>>>        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, 
>>>> rcu_head);
>>>> +    struct blkcg *blkcg = blkg->blkcg;
>>>> +    int cpu;
>>>>    #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>>>>        WARN_ON(!bio_list_empty(&blkg->async_bios));
>>>>    #endif
>>>> +    /*
>>>> +     * Flush all the non-empty percpu lockless lists before releasing
>>>> +     * us, given these stat belongs to us.
>>>> +     *
>>>> +     * cgroup locks aren't needed here since __blkcg_rstat_flush just
>>>> +     * propagates delta into blkg parent, which is live now.
>>>> +     */
>>>> +    for_each_possible_cpu(cpu)
>>>> +        __blkcg_rstat_flush(blkcg, cpu);
>>>>        /* release the blkcg and parent blkg refs this blkg has been 
>>>> holding */
>>>>        css_put(&blkg->blkcg->css);
>>>> @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct 
>>>> blkcg_gq *blkg, struct blkg_iostat *cur,
>>>> u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>>>>    }
>>>> -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int 
>>>> cpu)
>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
>>>>    {
>>>> -    struct blkcg *blkcg = css_to_blkcg(css);
>>>>        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
>>>>        struct llist_node *lnode;
>>>>        struct blkg_iostat_set *bisc, *next_bisc;
>>>> -    /* Root-level stats are sourced from system-wide IO stats */
>>>> -    if (!cgroup_parent(css->cgroup))
>>>> -        return;
>>>> -
>>>>        rcu_read_lock();
>>>>        lnode = llist_del_all(lhead);
>>>> @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct 
>>>> cgroup_subsys_state *css, int cpu)
>>>>            if (parent && parent->parent)
>>>>                blkcg_iostat_update(parent, &blkg->iostat.cur,
>>>>                            &blkg->iostat.last);
>>>> -        percpu_ref_put(&blkg->refcnt);
>>>>        }
>>>>    out:
>>>>        rcu_read_unlock();
>>>>    }
>>>> +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int 
>>>> cpu)
>>>> +{
>>>> +    /* Root-level stats are sourced from system-wide IO stats */
>>>> +    if (cgroup_parent(css->cgroup))
>>>> +        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
>>>> +}
>>>> +
>>> I think it may not safe to call __blkcg_rstat_flus() directly 
>>> without taking
>>> the cgroup_rstat_cpu_lock. That is why I added a helper to
>>> kernel/cgroup/rstat.c in my patch to meet the locking requirement.
>> All stats are removed from llist_del_all(), and the local list is
>> iterated, then each blkg & its parent is touched in 
>> __blkcg_rstat_flus(), so
>> can you explain it a bit why cgroup locks are needed? For protecting
>> what?
>
> You are right. The llist_del_all() call in blkcg_rstat_flush() is 
> atomic, so it is safe for concurrent execution which is what the 
> cgroup_rstat_cpu_lock protects against. That may not be the case for 
> rstat callbacks of other controllers. So I will suggest you to add a 
> comment to clarify that point. Other than that, you patch looks good 
> to me.
>
> Reviewed: Waiman Long <longman@redhat.com>

After some more thought, I need to retract my reviewed-by tag for now. 
There is a slight possibility that blkcg_iostat_update() in 
blkcg_rstat_flush() can happen concurrently which will corrupt the 
sequence count. One way to avoid that is to synchronize it by 
cgroup_rstat_cpu_lock. Another way is to use the bisc->lqueued for 
synchronization. In that case, you need to move 
WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after 
all the  blkcg_iostat_update() call with smp_store_release() and replace 
the READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with 
smp_load_acquire().

Cheers,
Longman
Ming Lei May 25, 2023, 2:04 a.m. UTC | #5
On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
> On 5/24/23 11:43, Waiman Long wrote:
> > On 5/24/23 00:26, Ming Lei wrote:
> > > On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
> > > > On 5/23/23 23:51, Ming Lei wrote:
> > > > > As noted by Michal, the blkg_iostat_set's in the lockless list hold
> > > > > reference to blkg's to protect against their removal. Those blkg's
> > > > > hold reference to blkcg. When a cgroup is being destroyed,
> > > > > cgroup_rstat_flush() is only called at css_release_work_fn() which
> > > > > is called when the blkcg reference count reaches 0. This circular
> > > > > dependency will prevent blkcg and some blkgs from being freed after
> > > > > they are made offline.
> > > > > 
> > > > > It is less a problem if the cgroup to be destroyed also has other
> > > > > controllers like memory that will call cgroup_rstat_flush() which will
> > > > > clean up the reference count. If block is the only
> > > > > controller that uses
> > > > > rstat, these offline blkcg and blkgs may never be freed leaking more
> > > > > and more memory over time.
> > > > > 
> > > > > To prevent this potential memory leak:
> > > > > 
> > > > > - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> > > > > can be added
> > > > > 
> > > > > - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> > > > > per-cpu stat list since all stats are guaranteed to be consumed before
> > > > > releasing blkg instance, and grabbing blkg reference for stats was the
> > > > > most fragile part of original patch
> > > > > 
> > > > > Based on Waiman's patch:
> > > > > 
> > > > > https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> > > > > 
> > > > > 
> > > > > Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> > > > > Cc: Waiman Long <longman@redhat.com>
> > > > > Cc: Tejun Heo <tj@kernel.org>
> > > > > Cc: mkoutny@suse.com
> > > > > Cc: Yosry Ahmed <yosryahmed@google.com>
> > > > > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > > > > ---
> > > > > V2:
> > > > >     - remove kernel/cgroup change, and call blkcg_rstat_flush()
> > > > >     to flush stat directly
> > > > > 
> > > > >    block/blk-cgroup.c | 29 +++++++++++++++++++++--------
> > > > >    1 file changed, 21 insertions(+), 8 deletions(-)
> > > > > 
> > > > > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> > > > > index 0ce64dd73cfe..ed0eb8896972 100644
> > > > > --- a/block/blk-cgroup.c
> > > > > +++ b/block/blk-cgroup.c
> > > > > @@ -34,6 +34,8 @@
> > > > >    #include "blk-ioprio.h"
> > > > >    #include "blk-throttle.h"
> > > > > +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
> > > > > +
> > > > >    /*
> > > > >     * blkcg_pol_mutex protects blkcg_policy[] and policy
> > > > > [de]activation.
> > > > >     * blkcg_pol_register_mutex nests outside of it and
> > > > > synchronizes entire
> > > > > @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
> > > > >    static void __blkg_release(struct rcu_head *rcu)
> > > > >    {
> > > > >        struct blkcg_gq *blkg = container_of(rcu, struct
> > > > > blkcg_gq, rcu_head);
> > > > > +    struct blkcg *blkcg = blkg->blkcg;
> > > > > +    int cpu;
> > > > >    #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> > > > >        WARN_ON(!bio_list_empty(&blkg->async_bios));
> > > > >    #endif
> > > > > +    /*
> > > > > +     * Flush all the non-empty percpu lockless lists before releasing
> > > > > +     * us, given these stat belongs to us.
> > > > > +     *
> > > > > +     * cgroup locks aren't needed here since __blkcg_rstat_flush just
> > > > > +     * propagates delta into blkg parent, which is live now.
> > > > > +     */
> > > > > +    for_each_possible_cpu(cpu)
> > > > > +        __blkcg_rstat_flush(blkcg, cpu);
> > > > >        /* release the blkcg and parent blkg refs this blkg
> > > > > has been holding */
> > > > >        css_put(&blkg->blkcg->css);
> > > > > @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct
> > > > > blkcg_gq *blkg, struct blkg_iostat *cur,
> > > > > u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> > > > >    }
> > > > > -static void blkcg_rstat_flush(struct cgroup_subsys_state
> > > > > *css, int cpu)
> > > > > +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
> > > > >    {
> > > > > -    struct blkcg *blkcg = css_to_blkcg(css);
> > > > >        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> > > > >        struct llist_node *lnode;
> > > > >        struct blkg_iostat_set *bisc, *next_bisc;
> > > > > -    /* Root-level stats are sourced from system-wide IO stats */
> > > > > -    if (!cgroup_parent(css->cgroup))
> > > > > -        return;
> > > > > -
> > > > >        rcu_read_lock();
> > > > >        lnode = llist_del_all(lhead);
> > > > > @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct
> > > > > cgroup_subsys_state *css, int cpu)
> > > > >            if (parent && parent->parent)
> > > > >                blkcg_iostat_update(parent, &blkg->iostat.cur,
> > > > >                            &blkg->iostat.last);
> > > > > -        percpu_ref_put(&blkg->refcnt);
> > > > >        }
> > > > >    out:
> > > > >        rcu_read_unlock();
> > > > >    }
> > > > > +static void blkcg_rstat_flush(struct cgroup_subsys_state
> > > > > *css, int cpu)
> > > > > +{
> > > > > +    /* Root-level stats are sourced from system-wide IO stats */
> > > > > +    if (cgroup_parent(css->cgroup))
> > > > > +        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
> > > > > +}
> > > > > +
> > > > I think it may not safe to call __blkcg_rstat_flus() directly
> > > > without taking
> > > > the cgroup_rstat_cpu_lock. That is why I added a helper to
> > > > kernel/cgroup/rstat.c in my patch to meet the locking requirement.
> > > All stats are removed from llist_del_all(), and the local list is
> > > iterated, then each blkg & its parent is touched in
> > > __blkcg_rstat_flus(), so
> > > can you explain it a bit why cgroup locks are needed? For protecting
> > > what?
> > 
> > You are right. The llist_del_all() call in blkcg_rstat_flush() is
> > atomic, so it is safe for concurrent execution which is what the
> > cgroup_rstat_cpu_lock protects against. That may not be the case for
> > rstat callbacks of other controllers. So I will suggest you to add a
> > comment to clarify that point. Other than that, you patch looks good to
> > me.
> > 
> > Reviewed: Waiman Long <longman@redhat.com>
> 
> After some more thought, I need to retract my reviewed-by tag for now. There
> is a slight possibility that blkcg_iostat_update() in blkcg_rstat_flush()
> can happen concurrently which will corrupt the sequence count.

llist_del_all() moves all 'bis' into one local list, and bis is one percpu
variable of blkg, so in theory same bis won't be flushed at the same
time. And one bis should be touched in just one of stat flush code path
because of llist_del_all().

So 'bis' still can be thought as being flushed in serialized way.

However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
follows. This should be the only chance for concurrent stats update.

But, blkg_release() is run in RCU callback, so the previous flush has
been done, but new flush can come, and other blkg's stat could be added
with same story above.

> One way to
> avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
> use the bisc->lqueued for synchronization.

I'd avoid the external cgroup lock here.

> In that case, you need to move
> WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
> the  blkcg_iostat_update() call with smp_store_release() and replace the
> READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
> smp_load_acquire().

This way looks doable, but I guess it still can't avoid concurrent update on parent
stat, such as when  __blkcg_rstat_flush() from blkg_release() is
in-progress, another sibling blkg's bis is added, meantime
blkcg_rstat_flush() is called.

Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
do you think of this way?


Thanks,
Ming
Waiman Long May 25, 2023, 2:50 a.m. UTC | #6
On 5/24/23 22:04, Ming Lei wrote:
> On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
>> On 5/24/23 11:43, Waiman Long wrote:
>>> On 5/24/23 00:26, Ming Lei wrote:
>>>> On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
>>>>> On 5/23/23 23:51, Ming Lei wrote:
>>>>>> As noted by Michal, the blkg_iostat_set's in the lockless list hold
>>>>>> reference to blkg's to protect against their removal. Those blkg's
>>>>>> hold reference to blkcg. When a cgroup is being destroyed,
>>>>>> cgroup_rstat_flush() is only called at css_release_work_fn() which
>>>>>> is called when the blkcg reference count reaches 0. This circular
>>>>>> dependency will prevent blkcg and some blkgs from being freed after
>>>>>> they are made offline.
>>>>>>
>>>>>> It is less a problem if the cgroup to be destroyed also has other
>>>>>> controllers like memory that will call cgroup_rstat_flush() which will
>>>>>> clean up the reference count. If block is the only
>>>>>> controller that uses
>>>>>> rstat, these offline blkcg and blkgs may never be freed leaking more
>>>>>> and more memory over time.
>>>>>>
>>>>>> To prevent this potential memory leak:
>>>>>>
>>>>>> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
>>>>>> can be added
>>>>>>
>>>>>> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
>>>>>> per-cpu stat list since all stats are guaranteed to be consumed before
>>>>>> releasing blkg instance, and grabbing blkg reference for stats was the
>>>>>> most fragile part of original patch
>>>>>>
>>>>>> Based on Waiman's patch:
>>>>>>
>>>>>> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
>>>>>>
>>>>>>
>>>>>> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
>>>>>> Cc: Waiman Long <longman@redhat.com>
>>>>>> Cc: Tejun Heo <tj@kernel.org>
>>>>>> Cc: mkoutny@suse.com
>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>>>>> ---
>>>>>> V2:
>>>>>>      - remove kernel/cgroup change, and call blkcg_rstat_flush()
>>>>>>      to flush stat directly
>>>>>>
>>>>>>     block/blk-cgroup.c | 29 +++++++++++++++++++++--------
>>>>>>     1 file changed, 21 insertions(+), 8 deletions(-)
>>>>>>
>>>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>>>> index 0ce64dd73cfe..ed0eb8896972 100644
>>>>>> --- a/block/blk-cgroup.c
>>>>>> +++ b/block/blk-cgroup.c
>>>>>> @@ -34,6 +34,8 @@
>>>>>>     #include "blk-ioprio.h"
>>>>>>     #include "blk-throttle.h"
>>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
>>>>>> +
>>>>>>     /*
>>>>>>      * blkcg_pol_mutex protects blkcg_policy[] and policy
>>>>>> [de]activation.
>>>>>>      * blkcg_pol_register_mutex nests outside of it and
>>>>>> synchronizes entire
>>>>>> @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
>>>>>>     static void __blkg_release(struct rcu_head *rcu)
>>>>>>     {
>>>>>>         struct blkcg_gq *blkg = container_of(rcu, struct
>>>>>> blkcg_gq, rcu_head);
>>>>>> +    struct blkcg *blkcg = blkg->blkcg;
>>>>>> +    int cpu;
>>>>>>     #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>>>>>>         WARN_ON(!bio_list_empty(&blkg->async_bios));
>>>>>>     #endif
>>>>>> +    /*
>>>>>> +     * Flush all the non-empty percpu lockless lists before releasing
>>>>>> +     * us, given these stat belongs to us.
>>>>>> +     *
>>>>>> +     * cgroup locks aren't needed here since __blkcg_rstat_flush just
>>>>>> +     * propagates delta into blkg parent, which is live now.
>>>>>> +     */
>>>>>> +    for_each_possible_cpu(cpu)
>>>>>> +        __blkcg_rstat_flush(blkcg, cpu);
>>>>>>         /* release the blkcg and parent blkg refs this blkg
>>>>>> has been holding */
>>>>>>         css_put(&blkg->blkcg->css);
>>>>>> @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct
>>>>>> blkcg_gq *blkg, struct blkg_iostat *cur,
>>>>>> u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>>>>>>     }
>>>>>> -static void blkcg_rstat_flush(struct cgroup_subsys_state
>>>>>> *css, int cpu)
>>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
>>>>>>     {
>>>>>> -    struct blkcg *blkcg = css_to_blkcg(css);
>>>>>>         struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
>>>>>>         struct llist_node *lnode;
>>>>>>         struct blkg_iostat_set *bisc, *next_bisc;
>>>>>> -    /* Root-level stats are sourced from system-wide IO stats */
>>>>>> -    if (!cgroup_parent(css->cgroup))
>>>>>> -        return;
>>>>>> -
>>>>>>         rcu_read_lock();
>>>>>>         lnode = llist_del_all(lhead);
>>>>>> @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct
>>>>>> cgroup_subsys_state *css, int cpu)
>>>>>>             if (parent && parent->parent)
>>>>>>                 blkcg_iostat_update(parent, &blkg->iostat.cur,
>>>>>>                             &blkg->iostat.last);
>>>>>> -        percpu_ref_put(&blkg->refcnt);
>>>>>>         }
>>>>>>     out:
>>>>>>         rcu_read_unlock();
>>>>>>     }
>>>>>> +static void blkcg_rstat_flush(struct cgroup_subsys_state
>>>>>> *css, int cpu)
>>>>>> +{
>>>>>> +    /* Root-level stats are sourced from system-wide IO stats */
>>>>>> +    if (cgroup_parent(css->cgroup))
>>>>>> +        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
>>>>>> +}
>>>>>> +
>>>>> I think it may not safe to call __blkcg_rstat_flus() directly
>>>>> without taking
>>>>> the cgroup_rstat_cpu_lock. That is why I added a helper to
>>>>> kernel/cgroup/rstat.c in my patch to meet the locking requirement.
>>>> All stats are removed from llist_del_all(), and the local list is
>>>> iterated, then each blkg & its parent is touched in
>>>> __blkcg_rstat_flus(), so
>>>> can you explain it a bit why cgroup locks are needed? For protecting
>>>> what?
>>> You are right. The llist_del_all() call in blkcg_rstat_flush() is
>>> atomic, so it is safe for concurrent execution which is what the
>>> cgroup_rstat_cpu_lock protects against. That may not be the case for
>>> rstat callbacks of other controllers. So I will suggest you to add a
>>> comment to clarify that point. Other than that, you patch looks good to
>>> me.
>>>
>>> Reviewed: Waiman Long <longman@redhat.com>
>> After some more thought, I need to retract my reviewed-by tag for now. There
>> is a slight possibility that blkcg_iostat_update() in blkcg_rstat_flush()
>> can happen concurrently which will corrupt the sequence count.
> llist_del_all() moves all 'bis' into one local list, and bis is one percpu
> variable of blkg, so in theory same bis won't be flushed at the same
> time. And one bis should be touched in just one of stat flush code path
> because of llist_del_all().
>
> So 'bis' still can be thought as being flushed in serialized way.
>
> However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
> so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
> could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
> follows. This should be the only chance for concurrent stats update.

That is why I have in mind. A __blkcg_rstat_flush() can be from 
blkg_release() and another one from the regular cgroup_rstat_flush*().


>
> But, blkg_release() is run in RCU callback, so the previous flush has
> been done, but new flush can come, and other blkg's stat could be added
> with same story above.
>
>> One way to
>> avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
>> use the bisc->lqueued for synchronization.
> I'd avoid the external cgroup lock here.
>
>> In that case, you need to move
>> WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
>> the  blkcg_iostat_update() call with smp_store_release() and replace the
>> READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
>> smp_load_acquire().
> This way looks doable, but I guess it still can't avoid concurrent update on parent
> stat, such as when  __blkcg_rstat_flush() from blkg_release() is
> in-progress, another sibling blkg's bis is added, meantime
> blkcg_rstat_flush() is called.
I realized that the use of cgroup_rstat_cpu_lock or the alternative was 
not safe enough for preventing concurrent parent blkg rstat update.
>
> Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
> do you think of this way?

I am thinking of adding a raw spinlock into blkg and take it when doing 
blkcg_iostat_update(). This can guarantee no concurrent update to rstat 
data. It has to be a raw spinlock as it will be under the 
cgroup_rstat_cpu_lock raw spinlock.

The use of u64_stats_fetch_begin/u64_stats_fetch_retry in retrieving the 
rstat data from bisc can happen concurrently with update without further 
synchronization. However, there can be at most one update at any time.

Cheers,
Longman

>
>
> Thanks,
> Ming
>
Yosry Ahmed May 25, 2023, 2:55 a.m. UTC | #7
On Wed, May 24, 2023 at 7:50 PM Waiman Long <longman@redhat.com> wrote:
>
> On 5/24/23 22:04, Ming Lei wrote:
> > On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
> >> On 5/24/23 11:43, Waiman Long wrote:
> >>> On 5/24/23 00:26, Ming Lei wrote:
> >>>> On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
> >>>>> On 5/23/23 23:51, Ming Lei wrote:
> >>>>>> As noted by Michal, the blkg_iostat_set's in the lockless list hold
> >>>>>> reference to blkg's to protect against their removal. Those blkg's
> >>>>>> hold reference to blkcg. When a cgroup is being destroyed,
> >>>>>> cgroup_rstat_flush() is only called at css_release_work_fn() which
> >>>>>> is called when the blkcg reference count reaches 0. This circular
> >>>>>> dependency will prevent blkcg and some blkgs from being freed after
> >>>>>> they are made offline.
> >>>>>>
> >>>>>> It is less a problem if the cgroup to be destroyed also has other
> >>>>>> controllers like memory that will call cgroup_rstat_flush() which will
> >>>>>> clean up the reference count. If block is the only
> >>>>>> controller that uses
> >>>>>> rstat, these offline blkcg and blkgs may never be freed leaking more
> >>>>>> and more memory over time.
> >>>>>>
> >>>>>> To prevent this potential memory leak:
> >>>>>>
> >>>>>> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> >>>>>> can be added
> >>>>>>
> >>>>>> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> >>>>>> per-cpu stat list since all stats are guaranteed to be consumed before
> >>>>>> releasing blkg instance, and grabbing blkg reference for stats was the
> >>>>>> most fragile part of original patch
> >>>>>>
> >>>>>> Based on Waiman's patch:
> >>>>>>
> >>>>>> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> >>>>>>
> >>>>>>
> >>>>>> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> >>>>>> Cc: Waiman Long <longman@redhat.com>
> >>>>>> Cc: Tejun Heo <tj@kernel.org>
> >>>>>> Cc: mkoutny@suse.com
> >>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
> >>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>>>>> ---
> >>>>>> V2:
> >>>>>>      - remove kernel/cgroup change, and call blkcg_rstat_flush()
> >>>>>>      to flush stat directly
> >>>>>>
> >>>>>>     block/blk-cgroup.c | 29 +++++++++++++++++++++--------
> >>>>>>     1 file changed, 21 insertions(+), 8 deletions(-)
> >>>>>>
> >>>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> >>>>>> index 0ce64dd73cfe..ed0eb8896972 100644
> >>>>>> --- a/block/blk-cgroup.c
> >>>>>> +++ b/block/blk-cgroup.c
> >>>>>> @@ -34,6 +34,8 @@
> >>>>>>     #include "blk-ioprio.h"
> >>>>>>     #include "blk-throttle.h"
> >>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
> >>>>>> +
> >>>>>>     /*
> >>>>>>      * blkcg_pol_mutex protects blkcg_policy[] and policy
> >>>>>> [de]activation.
> >>>>>>      * blkcg_pol_register_mutex nests outside of it and
> >>>>>> synchronizes entire
> >>>>>> @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
> >>>>>>     static void __blkg_release(struct rcu_head *rcu)
> >>>>>>     {
> >>>>>>         struct blkcg_gq *blkg = container_of(rcu, struct
> >>>>>> blkcg_gq, rcu_head);
> >>>>>> +    struct blkcg *blkcg = blkg->blkcg;
> >>>>>> +    int cpu;
> >>>>>>     #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> >>>>>>         WARN_ON(!bio_list_empty(&blkg->async_bios));
> >>>>>>     #endif
> >>>>>> +    /*
> >>>>>> +     * Flush all the non-empty percpu lockless lists before releasing
> >>>>>> +     * us, given these stat belongs to us.
> >>>>>> +     *
> >>>>>> +     * cgroup locks aren't needed here since __blkcg_rstat_flush just
> >>>>>> +     * propagates delta into blkg parent, which is live now.
> >>>>>> +     */
> >>>>>> +    for_each_possible_cpu(cpu)
> >>>>>> +        __blkcg_rstat_flush(blkcg, cpu);
> >>>>>>         /* release the blkcg and parent blkg refs this blkg
> >>>>>> has been holding */
> >>>>>>         css_put(&blkg->blkcg->css);
> >>>>>> @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct
> >>>>>> blkcg_gq *blkg, struct blkg_iostat *cur,
> >>>>>> u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> >>>>>>     }
> >>>>>> -static void blkcg_rstat_flush(struct cgroup_subsys_state
> >>>>>> *css, int cpu)
> >>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
> >>>>>>     {
> >>>>>> -    struct blkcg *blkcg = css_to_blkcg(css);
> >>>>>>         struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> >>>>>>         struct llist_node *lnode;
> >>>>>>         struct blkg_iostat_set *bisc, *next_bisc;
> >>>>>> -    /* Root-level stats are sourced from system-wide IO stats */
> >>>>>> -    if (!cgroup_parent(css->cgroup))
> >>>>>> -        return;
> >>>>>> -
> >>>>>>         rcu_read_lock();
> >>>>>>         lnode = llist_del_all(lhead);
> >>>>>> @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct
> >>>>>> cgroup_subsys_state *css, int cpu)
> >>>>>>             if (parent && parent->parent)
> >>>>>>                 blkcg_iostat_update(parent, &blkg->iostat.cur,
> >>>>>>                             &blkg->iostat.last);
> >>>>>> -        percpu_ref_put(&blkg->refcnt);
> >>>>>>         }
> >>>>>>     out:
> >>>>>>         rcu_read_unlock();
> >>>>>>     }
> >>>>>> +static void blkcg_rstat_flush(struct cgroup_subsys_state
> >>>>>> *css, int cpu)
> >>>>>> +{
> >>>>>> +    /* Root-level stats are sourced from system-wide IO stats */
> >>>>>> +    if (cgroup_parent(css->cgroup))
> >>>>>> +        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
> >>>>>> +}
> >>>>>> +
> >>>>> I think it may not safe to call __blkcg_rstat_flus() directly
> >>>>> without taking
> >>>>> the cgroup_rstat_cpu_lock. That is why I added a helper to
> >>>>> kernel/cgroup/rstat.c in my patch to meet the locking requirement.
> >>>> All stats are removed from llist_del_all(), and the local list is
> >>>> iterated, then each blkg & its parent is touched in
> >>>> __blkcg_rstat_flus(), so
> >>>> can you explain it a bit why cgroup locks are needed? For protecting
> >>>> what?
> >>> You are right. The llist_del_all() call in blkcg_rstat_flush() is
> >>> atomic, so it is safe for concurrent execution which is what the
> >>> cgroup_rstat_cpu_lock protects against. That may not be the case for
> >>> rstat callbacks of other controllers. So I will suggest you to add a
> >>> comment to clarify that point. Other than that, you patch looks good to
> >>> me.
> >>>
> >>> Reviewed: Waiman Long <longman@redhat.com>
> >> After some more thought, I need to retract my reviewed-by tag for now. There
> >> is a slight possibility that blkcg_iostat_update() in blkcg_rstat_flush()
> >> can happen concurrently which will corrupt the sequence count.
> > llist_del_all() moves all 'bis' into one local list, and bis is one percpu
> > variable of blkg, so in theory same bis won't be flushed at the same
> > time. And one bis should be touched in just one of stat flush code path
> > because of llist_del_all().
> >
> > So 'bis' still can be thought as being flushed in serialized way.
> >
> > However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
> > so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
> > could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
> > follows. This should be the only chance for concurrent stats update.
>
> That is why I have in mind. A __blkcg_rstat_flush() can be from
> blkg_release() and another one from the regular cgroup_rstat_flush*().
>
>
> >
> > But, blkg_release() is run in RCU callback, so the previous flush has
> > been done, but new flush can come, and other blkg's stat could be added
> > with same story above.
> >
> >> One way to
> >> avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
> >> use the bisc->lqueued for synchronization.
> > I'd avoid the external cgroup lock here.
> >
> >> In that case, you need to move
> >> WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
> >> the  blkcg_iostat_update() call with smp_store_release() and replace the
> >> READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
> >> smp_load_acquire().
> > This way looks doable, but I guess it still can't avoid concurrent update on parent
> > stat, such as when  __blkcg_rstat_flush() from blkg_release() is
> > in-progress, another sibling blkg's bis is added, meantime
> > blkcg_rstat_flush() is called.
> I realized that the use of cgroup_rstat_cpu_lock or the alternative was
> not safe enough for preventing concurrent parent blkg rstat update.
> >
> > Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
> > do you think of this way?
>
> I am thinking of adding a raw spinlock into blkg and take it when doing
> blkcg_iostat_update(). This can guarantee no concurrent update to rstat
> data. It has to be a raw spinlock as it will be under the
> cgroup_rstat_cpu_lock raw spinlock.

Hi Waiman,

I don't have context about blkcg, but isn't this exactly what
cgroup_rstat_lock does? Is it too expensive to just call
cgroup_rstat_flush () here?

>
> The use of u64_stats_fetch_begin/u64_stats_fetch_retry in retrieving the
> rstat data from bisc can happen concurrently with update without further
> synchronization. However, there can be at most one update at any time.
>
> Cheers,
> Longman
>
> >
> >
> > Thanks,
> > Ming
> >
>
Waiman Long May 25, 2023, 3:04 a.m. UTC | #8
On 5/24/23 22:55, Yosry Ahmed wrote:
> On Wed, May 24, 2023 at 7:50 PM Waiman Long <longman@redhat.com> wrote:
>> On 5/24/23 22:04, Ming Lei wrote:
>>> On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
>>>> On 5/24/23 11:43, Waiman Long wrote:
>>>>> On 5/24/23 00:26, Ming Lei wrote:
>>>>>> On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
>>>>>>> On 5/23/23 23:51, Ming Lei wrote:
>>>>>>>> As noted by Michal, the blkg_iostat_set's in the lockless list hold
>>>>>>>> reference to blkg's to protect against their removal. Those blkg's
>>>>>>>> hold reference to blkcg. When a cgroup is being destroyed,
>>>>>>>> cgroup_rstat_flush() is only called at css_release_work_fn() which
>>>>>>>> is called when the blkcg reference count reaches 0. This circular
>>>>>>>> dependency will prevent blkcg and some blkgs from being freed after
>>>>>>>> they are made offline.
>>>>>>>>
>>>>>>>> It is less a problem if the cgroup to be destroyed also has other
>>>>>>>> controllers like memory that will call cgroup_rstat_flush() which will
>>>>>>>> clean up the reference count. If block is the only
>>>>>>>> controller that uses
>>>>>>>> rstat, these offline blkcg and blkgs may never be freed leaking more
>>>>>>>> and more memory over time.
>>>>>>>>
>>>>>>>> To prevent this potential memory leak:
>>>>>>>>
>>>>>>>> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
>>>>>>>> can be added
>>>>>>>>
>>>>>>>> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
>>>>>>>> per-cpu stat list since all stats are guaranteed to be consumed before
>>>>>>>> releasing blkg instance, and grabbing blkg reference for stats was the
>>>>>>>> most fragile part of original patch
>>>>>>>>
>>>>>>>> Based on Waiman's patch:
>>>>>>>>
>>>>>>>> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
>>>>>>>>
>>>>>>>>
>>>>>>>> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
>>>>>>>> Cc: Waiman Long <longman@redhat.com>
>>>>>>>> Cc: Tejun Heo <tj@kernel.org>
>>>>>>>> Cc: mkoutny@suse.com
>>>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
>>>>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>>>>>>> ---
>>>>>>>> V2:
>>>>>>>>       - remove kernel/cgroup change, and call blkcg_rstat_flush()
>>>>>>>>       to flush stat directly
>>>>>>>>
>>>>>>>>      block/blk-cgroup.c | 29 +++++++++++++++++++++--------
>>>>>>>>      1 file changed, 21 insertions(+), 8 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>>>>>> index 0ce64dd73cfe..ed0eb8896972 100644
>>>>>>>> --- a/block/blk-cgroup.c
>>>>>>>> +++ b/block/blk-cgroup.c
>>>>>>>> @@ -34,6 +34,8 @@
>>>>>>>>      #include "blk-ioprio.h"
>>>>>>>>      #include "blk-throttle.h"
>>>>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
>>>>>>>> +
>>>>>>>>      /*
>>>>>>>>       * blkcg_pol_mutex protects blkcg_policy[] and policy
>>>>>>>> [de]activation.
>>>>>>>>       * blkcg_pol_register_mutex nests outside of it and
>>>>>>>> synchronizes entire
>>>>>>>> @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
>>>>>>>>      static void __blkg_release(struct rcu_head *rcu)
>>>>>>>>      {
>>>>>>>>          struct blkcg_gq *blkg = container_of(rcu, struct
>>>>>>>> blkcg_gq, rcu_head);
>>>>>>>> +    struct blkcg *blkcg = blkg->blkcg;
>>>>>>>> +    int cpu;
>>>>>>>>      #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
>>>>>>>>          WARN_ON(!bio_list_empty(&blkg->async_bios));
>>>>>>>>      #endif
>>>>>>>> +    /*
>>>>>>>> +     * Flush all the non-empty percpu lockless lists before releasing
>>>>>>>> +     * us, given these stat belongs to us.
>>>>>>>> +     *
>>>>>>>> +     * cgroup locks aren't needed here since __blkcg_rstat_flush just
>>>>>>>> +     * propagates delta into blkg parent, which is live now.
>>>>>>>> +     */
>>>>>>>> +    for_each_possible_cpu(cpu)
>>>>>>>> +        __blkcg_rstat_flush(blkcg, cpu);
>>>>>>>>          /* release the blkcg and parent blkg refs this blkg
>>>>>>>> has been holding */
>>>>>>>>          css_put(&blkg->blkcg->css);
>>>>>>>> @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct
>>>>>>>> blkcg_gq *blkg, struct blkg_iostat *cur,
>>>>>>>> u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
>>>>>>>>      }
>>>>>>>> -static void blkcg_rstat_flush(struct cgroup_subsys_state
>>>>>>>> *css, int cpu)
>>>>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
>>>>>>>>      {
>>>>>>>> -    struct blkcg *blkcg = css_to_blkcg(css);
>>>>>>>>          struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
>>>>>>>>          struct llist_node *lnode;
>>>>>>>>          struct blkg_iostat_set *bisc, *next_bisc;
>>>>>>>> -    /* Root-level stats are sourced from system-wide IO stats */
>>>>>>>> -    if (!cgroup_parent(css->cgroup))
>>>>>>>> -        return;
>>>>>>>> -
>>>>>>>>          rcu_read_lock();
>>>>>>>>          lnode = llist_del_all(lhead);
>>>>>>>> @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct
>>>>>>>> cgroup_subsys_state *css, int cpu)
>>>>>>>>              if (parent && parent->parent)
>>>>>>>>                  blkcg_iostat_update(parent, &blkg->iostat.cur,
>>>>>>>>                              &blkg->iostat.last);
>>>>>>>> -        percpu_ref_put(&blkg->refcnt);
>>>>>>>>          }
>>>>>>>>      out:
>>>>>>>>          rcu_read_unlock();
>>>>>>>>      }
>>>>>>>> +static void blkcg_rstat_flush(struct cgroup_subsys_state
>>>>>>>> *css, int cpu)
>>>>>>>> +{
>>>>>>>> +    /* Root-level stats are sourced from system-wide IO stats */
>>>>>>>> +    if (cgroup_parent(css->cgroup))
>>>>>>>> +        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
>>>>>>>> +}
>>>>>>>> +
>>>>>>> I think it may not safe to call __blkcg_rstat_flus() directly
>>>>>>> without taking
>>>>>>> the cgroup_rstat_cpu_lock. That is why I added a helper to
>>>>>>> kernel/cgroup/rstat.c in my patch to meet the locking requirement.
>>>>>> All stats are removed from llist_del_all(), and the local list is
>>>>>> iterated, then each blkg & its parent is touched in
>>>>>> __blkcg_rstat_flus(), so
>>>>>> can you explain it a bit why cgroup locks are needed? For protecting
>>>>>> what?
>>>>> You are right. The llist_del_all() call in blkcg_rstat_flush() is
>>>>> atomic, so it is safe for concurrent execution which is what the
>>>>> cgroup_rstat_cpu_lock protects against. That may not be the case for
>>>>> rstat callbacks of other controllers. So I will suggest you to add a
>>>>> comment to clarify that point. Other than that, you patch looks good to
>>>>> me.
>>>>>
>>>>> Reviewed: Waiman Long <longman@redhat.com>
>>>> After some more thought, I need to retract my reviewed-by tag for now. There
>>>> is a slight possibility that blkcg_iostat_update() in blkcg_rstat_flush()
>>>> can happen concurrently which will corrupt the sequence count.
>>> llist_del_all() moves all 'bis' into one local list, and bis is one percpu
>>> variable of blkg, so in theory same bis won't be flushed at the same
>>> time. And one bis should be touched in just one of stat flush code path
>>> because of llist_del_all().
>>>
>>> So 'bis' still can be thought as being flushed in serialized way.
>>>
>>> However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
>>> so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
>>> could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
>>> follows. This should be the only chance for concurrent stats update.
>> That is why I have in mind. A __blkcg_rstat_flush() can be from
>> blkg_release() and another one from the regular cgroup_rstat_flush*().
>>
>>
>>> But, blkg_release() is run in RCU callback, so the previous flush has
>>> been done, but new flush can come, and other blkg's stat could be added
>>> with same story above.
>>>
>>>> One way to
>>>> avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
>>>> use the bisc->lqueued for synchronization.
>>> I'd avoid the external cgroup lock here.
>>>
>>>> In that case, you need to move
>>>> WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
>>>> the  blkcg_iostat_update() call with smp_store_release() and replace the
>>>> READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
>>>> smp_load_acquire().
>>> This way looks doable, but I guess it still can't avoid concurrent update on parent
>>> stat, such as when  __blkcg_rstat_flush() from blkg_release() is
>>> in-progress, another sibling blkg's bis is added, meantime
>>> blkcg_rstat_flush() is called.
>> I realized that the use of cgroup_rstat_cpu_lock or the alternative was
>> not safe enough for preventing concurrent parent blkg rstat update.
>>> Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
>>> do you think of this way?
>> I am thinking of adding a raw spinlock into blkg and take it when doing
>> blkcg_iostat_update(). This can guarantee no concurrent update to rstat
>> data. It has to be a raw spinlock as it will be under the
>> cgroup_rstat_cpu_lock raw spinlock.
> Hi Waiman,
>
> I don't have context about blkcg, but isn't this exactly what
> cgroup_rstat_lock does? Is it too expensive to just call
> cgroup_rstat_flush () here?

I have thought about that too. However, in my test, just calling 
cgroup_rstat_flush() in blkcg_destroy_blkgs() did not prevent dying 
blkcgs from increasing meaning that there are still some extra 
references blocking its removal. I haven't figured out exactly why that 
is the case. There may still be some races that we have not fully 
understood yet. On the other hand, Ming's patch is verified to not do 
that since it does not take extra blkg references. So I am leaning on 
his patch now. I just have to make sure that there is no concurrent 
rstat update.

Cheers,
Longman
Yosry Ahmed May 25, 2023, 3:08 a.m. UTC | #9
On Wed, May 24, 2023 at 8:04 PM Waiman Long <longman@redhat.com> wrote:
>
> On 5/24/23 22:55, Yosry Ahmed wrote:
> > On Wed, May 24, 2023 at 7:50 PM Waiman Long <longman@redhat.com> wrote:
> >> On 5/24/23 22:04, Ming Lei wrote:
> >>> On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
> >>>> On 5/24/23 11:43, Waiman Long wrote:
> >>>>> On 5/24/23 00:26, Ming Lei wrote:
> >>>>>> On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
> >>>>>>> On 5/23/23 23:51, Ming Lei wrote:
> >>>>>>>> As noted by Michal, the blkg_iostat_set's in the lockless list hold
> >>>>>>>> reference to blkg's to protect against their removal. Those blkg's
> >>>>>>>> hold reference to blkcg. When a cgroup is being destroyed,
> >>>>>>>> cgroup_rstat_flush() is only called at css_release_work_fn() which
> >>>>>>>> is called when the blkcg reference count reaches 0. This circular
> >>>>>>>> dependency will prevent blkcg and some blkgs from being freed after
> >>>>>>>> they are made offline.
> >>>>>>>>
> >>>>>>>> It is less a problem if the cgroup to be destroyed also has other
> >>>>>>>> controllers like memory that will call cgroup_rstat_flush() which will
> >>>>>>>> clean up the reference count. If block is the only
> >>>>>>>> controller that uses
> >>>>>>>> rstat, these offline blkcg and blkgs may never be freed leaking more
> >>>>>>>> and more memory over time.
> >>>>>>>>
> >>>>>>>> To prevent this potential memory leak:
> >>>>>>>>
> >>>>>>>> - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> >>>>>>>> can be added
> >>>>>>>>
> >>>>>>>> - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> >>>>>>>> per-cpu stat list since all stats are guaranteed to be consumed before
> >>>>>>>> releasing blkg instance, and grabbing blkg reference for stats was the
> >>>>>>>> most fragile part of original patch
> >>>>>>>>
> >>>>>>>> Based on Waiman's patch:
> >>>>>>>>
> >>>>>>>> https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> >>>>>>>> Cc: Waiman Long <longman@redhat.com>
> >>>>>>>> Cc: Tejun Heo <tj@kernel.org>
> >>>>>>>> Cc: mkoutny@suse.com
> >>>>>>>> Cc: Yosry Ahmed <yosryahmed@google.com>
> >>>>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>>>>>>> ---
> >>>>>>>> V2:
> >>>>>>>>       - remove kernel/cgroup change, and call blkcg_rstat_flush()
> >>>>>>>>       to flush stat directly
> >>>>>>>>
> >>>>>>>>      block/blk-cgroup.c | 29 +++++++++++++++++++++--------
> >>>>>>>>      1 file changed, 21 insertions(+), 8 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> >>>>>>>> index 0ce64dd73cfe..ed0eb8896972 100644
> >>>>>>>> --- a/block/blk-cgroup.c
> >>>>>>>> +++ b/block/blk-cgroup.c
> >>>>>>>> @@ -34,6 +34,8 @@
> >>>>>>>>      #include "blk-ioprio.h"
> >>>>>>>>      #include "blk-throttle.h"
> >>>>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
> >>>>>>>> +
> >>>>>>>>      /*
> >>>>>>>>       * blkcg_pol_mutex protects blkcg_policy[] and policy
> >>>>>>>> [de]activation.
> >>>>>>>>       * blkcg_pol_register_mutex nests outside of it and
> >>>>>>>> synchronizes entire
> >>>>>>>> @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
> >>>>>>>>      static void __blkg_release(struct rcu_head *rcu)
> >>>>>>>>      {
> >>>>>>>>          struct blkcg_gq *blkg = container_of(rcu, struct
> >>>>>>>> blkcg_gq, rcu_head);
> >>>>>>>> +    struct blkcg *blkcg = blkg->blkcg;
> >>>>>>>> +    int cpu;
> >>>>>>>>      #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> >>>>>>>>          WARN_ON(!bio_list_empty(&blkg->async_bios));
> >>>>>>>>      #endif
> >>>>>>>> +    /*
> >>>>>>>> +     * Flush all the non-empty percpu lockless lists before releasing
> >>>>>>>> +     * us, given these stat belongs to us.
> >>>>>>>> +     *
> >>>>>>>> +     * cgroup locks aren't needed here since __blkcg_rstat_flush just
> >>>>>>>> +     * propagates delta into blkg parent, which is live now.
> >>>>>>>> +     */
> >>>>>>>> +    for_each_possible_cpu(cpu)
> >>>>>>>> +        __blkcg_rstat_flush(blkcg, cpu);
> >>>>>>>>          /* release the blkcg and parent blkg refs this blkg
> >>>>>>>> has been holding */
> >>>>>>>>          css_put(&blkg->blkcg->css);
> >>>>>>>> @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct
> >>>>>>>> blkcg_gq *blkg, struct blkg_iostat *cur,
> >>>>>>>> u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> >>>>>>>>      }
> >>>>>>>> -static void blkcg_rstat_flush(struct cgroup_subsys_state
> >>>>>>>> *css, int cpu)
> >>>>>>>> +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
> >>>>>>>>      {
> >>>>>>>> -    struct blkcg *blkcg = css_to_blkcg(css);
> >>>>>>>>          struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> >>>>>>>>          struct llist_node *lnode;
> >>>>>>>>          struct blkg_iostat_set *bisc, *next_bisc;
> >>>>>>>> -    /* Root-level stats are sourced from system-wide IO stats */
> >>>>>>>> -    if (!cgroup_parent(css->cgroup))
> >>>>>>>> -        return;
> >>>>>>>> -
> >>>>>>>>          rcu_read_lock();
> >>>>>>>>          lnode = llist_del_all(lhead);
> >>>>>>>> @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct
> >>>>>>>> cgroup_subsys_state *css, int cpu)
> >>>>>>>>              if (parent && parent->parent)
> >>>>>>>>                  blkcg_iostat_update(parent, &blkg->iostat.cur,
> >>>>>>>>                              &blkg->iostat.last);
> >>>>>>>> -        percpu_ref_put(&blkg->refcnt);
> >>>>>>>>          }
> >>>>>>>>      out:
> >>>>>>>>          rcu_read_unlock();
> >>>>>>>>      }
> >>>>>>>> +static void blkcg_rstat_flush(struct cgroup_subsys_state
> >>>>>>>> *css, int cpu)
> >>>>>>>> +{
> >>>>>>>> +    /* Root-level stats are sourced from system-wide IO stats */
> >>>>>>>> +    if (cgroup_parent(css->cgroup))
> >>>>>>>> +        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>> I think it may not safe to call __blkcg_rstat_flus() directly
> >>>>>>> without taking
> >>>>>>> the cgroup_rstat_cpu_lock. That is why I added a helper to
> >>>>>>> kernel/cgroup/rstat.c in my patch to meet the locking requirement.
> >>>>>> All stats are removed from llist_del_all(), and the local list is
> >>>>>> iterated, then each blkg & its parent is touched in
> >>>>>> __blkcg_rstat_flus(), so
> >>>>>> can you explain it a bit why cgroup locks are needed? For protecting
> >>>>>> what?
> >>>>> You are right. The llist_del_all() call in blkcg_rstat_flush() is
> >>>>> atomic, so it is safe for concurrent execution which is what the
> >>>>> cgroup_rstat_cpu_lock protects against. That may not be the case for
> >>>>> rstat callbacks of other controllers. So I will suggest you to add a
> >>>>> comment to clarify that point. Other than that, you patch looks good to
> >>>>> me.
> >>>>>
> >>>>> Reviewed: Waiman Long <longman@redhat.com>
> >>>> After some more thought, I need to retract my reviewed-by tag for now. There
> >>>> is a slight possibility that blkcg_iostat_update() in blkcg_rstat_flush()
> >>>> can happen concurrently which will corrupt the sequence count.
> >>> llist_del_all() moves all 'bis' into one local list, and bis is one percpu
> >>> variable of blkg, so in theory same bis won't be flushed at the same
> >>> time. And one bis should be touched in just one of stat flush code path
> >>> because of llist_del_all().
> >>>
> >>> So 'bis' still can be thought as being flushed in serialized way.
> >>>
> >>> However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
> >>> so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
> >>> could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
> >>> follows. This should be the only chance for concurrent stats update.
> >> That is why I have in mind. A __blkcg_rstat_flush() can be from
> >> blkg_release() and another one from the regular cgroup_rstat_flush*().
> >>
> >>
> >>> But, blkg_release() is run in RCU callback, so the previous flush has
> >>> been done, but new flush can come, and other blkg's stat could be added
> >>> with same story above.
> >>>
> >>>> One way to
> >>>> avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
> >>>> use the bisc->lqueued for synchronization.
> >>> I'd avoid the external cgroup lock here.
> >>>
> >>>> In that case, you need to move
> >>>> WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
> >>>> the  blkcg_iostat_update() call with smp_store_release() and replace the
> >>>> READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
> >>>> smp_load_acquire().
> >>> This way looks doable, but I guess it still can't avoid concurrent update on parent
> >>> stat, such as when  __blkcg_rstat_flush() from blkg_release() is
> >>> in-progress, another sibling blkg's bis is added, meantime
> >>> blkcg_rstat_flush() is called.
> >> I realized that the use of cgroup_rstat_cpu_lock or the alternative was
> >> not safe enough for preventing concurrent parent blkg rstat update.
> >>> Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
> >>> do you think of this way?
> >> I am thinking of adding a raw spinlock into blkg and take it when doing
> >> blkcg_iostat_update(). This can guarantee no concurrent update to rstat
> >> data. It has to be a raw spinlock as it will be under the
> >> cgroup_rstat_cpu_lock raw spinlock.
> > Hi Waiman,
> >
> > I don't have context about blkcg, but isn't this exactly what
> > cgroup_rstat_lock does? Is it too expensive to just call
> > cgroup_rstat_flush () here?
>
> I have thought about that too. However, in my test, just calling
> cgroup_rstat_flush() in blkcg_destroy_blkgs() did not prevent dying
> blkcgs from increasing meaning that there are still some extra
> references blocking its removal. I haven't figured out exactly why that
> is the case. There may still be some races that we have not fully
> understood yet. On the other hand, Ming's patch is verified to not do
> that since it does not take extra blkg references. So I am leaning on
> his patch now. I just have to make sure that there is no concurrent
> rstat update.

Oh I didn't mean to change Ming's approach of not taking extra blkg
references, just replacing:

for_each_possible_cpu(cpu)
               __blkcg_rstat_flush(blkcg, cpu);

with:

cgroup_rstat_flush(blkcg->css.cgroup);

in the current patch, so that we get the protection from
cgroup_rstat_lock against concurrent rstat updates.

>
> Cheers,
> Longman
>
Waiman Long May 25, 2023, 3:11 a.m. UTC | #10
On 5/24/23 23:04, Waiman Long wrote:
>> Hi Waiman,
>>
>> I don't have context about blkcg, but isn't this exactly what
>> cgroup_rstat_lock does? Is it too expensive to just call
>> cgroup_rstat_flush () here?
>
> I have thought about that too. However, in my test, just calling 
> cgroup_rstat_flush() in blkcg_destroy_blkgs() did not prevent dying 
> blkcgs from increasing meaning that there are still some extra 
> references blocking its removal. I haven't figured out exactly why 
> that is the case. There may still be some races that we have not fully 
> understood yet. On the other hand, Ming's patch is verified to not do 
> that since it does not take extra blkg references. So I am leaning on 
> his patch now. I just have to make sure that there is no concurrent 
> rstat update.

It is also true that calling cgroup_rstat_flush() is much more 
expensive, especially at the blkg level where there can be many blkgs to 
be destroyed when a blkcg is destroyed.

Cheers,
Longman
Yosry Ahmed May 25, 2023, 3:23 a.m. UTC | #11
On Wed, May 24, 2023 at 8:11 PM Waiman Long <longman@redhat.com> wrote:
>
> On 5/24/23 23:04, Waiman Long wrote:
> >> Hi Waiman,
> >>
> >> I don't have context about blkcg, but isn't this exactly what
> >> cgroup_rstat_lock does? Is it too expensive to just call
> >> cgroup_rstat_flush () here?
> >
> > I have thought about that too. However, in my test, just calling
> > cgroup_rstat_flush() in blkcg_destroy_blkgs() did not prevent dying
> > blkcgs from increasing meaning that there are still some extra
> > references blocking its removal. I haven't figured out exactly why
> > that is the case. There may still be some races that we have not fully
> > understood yet. On the other hand, Ming's patch is verified to not do
> > that since it does not take extra blkg references. So I am leaning on
> > his patch now. I just have to make sure that there is no concurrent
> > rstat update.
>
> It is also true that calling cgroup_rstat_flush() is much more
> expensive, especially at the blkg level where there can be many blkgs to
> be destroyed when a blkcg is destroyed.

I see. For what's it worth, consequent flushes on the same cgroup
should be fast since the first flush should remove it from the rstat
updated list (unless updates come from another css on the same cgroup
while the blkgs are being destroyed).

I am assuming in this case we are sure that the cgroup does not have
children, otherwise we need to flush them as well, not just the
cgroup, so calling __blkcg_rstat_flush() directly might not be a
viable option.

Perhaps we can add an API to cgroup rstat flushing that just
holds/releases the rstat lock, to avoid adding one extra lock on the
blkcg side that does the same job.

Alternatively, we can have a version of cgroup_rstat_flush() that only
flushes one subsystem; but then we will have to iterate the cgroups on
the rstat updated tree without popping them. Another approach would be
to split the rstat updated tree to have one per-subsystem so that they
don't slow each other down; but then we need to figure out what to do
about base stat flush and BPF flushing. Just thinking out loud here :)

>
> Cheers,
> Longman
>
Ming Lei May 25, 2023, 3:47 a.m. UTC | #12
On Wed, May 24, 2023 at 10:50:29PM -0400, Waiman Long wrote:
> On 5/24/23 22:04, Ming Lei wrote:
> > On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
> > > On 5/24/23 11:43, Waiman Long wrote:
> > > > On 5/24/23 00:26, Ming Lei wrote:
> > > > > On Wed, May 24, 2023 at 12:19:57AM -0400, Waiman Long wrote:
> > > > > > On 5/23/23 23:51, Ming Lei wrote:
> > > > > > > As noted by Michal, the blkg_iostat_set's in the lockless list hold
> > > > > > > reference to blkg's to protect against their removal. Those blkg's
> > > > > > > hold reference to blkcg. When a cgroup is being destroyed,
> > > > > > > cgroup_rstat_flush() is only called at css_release_work_fn() which
> > > > > > > is called when the blkcg reference count reaches 0. This circular
> > > > > > > dependency will prevent blkcg and some blkgs from being freed after
> > > > > > > they are made offline.
> > > > > > > 
> > > > > > > It is less a problem if the cgroup to be destroyed also has other
> > > > > > > controllers like memory that will call cgroup_rstat_flush() which will
> > > > > > > clean up the reference count. If block is the only
> > > > > > > controller that uses
> > > > > > > rstat, these offline blkcg and blkgs may never be freed leaking more
> > > > > > > and more memory over time.
> > > > > > > 
> > > > > > > To prevent this potential memory leak:
> > > > > > > 
> > > > > > > - flush blkcg per-cpu stats list in __blkg_release(), when no new stat
> > > > > > > can be added
> > > > > > > 
> > > > > > > - don't grab bio->bi_blkg reference when adding the stats into blkcg's
> > > > > > > per-cpu stat list since all stats are guaranteed to be consumed before
> > > > > > > releasing blkg instance, and grabbing blkg reference for stats was the
> > > > > > > most fragile part of original patch
> > > > > > > 
> > > > > > > Based on Waiman's patch:
> > > > > > > 
> > > > > > > https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/
> > > > > > > 
> > > > > > > 
> > > > > > > Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
> > > > > > > Cc: Waiman Long <longman@redhat.com>
> > > > > > > Cc: Tejun Heo <tj@kernel.org>
> > > > > > > Cc: mkoutny@suse.com
> > > > > > > Cc: Yosry Ahmed <yosryahmed@google.com>
> > > > > > > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > > > > > > ---
> > > > > > > V2:
> > > > > > >      - remove kernel/cgroup change, and call blkcg_rstat_flush()
> > > > > > >      to flush stat directly
> > > > > > > 
> > > > > > >     block/blk-cgroup.c | 29 +++++++++++++++++++++--------
> > > > > > >     1 file changed, 21 insertions(+), 8 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> > > > > > > index 0ce64dd73cfe..ed0eb8896972 100644
> > > > > > > --- a/block/blk-cgroup.c
> > > > > > > +++ b/block/blk-cgroup.c
> > > > > > > @@ -34,6 +34,8 @@
> > > > > > >     #include "blk-ioprio.h"
> > > > > > >     #include "blk-throttle.h"
> > > > > > > +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
> > > > > > > +
> > > > > > >     /*
> > > > > > >      * blkcg_pol_mutex protects blkcg_policy[] and policy
> > > > > > > [de]activation.
> > > > > > >      * blkcg_pol_register_mutex nests outside of it and
> > > > > > > synchronizes entire
> > > > > > > @@ -163,10 +165,21 @@ static void blkg_free(struct blkcg_gq *blkg)
> > > > > > >     static void __blkg_release(struct rcu_head *rcu)
> > > > > > >     {
> > > > > > >         struct blkcg_gq *blkg = container_of(rcu, struct
> > > > > > > blkcg_gq, rcu_head);
> > > > > > > +    struct blkcg *blkcg = blkg->blkcg;
> > > > > > > +    int cpu;
> > > > > > >     #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
> > > > > > >         WARN_ON(!bio_list_empty(&blkg->async_bios));
> > > > > > >     #endif
> > > > > > > +    /*
> > > > > > > +     * Flush all the non-empty percpu lockless lists before releasing
> > > > > > > +     * us, given these stat belongs to us.
> > > > > > > +     *
> > > > > > > +     * cgroup locks aren't needed here since __blkcg_rstat_flush just
> > > > > > > +     * propagates delta into blkg parent, which is live now.
> > > > > > > +     */
> > > > > > > +    for_each_possible_cpu(cpu)
> > > > > > > +        __blkcg_rstat_flush(blkcg, cpu);
> > > > > > >         /* release the blkcg and parent blkg refs this blkg
> > > > > > > has been holding */
> > > > > > >         css_put(&blkg->blkcg->css);
> > > > > > > @@ -951,17 +964,12 @@ static void blkcg_iostat_update(struct
> > > > > > > blkcg_gq *blkg, struct blkg_iostat *cur,
> > > > > > > u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> > > > > > >     }
> > > > > > > -static void blkcg_rstat_flush(struct cgroup_subsys_state
> > > > > > > *css, int cpu)
> > > > > > > +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
> > > > > > >     {
> > > > > > > -    struct blkcg *blkcg = css_to_blkcg(css);
> > > > > > >         struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
> > > > > > >         struct llist_node *lnode;
> > > > > > >         struct blkg_iostat_set *bisc, *next_bisc;
> > > > > > > -    /* Root-level stats are sourced from system-wide IO stats */
> > > > > > > -    if (!cgroup_parent(css->cgroup))
> > > > > > > -        return;
> > > > > > > -
> > > > > > >         rcu_read_lock();
> > > > > > >         lnode = llist_del_all(lhead);
> > > > > > > @@ -991,13 +999,19 @@ static void blkcg_rstat_flush(struct
> > > > > > > cgroup_subsys_state *css, int cpu)
> > > > > > >             if (parent && parent->parent)
> > > > > > >                 blkcg_iostat_update(parent, &blkg->iostat.cur,
> > > > > > >                             &blkg->iostat.last);
> > > > > > > -        percpu_ref_put(&blkg->refcnt);
> > > > > > >         }
> > > > > > >     out:
> > > > > > >         rcu_read_unlock();
> > > > > > >     }
> > > > > > > +static void blkcg_rstat_flush(struct cgroup_subsys_state
> > > > > > > *css, int cpu)
> > > > > > > +{
> > > > > > > +    /* Root-level stats are sourced from system-wide IO stats */
> > > > > > > +    if (cgroup_parent(css->cgroup))
> > > > > > > +        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
> > > > > > > +}
> > > > > > > +
> > > > > > I think it may not safe to call __blkcg_rstat_flus() directly
> > > > > > without taking
> > > > > > the cgroup_rstat_cpu_lock. That is why I added a helper to
> > > > > > kernel/cgroup/rstat.c in my patch to meet the locking requirement.
> > > > > All stats are removed from llist_del_all(), and the local list is
> > > > > iterated, then each blkg & its parent is touched in
> > > > > __blkcg_rstat_flus(), so
> > > > > can you explain it a bit why cgroup locks are needed? For protecting
> > > > > what?
> > > > You are right. The llist_del_all() call in blkcg_rstat_flush() is
> > > > atomic, so it is safe for concurrent execution which is what the
> > > > cgroup_rstat_cpu_lock protects against. That may not be the case for
> > > > rstat callbacks of other controllers. So I will suggest you to add a
> > > > comment to clarify that point. Other than that, you patch looks good to
> > > > me.
> > > > 
> > > > Reviewed: Waiman Long <longman@redhat.com>
> > > After some more thought, I need to retract my reviewed-by tag for now. There
> > > is a slight possibility that blkcg_iostat_update() in blkcg_rstat_flush()
> > > can happen concurrently which will corrupt the sequence count.
> > llist_del_all() moves all 'bis' into one local list, and bis is one percpu
> > variable of blkg, so in theory same bis won't be flushed at the same
> > time. And one bis should be touched in just one of stat flush code path
> > because of llist_del_all().
> > 
> > So 'bis' still can be thought as being flushed in serialized way.
> > 
> > However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
> > so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
> > could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
> > follows. This should be the only chance for concurrent stats update.
> 
> That is why I have in mind. A __blkcg_rstat_flush() can be from
> blkg_release() and another one from the regular cgroup_rstat_flush*().

But the two blkg can't be same because blkg_release() is run from rcu
callback.

> 
> 
> > 
> > But, blkg_release() is run in RCU callback, so the previous flush has
> > been done, but new flush can come, and other blkg's stat could be added
> > with same story above.
> > 
> > > One way to
> > > avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
> > > use the bisc->lqueued for synchronization.
> > I'd avoid the external cgroup lock here.
> > 
> > > In that case, you need to move
> > > WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
> > > the  blkcg_iostat_update() call with smp_store_release() and replace the
> > > READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
> > > smp_load_acquire().
> > This way looks doable, but I guess it still can't avoid concurrent update on parent
> > stat, such as when  __blkcg_rstat_flush() from blkg_release() is
> > in-progress, another sibling blkg's bis is added, meantime
> > blkcg_rstat_flush() is called.
> I realized that the use of cgroup_rstat_cpu_lock or the alternative was not
> safe enough for preventing concurrent parent blkg rstat update.

Indeed.

> > 
> > Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
> > do you think of this way?
> 
> I am thinking of adding a raw spinlock into blkg and take it when doing
> blkcg_iostat_update(). This can guarantee no concurrent update to rstat
> data. It has to be a raw spinlock as it will be under the
> cgroup_rstat_cpu_lock raw spinlock.

We still need to be careful since both child and parent are updated
in the following blkcg_iostat_update().

                if (parent && parent->parent)
                        blkcg_iostat_update(parent, &blkg->iostat.cur,
                                            &blkg->iostat.last);

But the concurrent update on parent blkg can be avoided by holding
parent->stat_lock only.


thanks,
Ming
Waiman Long May 25, 2023, 4:02 a.m. UTC | #13
On 5/24/23 23:47, Ming Lei wrote:
> On Wed, May 24, 2023 at 10:50:29PM -0400, Waiman Long wrote:
>> On 5/24/23 22:04, Ming Lei wrote:
>>> On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
>>>> On 5/24/23 11:43, Waiman Long wrote:
>>>>
>>> llist_del_all() moves all 'bis' into one local list, and bis is one percpu
>>> variable of blkg, so in theory same bis won't be flushed at the same
>>> time. And one bis should be touched in just one of stat flush code path
>>> because of llist_del_all().
>>>
>>> So 'bis' still can be thought as being flushed in serialized way.
>>>
>>> However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
>>> so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
>>> could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
>>> follows. This should be the only chance for concurrent stats update.
>> That is why I have in mind. A __blkcg_rstat_flush() can be from
>> blkg_release() and another one from the regular cgroup_rstat_flush*().
> But the two blkg can't be same because blkg_release() is run from rcu
> callback.

I am not talking about the parent blkg which can have a concurrent 
update. It is probably safe at the blkg level, but the lockless list may 
contain another bisc from a blkg that is not being destroyed.


>
>>
>>> But, blkg_release() is run in RCU callback, so the previous flush has
>>> been done, but new flush can come, and other blkg's stat could be added
>>> with same story above.
>>>
>>>> One way to
>>>> avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
>>>> use the bisc->lqueued for synchronization.
>>> I'd avoid the external cgroup lock here.
>>>
>>>> In that case, you need to move
>>>> WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
>>>> the  blkcg_iostat_update() call with smp_store_release() and replace the
>>>> READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
>>>> smp_load_acquire().
>>> This way looks doable, but I guess it still can't avoid concurrent update on parent
>>> stat, such as when  __blkcg_rstat_flush() from blkg_release() is
>>> in-progress, another sibling blkg's bis is added, meantime
>>> blkcg_rstat_flush() is called.
>> I realized that the use of cgroup_rstat_cpu_lock or the alternative was not
>> safe enough for preventing concurrent parent blkg rstat update.
> Indeed.
>
>>> Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
>>> do you think of this way?
>> I am thinking of adding a raw spinlock into blkg and take it when doing
>> blkcg_iostat_update(). This can guarantee no concurrent update to rstat
>> data. It has to be a raw spinlock as it will be under the
>> cgroup_rstat_cpu_lock raw spinlock.
> We still need to be careful since both child and parent are updated
> in the following blkcg_iostat_update().
>
>                  if (parent && parent->parent)
>                          blkcg_iostat_update(parent, &blkg->iostat.cur,
>                                              &blkg->iostat.last);
>
> But the concurrent update on parent blkg can be avoided by holding
> parent->stat_lock only.

That is true. Perhaps we can use Yosry's suggestion of adding a helper 
to acquire and release cgroup_rstat_lock before calling 
__blkcg_rstat_flush(). That will also guarantee no concurrent update.

Cheers,
Longman
Ming Lei May 25, 2023, 4:05 a.m. UTC | #14
On Thu, May 25, 2023 at 12:02:01AM -0400, Waiman Long wrote:
> 
> On 5/24/23 23:47, Ming Lei wrote:
> > On Wed, May 24, 2023 at 10:50:29PM -0400, Waiman Long wrote:
> > > On 5/24/23 22:04, Ming Lei wrote:
> > > > On Wed, May 24, 2023 at 01:28:41PM -0400, Waiman Long wrote:
> > > > > On 5/24/23 11:43, Waiman Long wrote:
> > > > > 
> > > > llist_del_all() moves all 'bis' into one local list, and bis is one percpu
> > > > variable of blkg, so in theory same bis won't be flushed at the same
> > > > time. And one bis should be touched in just one of stat flush code path
> > > > because of llist_del_all().
> > > > 
> > > > So 'bis' still can be thought as being flushed in serialized way.
> > > > 
> > > > However, blk_cgroup_bio_start() runs concurrently with blkcg_rstat_flush(),
> > > > so once bis->lqueued is cleared in blkcg_rstat_flush(), this same bis
> > > > could be added to the percpu llist and __blkcg_rstat_flush() from blkg_release()
> > > > follows. This should be the only chance for concurrent stats update.
> > > That is why I have in mind. A __blkcg_rstat_flush() can be from
> > > blkg_release() and another one from the regular cgroup_rstat_flush*().
> > But the two blkg can't be same because blkg_release() is run from rcu
> > callback.
> 
> I am not talking about the parent blkg which can have a concurrent update.
> It is probably safe at the blkg level, but the lockless list may contain
> another bisc from a blkg that is not being destroyed.
> 
> 
> > 
> > > 
> > > > But, blkg_release() is run in RCU callback, so the previous flush has
> > > > been done, but new flush can come, and other blkg's stat could be added
> > > > with same story above.
> > > > 
> > > > > One way to
> > > > > avoid that is to synchronize it by cgroup_rstat_cpu_lock. Another way is to
> > > > > use the bisc->lqueued for synchronization.
> > > > I'd avoid the external cgroup lock here.
> > > > 
> > > > > In that case, you need to move
> > > > > WRITE_ONCE(bisc->lqueued, false) in blkcg_rstat_flush() to the end after all
> > > > > the  blkcg_iostat_update() call with smp_store_release() and replace the
> > > > > READ_ONCE(bis->lqueued) check in blk_cgroup_bio_start() with
> > > > > smp_load_acquire().
> > > > This way looks doable, but I guess it still can't avoid concurrent update on parent
> > > > stat, such as when  __blkcg_rstat_flush() from blkg_release() is
> > > > in-progress, another sibling blkg's bis is added, meantime
> > > > blkcg_rstat_flush() is called.
> > > I realized that the use of cgroup_rstat_cpu_lock or the alternative was not
> > > safe enough for preventing concurrent parent blkg rstat update.
> > Indeed.
> > 
> > > > Another way is to add blkcg->stat_lock for covering __blkcg_rstat_flush(), what
> > > > do you think of this way?
> > > I am thinking of adding a raw spinlock into blkg and take it when doing
> > > blkcg_iostat_update(). This can guarantee no concurrent update to rstat
> > > data. It has to be a raw spinlock as it will be under the
> > > cgroup_rstat_cpu_lock raw spinlock.
> > We still need to be careful since both child and parent are updated
> > in the following blkcg_iostat_update().
> > 
> >                  if (parent && parent->parent)
> >                          blkcg_iostat_update(parent, &blkg->iostat.cur,
> >                                              &blkg->iostat.last);
> > 
> > But the concurrent update on parent blkg can be avoided by holding
> > parent->stat_lock only.
> 
> That is true. Perhaps we can use Yosry's suggestion of adding a helper to
> acquire and release cgroup_rstat_lock before calling __blkcg_rstat_flush().
> That will also guarantee no concurrent update.

Yeah, probably add the following API:

void cgroup_rstat_css_flush(struct cgroup_subsys_state *css)
{
    spin_lock_irq(&cgroup_rstat_lock);

    for_each_possible_cpu(cpu) {
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);

        raw_spin_lock_irq(cpu_lock);
        css->ss->css_rstat_flush(css, cpu);
        raw_spin_unlock_irq(cpu_lock);
    }
    spin_unlock_irq(&cgroup_rstat_lock);
}

We should make the fix as simple as possible given it needs backport
to -stable, another choice is to add global blkg_stat_lock. Which
approach is better?


Thanks, 
Ming
diff mbox series

Patch

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..ed0eb8896972 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -34,6 +34,8 @@ 
 #include "blk-ioprio.h"
 #include "blk-throttle.h"
 
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
  * blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -163,10 +165,21 @@  static void blkg_free(struct blkcg_gq *blkg)
 static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+	struct blkcg *blkcg = blkg->blkcg;
+	int cpu;
 
 #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
 #endif
+	/*
+	 * Flush all the non-empty percpu lockless lists before releasing
+	 * us, given these stat belongs to us.
+	 *
+	 * cgroup locks aren't needed here since __blkcg_rstat_flush just
+	 * propagates delta into blkg parent, which is live now.
+	 */
+	for_each_possible_cpu(cpu)
+		__blkcg_rstat_flush(blkcg, cpu);
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -951,17 +964,12 @@  static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 }
 
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
 	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
 	struct llist_node *lnode;
 	struct blkg_iostat_set *bisc, *next_bisc;
 
-	/* Root-level stats are sourced from system-wide IO stats */
-	if (!cgroup_parent(css->cgroup))
-		return;
-
 	rcu_read_lock();
 
 	lnode = llist_del_all(lhead);
@@ -991,13 +999,19 @@  static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
-		percpu_ref_put(&blkg->refcnt);
 	}
 
 out:
 	rcu_read_unlock();
 }
 
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	/* Root-level stats are sourced from system-wide IO stats */
+	if (cgroup_parent(css->cgroup))
+		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
 /*
  * We source root cgroup stats from the system-wide stats to avoid
  * tracking the same information twice and incurring overhead when no
@@ -2075,7 +2089,6 @@  void blk_cgroup_bio_start(struct bio *bio)
 
 		llist_add(&bis->lnode, lhead);
 		WRITE_ONCE(bis->lqueued, true);
-		percpu_ref_get(&bis->blkg->refcnt);
 	}
 
 	u64_stats_update_end_irqrestore(&bis->sync, flags);