
[bpf-next,v8,06/34] bpf: prepare for memcg-based memory accounting for bpf maps

Message ID 20201125030119.2864302-7-guro@fb.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf: switch to memcg-based memory accounting

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 15689 this patch: 15691
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 150 lines checked
netdev/build_allmodconfig_warn fail Errors and warnings before: 15601 this patch: 15603
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Roman Gushchin Nov. 25, 2020, 3 a.m. UTC
In the vast majority of cases, when a process makes a kernel
allocation, its memory cgroup is charged.

Bpf maps, however, can be updated from an interrupt context, and in
that case there is no process which can be charged. This makes the
memory accounting of bpf maps non-trivial.

Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
memcg accounting from interrupt contexts") and b87d8cefe43c
("mm, memcg: rework remote charging API to support nesting")
it's finally possible.

To do so, a pointer to the memory cgroup of the process that created
the map is saved, and this cgroup can be charged for all allocations
made from an interrupt context. This commit introduces two helpers:
bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
the bpf code for accounted memory allocations, in both process and
interrupt contexts. In an interrupt context they use the saved
memory cgroup; otherwise the current cgroup is charged.
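
For illustration, a map implementation would call these helpers in place
of plain kmalloc_node()/__alloc_percpu_gfp(). A hypothetical hashtab-style
call site (not part of this patch; the surrounding field names are only
illustrative) could look like:

	/*
	 * With this patch, in an interrupt context this charges map->memcg;
	 * otherwise the current task's memcg is charged.
	 */
	l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
				     GFP_ATOMIC | __GFP_NOWARN,
				     htab->map.numa_node);
	if (!l_new)
		return ERR_PTR(-ENOMEM);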

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/bpf.h  | 26 +++++++++++++++
 kernel/bpf/syscall.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

Comments

Daniel Borkmann Nov. 26, 2020, 12:21 a.m. UTC | #1
On 11/25/20 4:00 AM, Roman Gushchin wrote:
> In the absolute majority of cases if a process is making a kernel
> allocation, it's memory cgroup is getting charged.
> 
> Bpf maps can be updated from an interrupt context and in such
> case there is no process which can be charged. It makes the memory
> accounting of bpf maps non-trivial.
> 
> Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
> memcg accounting from interrupt contexts") and b87d8cefe43c
> ("mm, memcg: rework remote charging API to support nesting")
> it's finally possible.
> 
> To do it, a pointer to the memory cgroup of the process, which created
> the map, is saved, and this cgroup can be charged for all allocations
> made from an interrupt context. This commit introduces 2 helpers:
> bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
> the bpf code for accounted memory allocations, both in the process and
> interrupt contexts. In the interrupt context they're using the saved
> memory cgroup, otherwise the current cgroup is getting charged.
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>

Thanks for updating the cover letter; replying in this series instead
on one more item that came to mind:

[...]
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index f3fe9f53f93c..4154c616788c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -31,6 +31,8 @@
>   #include <linux/poll.h>
>   #include <linux/bpf-netns.h>
>   #include <linux/rcupdate_trace.h>
> +#include <linux/memcontrol.h>
> +#include <linux/sched/mm.h>
>   
>   #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
>   			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
> @@ -456,6 +458,77 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
>   		__release(&map_idr_lock);
>   }
>   
> +#ifdef CONFIG_MEMCG_KMEM
> +static void bpf_map_save_memcg(struct bpf_map *map)
> +{
> +	map->memcg = get_mem_cgroup_from_mm(current->mm);
> +}
> +
> +static void bpf_map_release_memcg(struct bpf_map *map)
> +{
> +	mem_cgroup_put(map->memcg);
> +}
> +
> +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
> +			   int node)
> +{
> +	struct mem_cgroup *old_memcg;
> +	bool in_interrupt;
> +	void *ptr;
> +
> +	/*
> +	 * If the memory allocation is performed from an interrupt context,
> +	 * the memory cgroup to charge can't be determined from the context
> +	 * of the current task. Instead, we charge the memory cgroup, which
> +	 * contained the process created the map.
> +	 */
> +	in_interrupt = in_interrupt();
> +	if (in_interrupt)
> +		old_memcg = set_active_memcg(map->memcg);
> +
> +	ptr = kmalloc_node(size, flags, node);
> +
> +	if (in_interrupt)
> +		set_active_memcg(old_memcg);
> +
> +	return ptr;
> +}
> +
> +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
> +				    size_t align, gfp_t gfp)
> +{
> +	struct mem_cgroup *old_memcg;
> +	bool in_interrupt;
> +	void *ptr;
> +
> +	/*
> +	 * If the memory allocation is performed from an interrupt context,
> +	 * the memory cgroup to charge can't be determined from the context
> +	 * of the current task. Instead, we charge the memory cgroup, which
> +	 * contained the process created the map.
> +	 */
> +	in_interrupt = in_interrupt();
> +	if (in_interrupt)
> +		old_memcg = set_active_memcg(map->memcg);
> +
> +	ptr = __alloc_percpu_gfp(size, align, gfp);
> +
> +	if (in_interrupt)
> +		set_active_memcg(old_memcg);

For this one and the bpf_map_kmalloc_node() one above, wouldn't it make more sense to
perform the temporary memcg override unconditionally?

	old_memcg = set_active_memcg(map->memcg);
	ptr = kmalloc_node(size, flags, node);
	set_active_memcg(old_memcg);

I think the semantics are otherwise a bit weird and the charging unpredictable;
this way it would /always/ be accounted against the prog in the memcg that
originally created the map.

E.g. maps could be shared between progs attached to, say, XDP/tc, where in_interrupt()
holds true, and progs attached to skb-cgroup/egress, where we're still in process
context. So some part of the memory is charged against the original map's memcg and
some other part against the current process' memcg, which seems odd, no? Or, for example,
if we start to run a tracing BPF prog which updates state in a BPF map ... that tracing
prog now interferes with processes in other memcgs, which may not be intentional & could
lead to potential failures there, as opposed to when the tracing prog is not run. My concern
is that the semantics are not quite clear and the behavior unpredictable compared to always
charging against map->memcg.

Similarly, what if an orchestration prog creates dedicated memcg(s) for maps with
individual limits ... the assumed behavior (imho) would be that whatever memory is
accounted on the map it can be accurately retrieved from there & similarly limits
enforced, no? It seems that would not be the case currently.

Thoughts?

> +	return ptr;
> +}
> +
> +#else
> +static void bpf_map_save_memcg(struct bpf_map *map)
> +{
> +}
> +
> +static void bpf_map_release_memcg(struct bpf_map *map)
> +{
> +}
> +#endif
> +
>   /* called from workqueue */
>   static void bpf_map_free_deferred(struct work_struct *work)
>   {
> @@ -464,6 +537,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
>   
>   	bpf_map_charge_move(&mem, &map->memory);
>   	security_bpf_map_free(map);
> +	bpf_map_release_memcg(map);
>   	/* implementation dependent freeing */
>   	map->ops->map_free(map);
>   	bpf_map_charge_finish(&mem);
> @@ -875,6 +949,8 @@ static int map_create(union bpf_attr *attr)
>   	if (err)
>   		goto free_map_sec;
>   
> +	bpf_map_save_memcg(map);
> +
>   	err = bpf_map_new_fd(map, f_flags);
>   	if (err < 0) {
>   		/* failed to allocate fd.
>
Roman Gushchin Nov. 26, 2020, 2:30 a.m. UTC | #2
On Thu, Nov 26, 2020 at 01:21:41AM +0100, Daniel Borkmann wrote:
> On 11/25/20 4:00 AM, Roman Gushchin wrote:
> > In the absolute majority of cases if a process is making a kernel
> > allocation, it's memory cgroup is getting charged.
> > 
> > Bpf maps can be updated from an interrupt context and in such
> > case there is no process which can be charged. It makes the memory
> > accounting of bpf maps non-trivial.
> > 
> > Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
> > memcg accounting from interrupt contexts") and b87d8cefe43c
> > ("mm, memcg: rework remote charging API to support nesting")
> > it's finally possible.
> > 
> > To do it, a pointer to the memory cgroup of the process, which created
> > the map, is saved, and this cgroup can be charged for all allocations
> > made from an interrupt context. This commit introduces 2 helpers:
> > bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
> > the bpf code for accounted memory allocations, both in the process and
> > interrupt contexts. In the interrupt context they're using the saved
> > memory cgroup, otherwise the current cgroup is getting charged.
> > 
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> 
> Thanks for updating the cover letter; replying in this series instead
> on one more item that came to mind:
> 
> [...]
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index f3fe9f53f93c..4154c616788c 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -31,6 +31,8 @@
> >   #include <linux/poll.h>
> >   #include <linux/bpf-netns.h>
> >   #include <linux/rcupdate_trace.h>
> > +#include <linux/memcontrol.h>
> > +#include <linux/sched/mm.h>
> >   #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
> >   			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
> > @@ -456,6 +458,77 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
> >   		__release(&map_idr_lock);
> >   }
> > +#ifdef CONFIG_MEMCG_KMEM
> > +static void bpf_map_save_memcg(struct bpf_map *map)
> > +{
> > +	map->memcg = get_mem_cgroup_from_mm(current->mm);
> > +}
> > +
> > +static void bpf_map_release_memcg(struct bpf_map *map)
> > +{
> > +	mem_cgroup_put(map->memcg);
> > +}
> > +
> > +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
> > +			   int node)
> > +{
> > +	struct mem_cgroup *old_memcg;
> > +	bool in_interrupt;
> > +	void *ptr;
> > +
> > +	/*
> > +	 * If the memory allocation is performed from an interrupt context,
> > +	 * the memory cgroup to charge can't be determined from the context
> > +	 * of the current task. Instead, we charge the memory cgroup, which
> > +	 * contained the process created the map.
> > +	 */
> > +	in_interrupt = in_interrupt();
> > +	if (in_interrupt)
> > +		old_memcg = set_active_memcg(map->memcg);
> > +
> > +	ptr = kmalloc_node(size, flags, node);
> > +
> > +	if (in_interrupt)
> > +		set_active_memcg(old_memcg);
> > +
> > +	return ptr;
> > +}
> > +
> > +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
> > +				    size_t align, gfp_t gfp)
> > +{
> > +	struct mem_cgroup *old_memcg;
> > +	bool in_interrupt;
> > +	void *ptr;
> > +
> > +	/*
> > +	 * If the memory allocation is performed from an interrupt context,
> > +	 * the memory cgroup to charge can't be determined from the context
> > +	 * of the current task. Instead, we charge the memory cgroup, which
> > +	 * contained the process created the map.
> > +	 */
> > +	in_interrupt = in_interrupt();
> > +	if (in_interrupt)
> > +		old_memcg = set_active_memcg(map->memcg);
> > +
> > +	ptr = __alloc_percpu_gfp(size, align, gfp);
> > +
> > +	if (in_interrupt)
> > +		set_active_memcg(old_memcg);
> 
> For this and above bpf_map_kmalloc_node() one, wouldn't it make more sense to
> perform the temporary memcg unconditionally?
> 
> 	old_memcg = set_active_memcg(map->memcg);
> 	ptr = kmalloc_node(size, flags, node);
> 	set_active_memcg(old_memcg);
> 
> I think the semantics are otherwise a bit weird and the charging unpredictable;
> this way it would /always/ be accounted against the prog in the memcg that
> originally created the map.
> 
> E.g. maps could be shared between progs attached to, say, XDP/tc where in_interrupt()
> holds true with progs attached to skb-cgroup/egress where we're still in process
> context. So some part of the memory is charged against the original map's memcg and
> some other part against the current process' memcg which seems odd, no? Or, for example,
> if we start to run a tracing BPF prog which updates state in a BPF map ... that tracing
> prog now interferes with processes in other memcgs which may not be intentional & could
> lead to potential failures there as opposed when the tracing prog is not run. My concern
> is that the semantics are not quite clear and behavior unpredictable compared to always
> charging against map->memcg.
> 
> Similarly, what if an orchestration prog creates dedicated memcg(s) for maps with
> individual limits ... the assumed behavior (imho) would be that whatever memory is
> accounted on the map it can be accurately retrieved from there & similarly limits
> enforced, no? It seems that would not be the case currently.
> 
> Thoughts?

I did consider this option. There are pros and cons. In general we tend to charge the cgroup
which actually allocates the memory, and I decided to stick with this rule. I agree, it's fairly
easy to come up with arguments why always charging the map creator is better. The opposite is
also true: it's not clear why bpf is different here. So I'm fine with both options; if there
is a wide consensus, I'm happy to switch to the other option. In general, I believe that
the current scheme is more flexible: if someone wants to pay in advance, they are free to preallocate
the map. Otherwise it's up to whoever wants to populate it.

Thanks!
Toke Høiland-Jørgensen Nov. 26, 2020, 9:56 a.m. UTC | #3
Roman Gushchin <guro@fb.com> writes:

> On Thu, Nov 26, 2020 at 01:21:41AM +0100, Daniel Borkmann wrote:
>> On 11/25/20 4:00 AM, Roman Gushchin wrote:
>> > In the absolute majority of cases if a process is making a kernel
>> > allocation, it's memory cgroup is getting charged.
>> > 
>> > Bpf maps can be updated from an interrupt context and in such
>> > case there is no process which can be charged. It makes the memory
>> > accounting of bpf maps non-trivial.
>> > 
>> > Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
>> > memcg accounting from interrupt contexts") and b87d8cefe43c
>> > ("mm, memcg: rework remote charging API to support nesting")
>> > it's finally possible.
>> > 
>> > To do it, a pointer to the memory cgroup of the process, which created
>> > the map, is saved, and this cgroup can be charged for all allocations
>> > made from an interrupt context. This commit introduces 2 helpers:
>> > bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
>> > the bpf code for accounted memory allocations, both in the process and
>> > interrupt contexts. In the interrupt context they're using the saved
>> > memory cgroup, otherwise the current cgroup is getting charged.
>> > 
>> > Signed-off-by: Roman Gushchin <guro@fb.com>
>> 
>> Thanks for updating the cover letter; replying in this series instead
>> on one more item that came to mind:
>> 
>> [...]
>> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> > index f3fe9f53f93c..4154c616788c 100644
>> > --- a/kernel/bpf/syscall.c
>> > +++ b/kernel/bpf/syscall.c
>> > @@ -31,6 +31,8 @@
>> >   #include <linux/poll.h>
>> >   #include <linux/bpf-netns.h>
>> >   #include <linux/rcupdate_trace.h>
>> > +#include <linux/memcontrol.h>
>> > +#include <linux/sched/mm.h>
>> >   #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
>> >   			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
>> > @@ -456,6 +458,77 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
>> >   		__release(&map_idr_lock);
>> >   }
>> > +#ifdef CONFIG_MEMCG_KMEM
>> > +static void bpf_map_save_memcg(struct bpf_map *map)
>> > +{
>> > +	map->memcg = get_mem_cgroup_from_mm(current->mm);
>> > +}
>> > +
>> > +static void bpf_map_release_memcg(struct bpf_map *map)
>> > +{
>> > +	mem_cgroup_put(map->memcg);
>> > +}
>> > +
>> > +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
>> > +			   int node)
>> > +{
>> > +	struct mem_cgroup *old_memcg;
>> > +	bool in_interrupt;
>> > +	void *ptr;
>> > +
>> > +	/*
>> > +	 * If the memory allocation is performed from an interrupt context,
>> > +	 * the memory cgroup to charge can't be determined from the context
>> > +	 * of the current task. Instead, we charge the memory cgroup, which
>> > +	 * contained the process created the map.
>> > +	 */
>> > +	in_interrupt = in_interrupt();
>> > +	if (in_interrupt)
>> > +		old_memcg = set_active_memcg(map->memcg);
>> > +
>> > +	ptr = kmalloc_node(size, flags, node);
>> > +
>> > +	if (in_interrupt)
>> > +		set_active_memcg(old_memcg);
>> > +
>> > +	return ptr;
>> > +}
>> > +
>> > +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
>> > +				    size_t align, gfp_t gfp)
>> > +{
>> > +	struct mem_cgroup *old_memcg;
>> > +	bool in_interrupt;
>> > +	void *ptr;
>> > +
>> > +	/*
>> > +	 * If the memory allocation is performed from an interrupt context,
>> > +	 * the memory cgroup to charge can't be determined from the context
>> > +	 * of the current task. Instead, we charge the memory cgroup, which
>> > +	 * contained the process created the map.
>> > +	 */
>> > +	in_interrupt = in_interrupt();
>> > +	if (in_interrupt)
>> > +		old_memcg = set_active_memcg(map->memcg);
>> > +
>> > +	ptr = __alloc_percpu_gfp(size, align, gfp);
>> > +
>> > +	if (in_interrupt)
>> > +		set_active_memcg(old_memcg);
>> 
>> For this and above bpf_map_kmalloc_node() one, wouldn't it make more sense to
>> perform the temporary memcg unconditionally?
>> 
>> 	old_memcg = set_active_memcg(map->memcg);
>> 	ptr = kmalloc_node(size, flags, node);
>> 	set_active_memcg(old_memcg);
>> 
>> I think the semantics are otherwise a bit weird and the charging unpredictable;
>> this way it would /always/ be accounted against the prog in the memcg that
>> originally created the map.
>> 
>> E.g. maps could be shared between progs attached to, say, XDP/tc where in_interrupt()
>> holds true with progs attached to skb-cgroup/egress where we're still in process
>> context. So some part of the memory is charged against the original map's memcg and
>> some other part against the current process' memcg which seems odd, no? Or, for example,
>> if we start to run a tracing BPF prog which updates state in a BPF map ... that tracing
>> prog now interferes with processes in other memcgs which may not be intentional & could
>> lead to potential failures there as opposed when the tracing prog is not run. My concern
>> is that the semantics are not quite clear and behavior unpredictable compared to always
>> charging against map->memcg.
>> 
>> Similarly, what if an orchestration prog creates dedicated memcg(s) for maps with
>> individual limits ... the assumed behavior (imho) would be that whatever memory is
>> accounted on the map it can be accurately retrieved from there & similarly limits
>> enforced, no? It seems that would not be the case currently.
>> 
>> Thoughts?
>
> I did consider this option. There are pros and cons. In general we
> tend to charge the cgroup which actually allocates the memory, and I
> decided to stick with this rule. I agree, it's fairly easy to come
> with arguments why always charging the map creator is better. The
> opposite is also true: it's not clear why bpf is different here. So
> I'm fine with both options, if there is a wide consensus, I'm happy to
> switch to the other option. In general, I believe that the current
> scheme is more flexible: if someone want to pay in advance, they are
> free to preallocate the map. Otherwise it's up to whoever wants to
> populate it.

I think I agree with Daniel here: conceptually the memory used by a map
ought to belong to that map's memcg. I can see how the other scheme can
be more flexible, but as Daniel points out it seems like it can lead to
hard-to-debug errors...

(Side note: I'm really excited about this work in general! The ulimit
thing has been a major pain...)

-Toke
Roman Gushchin Nov. 26, 2020, 4:06 p.m. UTC | #4
On Thu, Nov 26, 2020 at 10:56:12AM +0100, Toke Høiland-Jørgensen wrote:
> Roman Gushchin <guro@fb.com> writes:
> 
> > On Thu, Nov 26, 2020 at 01:21:41AM +0100, Daniel Borkmann wrote:
> >> On 11/25/20 4:00 AM, Roman Gushchin wrote:
> >> > In the absolute majority of cases if a process is making a kernel
> >> > allocation, it's memory cgroup is getting charged.
> >> > 
> >> > Bpf maps can be updated from an interrupt context and in such
> >> > case there is no process which can be charged. It makes the memory
> >> > accounting of bpf maps non-trivial.
> >> > 
> >> > Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
> >> > memcg accounting from interrupt contexts") and b87d8cefe43c
> >> > ("mm, memcg: rework remote charging API to support nesting")
> >> > it's finally possible.
> >> > 
> >> > To do it, a pointer to the memory cgroup of the process, which created
> >> > the map, is saved, and this cgroup can be charged for all allocations
> >> > made from an interrupt context. This commit introduces 2 helpers:
> >> > bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
> >> > the bpf code for accounted memory allocations, both in the process and
> >> > interrupt contexts. In the interrupt context they're using the saved
> >> > memory cgroup, otherwise the current cgroup is getting charged.
> >> > 
> >> > Signed-off-by: Roman Gushchin <guro@fb.com>
> >> 
> >> Thanks for updating the cover letter; replying in this series instead
> >> on one more item that came to mind:
> >> 
> >> [...]
> >> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> >> > index f3fe9f53f93c..4154c616788c 100644
> >> > --- a/kernel/bpf/syscall.c
> >> > +++ b/kernel/bpf/syscall.c
> >> > @@ -31,6 +31,8 @@
> >> >   #include <linux/poll.h>
> >> >   #include <linux/bpf-netns.h>
> >> >   #include <linux/rcupdate_trace.h>
> >> > +#include <linux/memcontrol.h>
> >> > +#include <linux/sched/mm.h>
> >> >   #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
> >> >   			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
> >> > @@ -456,6 +458,77 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
> >> >   		__release(&map_idr_lock);
> >> >   }
> >> > +#ifdef CONFIG_MEMCG_KMEM
> >> > +static void bpf_map_save_memcg(struct bpf_map *map)
> >> > +{
> >> > +	map->memcg = get_mem_cgroup_from_mm(current->mm);
> >> > +}
> >> > +
> >> > +static void bpf_map_release_memcg(struct bpf_map *map)
> >> > +{
> >> > +	mem_cgroup_put(map->memcg);
> >> > +}
> >> > +
> >> > +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
> >> > +			   int node)
> >> > +{
> >> > +	struct mem_cgroup *old_memcg;
> >> > +	bool in_interrupt;
> >> > +	void *ptr;
> >> > +
> >> > +	/*
> >> > +	 * If the memory allocation is performed from an interrupt context,
> >> > +	 * the memory cgroup to charge can't be determined from the context
> >> > +	 * of the current task. Instead, we charge the memory cgroup, which
> >> > +	 * contained the process created the map.
> >> > +	 */
> >> > +	in_interrupt = in_interrupt();
> >> > +	if (in_interrupt)
> >> > +		old_memcg = set_active_memcg(map->memcg);
> >> > +
> >> > +	ptr = kmalloc_node(size, flags, node);
> >> > +
> >> > +	if (in_interrupt)
> >> > +		set_active_memcg(old_memcg);
> >> > +
> >> > +	return ptr;
> >> > +}
> >> > +
> >> > +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
> >> > +				    size_t align, gfp_t gfp)
> >> > +{
> >> > +	struct mem_cgroup *old_memcg;
> >> > +	bool in_interrupt;
> >> > +	void *ptr;
> >> > +
> >> > +	/*
> >> > +	 * If the memory allocation is performed from an interrupt context,
> >> > +	 * the memory cgroup to charge can't be determined from the context
> >> > +	 * of the current task. Instead, we charge the memory cgroup, which
> >> > +	 * contained the process created the map.
> >> > +	 */
> >> > +	in_interrupt = in_interrupt();
> >> > +	if (in_interrupt)
> >> > +		old_memcg = set_active_memcg(map->memcg);
> >> > +
> >> > +	ptr = __alloc_percpu_gfp(size, align, gfp);
> >> > +
> >> > +	if (in_interrupt)
> >> > +		set_active_memcg(old_memcg);
> >> 
> >> For this and above bpf_map_kmalloc_node() one, wouldn't it make more sense to
> >> perform the temporary memcg unconditionally?
> >> 
> >> 	old_memcg = set_active_memcg(map->memcg);
> >> 	ptr = kmalloc_node(size, flags, node);
> >> 	set_active_memcg(old_memcg);
> >> 
> >> I think the semantics are otherwise a bit weird and the charging unpredictable;
> >> this way it would /always/ be accounted against the prog in the memcg that
> >> originally created the map.
> >> 
> >> E.g. maps could be shared between progs attached to, say, XDP/tc where in_interrupt()
> >> holds true with progs attached to skb-cgroup/egress where we're still in process
> >> context. So some part of the memory is charged against the original map's memcg and
> >> some other part against the current process' memcg which seems odd, no? Or, for example,
> >> if we start to run a tracing BPF prog which updates state in a BPF map ... that tracing
> >> prog now interferes with processes in other memcgs which may not be intentional & could
> >> lead to potential failures there as opposed when the tracing prog is not run. My concern
> >> is that the semantics are not quite clear and behavior unpredictable compared to always
> >> charging against map->memcg.
> >> 
> >> Similarly, what if an orchestration prog creates dedicated memcg(s) for maps with
> >> individual limits ... the assumed behavior (imho) would be that whatever memory is
> >> accounted on the map it can be accurately retrieved from there & similarly limits
> >> enforced, no? It seems that would not be the case currently.
> >> 
> >> Thoughts?
> >
> > I did consider this option. There are pros and cons. In general we
> > tend to charge the cgroup which actually allocates the memory, and I
> > decided to stick with this rule. I agree, it's fairly easy to come
> > with arguments why always charging the map creator is better. The
> > opposite is also true: it's not clear why bpf is different here. So
> > I'm fine with both options, if there is a wide consensus, I'm happy to
> > switch to the other option. In general, I believe that the current
> > scheme is more flexible: if someone want to pay in advance, they are
> > free to preallocate the map. Otherwise it's up to whoever wants to
> > populate it.
> 
> I think I agree with Daniel here: conceptually the memory used by a map
> ought to belong to that map's memcg. I can see how the other scheme can
> be more flexible, but as Daniel points out it seems like it can lead to
> hard-to-debug errors...

Ok, I'll switch to always charging the map's memcg in the next version.

> 
> (Side note: I'm really excited about this work in general! The ulimit
> thing has been a major pain...)

Great! Thanks!
Alexei Starovoitov Nov. 26, 2020, 5:12 p.m. UTC | #5
On Wed, Nov 25, 2020 at 6:30 PM Roman Gushchin <guro@fb.com> wrote:
>
> I did consider this option. There are pros and cons. In general we tend to charge the cgroup
> which actually allocates the memory, and I decided to stick with this rule. I agree, it's fairly
> easy to come with arguments why always charging the map creator is better. The opposite is
> also true: it's not clear why bpf is different here. So I'm fine with both options, if there
> is a wide consensus, I'm happy to switch to the other option. In general, I believe that
> the current scheme is more flexible.

I don't understand the 'more flexible' part.
The current_memcg or map_memcg approach makes it less predictable.
pre-alloc vs not is somewhat orthogonal.
I've grepped through the kernel where set_active_memcg() is used
and couldn't find a conditional pattern of its usage.
If memcg is known it's used. I couldn't come up with the use case where
using current memcg is the more correct thing to do.

> In general we tend to charge the cgroup which actually allocates the memory

That makes sense where the allocation is driven by the user process,
like user space doing a syscall, where all kernel allocations would be
charged to the memcg of that process.
But bpf tracing allocations are not something that the user process requested
the kernel to do. It's like one user space process tapped into another.
Arguably, when the bpf prog is running, two user processes are active:
the one that created the map and loaded the prog, and the one that is being traced.
I think there will be cases where a bpf prog will request the kernel to allocate
memory on behalf of the process being traced, but that memory should be
given back to the process and be accessible by it in some form.
For example, a bpf prog could ask the kernel to grow the heap of that process or
trigger readahead.
In such a case current_memcg would be the right thing to charge.
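
For reference, with the unconditional override Daniel suggested (and which
Roman agreed above to adopt for the next version), bpf_map_kmalloc_node()
would reduce to roughly the following — a sketch of that direction, not the
actual follow-up patch:

	void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size,
				   gfp_t flags, int node)
	{
		struct mem_cgroup *old_memcg;
		void *ptr;

		/* always charge the memcg of the process that created the map */
		old_memcg = set_active_memcg(map->memcg);
		ptr = kmalloc_node(size, flags, node);
		set_active_memcg(old_memcg);

		return ptr;
	}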

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1bcb6d7345c..b11436cb9e3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -20,6 +20,7 @@ 
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/capability.h>
+#include <linux/slab.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -37,6 +38,7 @@  struct bpf_iter_aux_info;
 struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
+struct mem_cgroup;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -161,6 +163,9 @@  struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	struct bpf_map_memory memory;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg;
+#endif
 	char name[BPF_OBJ_NAME_LEN];
 	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
@@ -1240,6 +1245,27 @@  int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
+#ifdef CONFIG_MEMCG_KMEM
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node);
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t gfp);
+#else
+static inline void *
+bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+		     int node)
+{
+	return kmalloc_node(size, flags, node);
+}
+
+static inline void __percpu *
+bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
+		     gfp_t gfp)
+{
+	return __alloc_percpu_gfp(size, align, gfp);
+}
+#endif
+
 extern int sysctl_unprivileged_bpf_disabled;
 
 static inline bool bpf_allow_ptr_leaks(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f3fe9f53f93c..4154c616788c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -31,6 +31,8 @@ 
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/mm.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -456,6 +458,77 @@  void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 		__release(&map_idr_lock);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+	map->memcg = get_mem_cgroup_from_mm(current->mm);
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+	mem_cgroup_put(map->memcg);
+}
+
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	void *ptr;
+
+	/*
+	 * If the memory allocation is performed from an interrupt context,
+	 * the memory cgroup to charge can't be determined from the context
+	 * of the current task. Instead, we charge the memory cgroup which
+	 * contained the process that created the map.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ptr = kmalloc_node(size, flags, node);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t gfp)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	void *ptr;
+
+	/*
+	 * If the memory allocation is performed from an interrupt context,
+	 * the memory cgroup to charge can't be determined from the context
+	 * of the current task. Instead, we charge the memory cgroup which
+	 * contained the process that created the map.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ptr = __alloc_percpu_gfp(size, align, gfp);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+}
+#endif
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -464,6 +537,7 @@  static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
+	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 	bpf_map_charge_finish(&mem);
@@ -875,6 +949,8 @@  static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_sec;
 
+	bpf_map_save_memcg(map);
+
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.