Message ID: 20220619155032.32515-8-laoar.shao@gmail.com (mailing list archive)
State:      New
Series:     bpf, mm: Recharge pages when reuse bpf map
Hello,

On Sun, Jun 19, 2022 at 03:50:29PM +0000, Yafang Shao wrote:
> This patch introduces a helper to recharge the corresponding pages of
> a given percpu address. It is similar to how a kmalloc'ed address is
> recharged.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  include/linux/percpu.h |  1 +
>  mm/percpu.c            | 98 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 99 insertions(+)
>
> diff --git a/include/linux/percpu.h b/include/linux/percpu.h
> index f1ec5ad1351c..e88429410179 100644
> --- a/include/linux/percpu.h
> +++ b/include/linux/percpu.h
> @@ -128,6 +128,7 @@ extern void __init setup_per_cpu_areas(void);
>  extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
>  extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
>  extern void free_percpu(void __percpu *__pdata);
> +bool recharge_percpu(void __percpu *__pdata, int step);

Nit: can you add extern to keep the file consistent.

>  extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
>
[...]
> +        switch (step) {
> +        case MEMCG_KMEM_PRE_CHARGE:
[...]
> +        case MEMCG_KMEM_UNCHARGE:
[...]
> +        case MEMCG_KMEM_POST_CHARGE:
[...]
> +        case MEMCG_KMEM_CHARGE_ERR:
[...]
> +                break;
> +        }

I'm not really the biggest fan of this step stuff. I see why you're
doing it because you want to do all or nothing recharging the percpu bpf
maps. Is there a way to have percpu own this logic and attempt to do all
or nothing instead? I realize bpf is likely the largest percpu user, but
the recharge_percpu() api seems to be more generic than forcing
potential other users in the future to open code it.

> +
> +        spin_unlock_irqrestore(&pcpu_lock, flags);
> +
> +        return true;
> +}
> +EXPORT_SYMBOL(recharge_percpu);
> +
> +#else /* CONFIG_MEMCG_KMEM */
> +
> +bool charge_percpu(void __percpu *ptr, bool charge)
> +{
> +        return true;
> +}
> +EXPORT_SYMBOL(charge_percpu);
> +
> +void uncharge_percpu(void __percpu *ptr)
> +{
> +}

I'm guessing this is supposed to be recharge_percpu() not
(un)charge_percpu().

> +EXPORT_SYMBOL(uncharge_percpu);
> +
> +#endif /* CONFIG_MEMCG_KMEM */
[...]

Thanks,
Dennis
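Dennis's suggestion is essentially that the all-or-nothing sequencing live inside percpu rather than in every caller. A minimal sketch of that idea, built only on the step-based recharge_percpu() from this patch; the wrapper name recharge_percpu_many() and its signature are hypothetical, and the MEMCG_KMEM_* step values are assumed to come from an earlier patch in this series:

/*
 * Hypothetical wrapper, not part of this patch: recharge a batch of percpu
 * allocations to the current task's memcg, all or nothing.
 */
static bool recharge_percpu_many(void __percpu **ptrs, int nr)
{
        int i;

        /* Reserve the charge on the new (current) memcg for every
         * allocation before any of the old charges are touched.
         */
        for (i = 0; i < nr; i++) {
                if (!recharge_percpu(ptrs[i], MEMCG_KMEM_PRE_CHARGE))
                        goto rollback;
        }

        /* Drop the old memcg's charges and commit the new ones. */
        for (i = 0; i < nr; i++) {
                recharge_percpu(ptrs[i], MEMCG_KMEM_UNCHARGE);
                recharge_percpu(ptrs[i], MEMCG_KMEM_POST_CHARGE);
        }
        return true;

rollback:
        /* Undo the pre-charges that already succeeded. */
        while (--i >= 0)
                recharge_percpu(ptrs[i], MEMCG_KMEM_CHARGE_ERR);
        return false;
}

Something along these lines would keep the step sequencing inside mm/percpu.c, so future users would not have to open code it, which is the concern raised above.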
On Thu, Jun 23, 2022 at 1:25 PM Dennis Zhou <dennisszhou@gmail.com> wrote:
>
> Hello,
>
> On Sun, Jun 19, 2022 at 03:50:29PM +0000, Yafang Shao wrote:
> > This patch introduces a helper to recharge the corresponding pages of
> > a given percpu address. It is similar to how a kmalloc'ed address is
> > recharged.
> [...]
> >  extern void free_percpu(void __percpu *__pdata);
> > +bool recharge_percpu(void __percpu *__pdata, int step);
>
> Nit: can you add extern to keep the file consistent.
>

Sure, I will do it.

> [...]
>
> I'm not really the biggest fan of this step stuff. I see why you're
> doing it because you want to do all or nothing recharging the percpu bpf
> maps. Is there a way to have percpu own this logic and attempt to do all
> or nothing instead? I realize bpf is likely the largest percpu user, but
> the recharge_percpu() api seems to be more generic than forcing
> potential other users in the future to open code it.
>

I agree that the recharge API may be used by other users, so it should
be a more generic helper. Maybe we can make percpu own this logic by
introducing a new value for the step parameter, for example:

    recharge_percpu(ptr, -1); /* -1 means the user doesn't need to care
                                 about the multiple steps. */

> [...]
> > +bool charge_percpu(void __percpu *ptr, bool charge)
> > +{
> > +        return true;
> > +}
> > +EXPORT_SYMBOL(charge_percpu);
> > +
> > +void uncharge_percpu(void __percpu *ptr)
> > +{
> > +}
>
> I'm guessing this is supposed to be recharge_percpu() not
> (un)charge_percpu().

Thanks for pointing out this bug. The lkp robot also reported it to me.
I will change it.
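As a reference for that change, a minimal sketch of the corrected !CONFIG_MEMCG_KMEM stub, matching the recharge_percpu() declaration added to include/linux/percpu.h; the exact fix is up to a later revision of the series:

#else /* CONFIG_MEMCG_KMEM */

/* Without kmem accounting there is nothing to move, so succeed trivially. */
bool recharge_percpu(void __percpu *ptr, int step)
{
        return true;
}
EXPORT_SYMBOL(recharge_percpu);

#endif /* CONFIG_MEMCG_KMEM */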
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index f1ec5ad1351c..e88429410179 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -128,6 +128,7 @@ extern void __init setup_per_cpu_areas(void);
 extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
 extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
 extern void free_percpu(void __percpu *__pdata);
+bool recharge_percpu(void __percpu *__pdata, int step);
 extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 
 #define alloc_percpu_gfp(type, gfp) \
diff --git a/mm/percpu.c b/mm/percpu.c
index 3633eeefaa0d..fd81f4d79f2f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2310,6 +2310,104 @@ void free_percpu(void __percpu *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
+#ifdef CONFIG_MEMCG_KMEM
+bool recharge_percpu(void __percpu *ptr, int step)
+{
+        int bit_off, off, bits, size, end;
+        struct obj_cgroup *objcg_old;
+        struct obj_cgroup *objcg_new;
+        struct pcpu_chunk *chunk;
+        unsigned long flags;
+        void *addr;
+
+        WARN_ON(!in_task());
+
+        if (!ptr)
+                return true;
+
+        addr = __pcpu_ptr_to_addr(ptr);
+        spin_lock_irqsave(&pcpu_lock, flags);
+        chunk = pcpu_chunk_addr_search(addr);
+        off = addr - chunk->base_addr;
+        objcg_old = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+        if (!objcg_old && step != MEMCG_KMEM_POST_CHARGE) {
+                spin_unlock_irqrestore(&pcpu_lock, flags);
+                return true;
+        }
+
+        bit_off = off / PCPU_MIN_ALLOC_SIZE;
+        /* find end index */
+        end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+                            bit_off + 1);
+        bits = end - bit_off;
+        size = bits * PCPU_MIN_ALLOC_SIZE;
+
+        switch (step) {
+        case MEMCG_KMEM_PRE_CHARGE:
+                objcg_new = get_obj_cgroup_from_current();
+                WARN_ON(!objcg_new);
+                if (obj_cgroup_charge(objcg_new, GFP_KERNEL,
+                                      size * num_possible_cpus())) {
+                        obj_cgroup_put(objcg_new);
+                        spin_unlock_irqrestore(&pcpu_lock, flags);
+                        return false;
+                }
+                break;
+        case MEMCG_KMEM_UNCHARGE:
+                obj_cgroup_uncharge(objcg_old, size * num_possible_cpus());
+                rcu_read_lock();
+                mod_memcg_state(obj_cgroup_memcg(objcg_old), MEMCG_PERCPU_B,
+                                -(size * num_possible_cpus()));
+                rcu_read_unlock();
+                chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
+                obj_cgroup_put(objcg_old);
+                break;
+        case MEMCG_KMEM_POST_CHARGE:
+                rcu_read_lock();
+                chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = obj_cgroup_from_current();
+                mod_memcg_state(mem_cgroup_from_task(current), MEMCG_PERCPU_B,
+                                (size * num_possible_cpus()));
+                rcu_read_unlock();
+                break;
+        case MEMCG_KMEM_CHARGE_ERR:
+                /*
+                 * If charging the new memcg failed in the pre-charge step,
+                 * for example, we pre-charged one memcg successfully but
+                 * failed to pre-charge the second one, we should uncharge
+                 * the first memcg.
+                 */
+                objcg_new = obj_cgroup_from_current();
+                obj_cgroup_uncharge(objcg_new, size * num_possible_cpus());
+                obj_cgroup_put(objcg_new);
+                rcu_read_lock();
+                mod_memcg_state(obj_cgroup_memcg(objcg_new), MEMCG_PERCPU_B,
+                                -(size * num_possible_cpus()));
+                rcu_read_unlock();
+
+                break;
+        }
+
+        spin_unlock_irqrestore(&pcpu_lock, flags);
+
+        return true;
+}
+EXPORT_SYMBOL(recharge_percpu);
+
+#else /* CONFIG_MEMCG_KMEM */
+
+bool charge_percpu(void __percpu *ptr, bool charge)
+{
+        return true;
+}
EXPORT_SYMBOL(charge_percpu);
+
+void uncharge_percpu(void __percpu *ptr)
+{
+}
+EXPORT_SYMBOL(uncharge_percpu);
+
+#endif /* CONFIG_MEMCG_KMEM */
+
 bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
 {
 #ifdef CONFIG_SMP
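The helper is meant to be driven step by step from its caller, such as the bpf map reuse path later in this series. An illustrative caller-side sketch, not taken from this series, of recharging a single percpu allocation to the current task's memcg; the function name example_recharge_one() is hypothetical and the MEMCG_KMEM_* step values are assumed to be defined by an earlier patch in the series:

static bool example_recharge_one(void __percpu *ptr)
{
        /* Reserve the charge against the new (current) memcg first. */
        if (!recharge_percpu(ptr, MEMCG_KMEM_PRE_CHARGE))
                return false;

        /* Release the old memcg's charge for this allocation... */
        recharge_percpu(ptr, MEMCG_KMEM_UNCHARGE);
        /* ...then record the new memcg in the chunk's obj_cgroups slot. */
        recharge_percpu(ptr, MEMCG_KMEM_POST_CHARGE);

        return true;
}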
This patch introduces a helper to recharge the corresponding pages of a
given percpu address. It is similar to how a kmalloc'ed address is
recharged.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/linux/percpu.h |  1 +
 mm/percpu.c            | 98 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)