diff mbox series

[RFC,3/4] rcu, slab: use a regular callback function for kvfree_rcu

Message ID 20250123-slub-tiny-kfree_rcu-v1-3-0e386ef1541a@suse.cz (mailing list archive)
State New
Headers show
Series slab, rcu: move and consolidate TINY_RCU kvfree_rcu() to SLAB | expand

Commit Message

Vlastimil Babka Jan. 23, 2025, 10:37 a.m. UTC
RCU has been special-casing callback function pointers that are integers
lower than 4096 as offsets of rcu_head for kvfree() instead. The tree
RCU implementation no longer does that as the batched kvfree_rcu() is
not a simple call_rcu(). The tiny RCU still does, and the plan is also
to make tree RCU use call_rcu() for SLUB_TINY configurations.

Instead of teaching tree RCU again to special case the offsets, let's
remove the special casing completely. Since there's no SLOB anymore, it
is possible to create a callback function that can take a pointer to a
middle of slab object with unknown offset and determine the object's
pointer before freeing it, so implement that as kvfree_rcu_cb().

Large kmalloc and vmalloc allocations are handled simply by aligning
down to page size. For that we retain the requirement that the offset is
smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely
and instead just opencode the condition in the BUILD_BUG_ON() check.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/rcupdate.h | 24 +++++++++---------------
 kernel/rcu/tiny.c        | 13 -------------
 mm/slab.h                |  2 ++
 mm/slab_common.c         |  5 +----
 mm/slub.c                | 42 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 54 insertions(+), 32 deletions(-)

Comments

Uladzislau Rezki Jan. 24, 2025, 12:47 p.m. UTC | #1
On Thu, Jan 23, 2025 at 11:37:20AM +0100, Vlastimil Babka wrote:
> RCU has been special-casing callback function pointers that are integers
> lower than 4096 as offsets of rcu_head for kvfree() instead. The tree
> RCU implementation no longer does that as the batched kvfree_rcu() is
> not a simple call_rcu(). The tiny RCU still does, and the plan is also
> to make tree RCU use call_rcu() for SLUB_TINY configurations.
> 
> Instead of teaching tree RCU again to special case the offsets, let's
> remove the special casing completely. Since there's no SLOB anymore, it
> is possible to create a callback function that can take a pointer to a
> middle of slab object with unknown offset and determine the object's
> pointer before freeing it, so implement that as kvfree_rcu_cb().
> 
> Large kmalloc and vmalloc allocations are handled simply by aligning
> down to page size. For that we retain the requirement that the offset is
> smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely
> and instead just opencode the condition in the BUILD_BUG_ON() check.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  include/linux/rcupdate.h | 24 +++++++++---------------
>  kernel/rcu/tiny.c        | 13 -------------
>  mm/slab.h                |  2 ++
>  mm/slab_common.c         |  5 +----
>  mm/slub.c                | 42 ++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 54 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 3f70d1c8144426f40553c8c589f07097ece8a706..7ff16a70ca1c0fb1012c4118388f60687c5e5b3f 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1025,12 +1025,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
>  #define RCU_POINTER_INITIALIZER(p, v) \
>  		.p = RCU_INITIALIZER(v)
>  
> -/*
> - * Does the specified offset indicate that the corresponding rcu_head
> - * structure can be handled by kvfree_rcu()?
> - */
> -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
> -
>  /**
>   * kfree_rcu() - kfree an object after a grace period.
>   * @ptr: pointer to kfree for double-argument invocations.
> @@ -1041,11 +1035,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
>   * when they are used in a kernel module, that module must invoke the
>   * high-latency rcu_barrier() function at module-unload time.
>   *
> - * The kfree_rcu() function handles this issue.  Rather than encoding a
> - * function address in the embedded rcu_head structure, kfree_rcu() instead
> - * encodes the offset of the rcu_head structure within the base structure.
> - * Because the functions are not allowed in the low-order 4096 bytes of
> - * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
> + * The kfree_rcu() function handles this issue. In order to have a universal
> + * callback function handling different offsets of rcu_head, the callback needs
> + * to determine the starting address of the freed object, which can be a large
> + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
> + * page boundary for those, only offsets up to 4095 bytes can be accommodated.
>   * If the offset is larger than 4095 bytes, a compile-time error will
>   * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
>   * either fall back to use of call_rcu() or rearrange the structure to
> @@ -1091,10 +1085,10 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr);
>  do {									\
>  	typeof (ptr) ___p = (ptr);					\
>  									\
> -	if (___p) {									\
> -		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));	\
> -		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));			\
> -	}										\
> +	if (___p) {							\
> +		BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096);	\
> +		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));	\
> +	}								\
>
Why removing the macro? At least __is_kvfree_rcu_offset() makes it
clear what and why + it has a nice comment what it is used for. 4096
looks like hard-coded value.

Or you do not want that someone else started to use that macro in
another places?

>  } while (0)
>  
>  #define kvfree_rcu_arg_1(ptr)					\
> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
> index 0ec27093d0e14a4b1060ea08932c4ac13f9b0f26..77e0db0221364376a99ebeb17485650879385a6e 100644
> --- a/kernel/rcu/tiny.c
> +++ b/kernel/rcu/tiny.c
> @@ -88,12 +88,6 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
>  	unsigned long offset = (unsigned long)head->func;
>  
>  	rcu_lock_acquire(&rcu_callback_map);
> -	if (__is_kvfree_rcu_offset(offset)) {
> -		trace_rcu_invoke_kvfree_callback("", head, offset);
> -		kvfree((void *)head - offset);
> -		rcu_lock_release(&rcu_callback_map);
> -		return true;
> -	}
>  
>  	trace_rcu_invoke_callback("", head);
>  	f = head->func;
> @@ -159,10 +153,6 @@ void synchronize_rcu(void)
>  }
>  EXPORT_SYMBOL_GPL(synchronize_rcu);
>  
> -static void tiny_rcu_leak_callback(struct rcu_head *rhp)
> -{
> -}
> -
>  /*
>   * Post an RCU callback to be invoked after the end of an RCU grace
>   * period.  But since we have but one CPU, that would be after any
> @@ -178,9 +168,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>  			pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
>  			mem_dump_obj(head);
>  		}
> -
> -		if (!__is_kvfree_rcu_offset((unsigned long)head->func))
> -			WRITE_ONCE(head->func, tiny_rcu_leak_callback);
>  		return;
>  	}
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index e9fd9bf0bfa65b343a4ae0ecd5b4c2a325b04883..2f01c7317988ce036f0b22807403226a59f0f708 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -604,6 +604,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
>  			    void **p, int objects, struct slabobj_ext *obj_exts);
>  #endif
>  
> +void kvfree_rcu_cb(struct rcu_head *head);
> +
>  size_t __ksize(const void *objp);
>  
>  static inline size_t slab_ksize(const struct kmem_cache *s)
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 330cdd8ebc5380090ee784c58e8ca1d1a52b3758..f13d2c901daf1419993620459fbd5845eecb85f1 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1532,9 +1532,6 @@ kvfree_rcu_list(struct rcu_head *head)
>  		rcu_lock_acquire(&rcu_callback_map);
>  		trace_rcu_invoke_kvfree_callback("slab", head, offset);
>  
> -		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
> -			kvfree(ptr);
> -
This is not correct unless i miss something. Why do you remove this?

>  
> diff --git a/mm/slub.c b/mm/slub.c
> index c2151c9fee228d121a9cbcc220c3ae054769dacf..651381bf05566e88de8493e0550f121d23b757a1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -19,6 +19,7 @@
>  #include <linux/bitops.h>
>  #include <linux/slab.h>
>  #include "slab.h"
> +#include <linux/vmalloc.h>
>  #include <linux/proc_fs.h>
>  #include <linux/seq_file.h>
>  #include <linux/kasan.h>
> @@ -4732,6 +4733,47 @@ static void free_large_kmalloc(struct folio *folio, void *object)
>  	folio_put(folio);
>  }
>  
> +void kvfree_rcu_cb(struct rcu_head *head)
> +{
> +	void *obj = head;
> +	struct folio *folio;
> +	struct slab *slab;
> +	struct kmem_cache *s;
> +	void *slab_addr;
> +
> +	if (unlikely(is_vmalloc_addr(obj))) {
> +		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
> +		vfree(obj);
> +		return;
> +	}
> +
> +	folio = virt_to_folio(obj);
> +	if (unlikely(!folio_test_slab(folio))) {
> +		/*
> +		 * rcu_head offset can be only less than page size so no need to
> +		 * consider folio order
> +		 */
> +		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
> +		free_large_kmalloc(folio, obj);
> +		return;
> +	}
> +
> +	slab = folio_slab(folio);
> +	s = slab->slab_cache;
> +	slab_addr = folio_address(folio);
> +
> +	if (is_kfence_address(obj)) {
> +		obj = kfence_object_start(obj);
> +	} else {
> +		unsigned int idx = __obj_to_index(s, slab_addr, obj);
> +
> +		obj = slab_addr + s->size * idx;
> +		obj = fixup_red_left(s, obj);
> +	}
> +
> +	slab_free(s, slab, obj, _RET_IP_);
> +}
> +
Tiny computer case. Just small nit, maybe remove unlikely() but you decide :)

--
Uladzislau Rezki
Vlastimil Babka Jan. 24, 2025, 2:16 p.m. UTC | #2
On 1/24/25 13:47, Uladzislau Rezki wrote:
> On Thu, Jan 23, 2025 at 11:37:20AM +0100, Vlastimil Babka wrote:
>> RCU has been special-casing callback function pointers that are integers
>> lower than 4096 as offsets of rcu_head for kvfree() instead. The tree
>> RCU implementation no longer does that as the batched kvfree_rcu() is
>> not a simple call_rcu(). The tiny RCU still does, and the plan is also
>> to make tree RCU use call_rcu() for SLUB_TINY configurations.
>> 
>> Instead of teaching tree RCU again to special case the offsets, let's
>> remove the special casing completely. Since there's no SLOB anymore, it
>> is possible to create a callback function that can take a pointer to a
>> middle of slab object with unknown offset and determine the object's
>> pointer before freeing it, so implement that as kvfree_rcu_cb().
>> 
>> Large kmalloc and vmalloc allocations are handled simply by aligning
>> down to page size. For that we retain the requirement that the offset is
>> smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely
>> and instead just opencode the condition in the BUILD_BUG_ON() check.
>> 
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
>>  include/linux/rcupdate.h | 24 +++++++++---------------
>>  kernel/rcu/tiny.c        | 13 -------------
>>  mm/slab.h                |  2 ++
>>  mm/slab_common.c         |  5 +----
>>  mm/slub.c                | 42 ++++++++++++++++++++++++++++++++++++++++++
>>  5 files changed, 54 insertions(+), 32 deletions(-)
>> 
>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>> index 3f70d1c8144426f40553c8c589f07097ece8a706..7ff16a70ca1c0fb1012c4118388f60687c5e5b3f 100644
>> --- a/include/linux/rcupdate.h
>> +++ b/include/linux/rcupdate.h
>> @@ -1025,12 +1025,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
>>  #define RCU_POINTER_INITIALIZER(p, v) \
>>  		.p = RCU_INITIALIZER(v)
>>  
>> -/*
>> - * Does the specified offset indicate that the corresponding rcu_head
>> - * structure can be handled by kvfree_rcu()?
>> - */
>> -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
>> -
>>  /**
>>   * kfree_rcu() - kfree an object after a grace period.
>>   * @ptr: pointer to kfree for double-argument invocations.
>> @@ -1041,11 +1035,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
>>   * when they are used in a kernel module, that module must invoke the
>>   * high-latency rcu_barrier() function at module-unload time.
>>   *
>> - * The kfree_rcu() function handles this issue.  Rather than encoding a
>> - * function address in the embedded rcu_head structure, kfree_rcu() instead
>> - * encodes the offset of the rcu_head structure within the base structure.
>> - * Because the functions are not allowed in the low-order 4096 bytes of
>> - * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
>> + * The kfree_rcu() function handles this issue. In order to have a universal
>> + * callback function handling different offsets of rcu_head, the callback needs
>> + * to determine the starting address of the freed object, which can be a large
>> + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
>> + * page boundary for those, only offsets up to 4095 bytes can be accommodated.
>>   * If the offset is larger than 4095 bytes, a compile-time error will
>>   * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
>>   * either fall back to use of call_rcu() or rearrange the structure to
>> @@ -1091,10 +1085,10 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr);
>>  do {									\
>>  	typeof (ptr) ___p = (ptr);					\
>>  									\
>> -	if (___p) {									\
>> -		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));	\
>> -		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));			\
>> -	}										\
>> +	if (___p) {							\
>> +		BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096);	\
>> +		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));	\
>> +	}								\
>>
> Why removing the macro? At least __is_kvfree_rcu_offset() makes it
> clear what and why + it has a nice comment what it is used for. 4096
> looks like hard-coded value.

Hmm but it's ultimately shorter. We could add a short comment to
kvfree_rcu_arg_2() referring to the longer explanation in the kfree_rcu()
comment?

> Or you do not want that someone else started to use that macro in
> another places?

And also that, since this has to be exposed in a "public" header.

>> diff --git a/mm/slab.h b/mm/slab.h
>> index e9fd9bf0bfa65b343a4ae0ecd5b4c2a325b04883..2f01c7317988ce036f0b22807403226a59f0f708 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -604,6 +604,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
>>  			    void **p, int objects, struct slabobj_ext *obj_exts);
>>  #endif
>>  
>> +void kvfree_rcu_cb(struct rcu_head *head);
>> +
>>  size_t __ksize(const void *objp);
>>  
>>  static inline size_t slab_ksize(const struct kmem_cache *s)
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 330cdd8ebc5380090ee784c58e8ca1d1a52b3758..f13d2c901daf1419993620459fbd5845eecb85f1 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -1532,9 +1532,6 @@ kvfree_rcu_list(struct rcu_head *head)
>>  		rcu_lock_acquire(&rcu_callback_map);
>>  		trace_rcu_invoke_kvfree_callback("slab", head, offset);
>>  
>> -		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
>> -			kvfree(ptr);
>> -
> This is not correct unless i miss something. Why do you remove this?

Oops, meant to remove just the warn check, and unconditionally call
kvfree(), thanks for noticing :)

The warning could really only trigger due to a use-after-free corruption of
the ptr pointer, because the BUILD_BUG_ON() would catch misuse with a too
large offset, so we don't need to keep it.

>> +void kvfree_rcu_cb(struct rcu_head *head)
>> +{
>> +	void *obj = head;
>> +	struct folio *folio;
>> +	struct slab *slab;
>> +	struct kmem_cache *s;
>> +	void *slab_addr;
>> +
>> +	if (unlikely(is_vmalloc_addr(obj))) {
>> +		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
>> +		vfree(obj);
>> +		return;
>> +	}
>> +
>> +	folio = virt_to_folio(obj);
>> +	if (unlikely(!folio_test_slab(folio))) {
>> +		/*
>> +		 * rcu_head offset can be only less than page size so no need to
>> +		 * consider folio order
>> +		 */
>> +		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
>> +		free_large_kmalloc(folio, obj);
>> +		return;
>> +	}
>> +
>> +	slab = folio_slab(folio);
>> +	s = slab->slab_cache;
>> +	slab_addr = folio_address(folio);
>> +
>> +	if (is_kfence_address(obj)) {
>> +		obj = kfence_object_start(obj);
>> +	} else {
>> +		unsigned int idx = __obj_to_index(s, slab_addr, obj);
>> +
>> +		obj = slab_addr + s->size * idx;
>> +		obj = fixup_red_left(s, obj);
>> +	}
>> +
>> +	slab_free(s, slab, obj, _RET_IP_);
>> +}
>> +
> Tiny computer case. Just small nit, maybe remove unlikely() but you decide :)

Force of habit :)

> --
> Uladzislau Rezki
Uladzislau Rezki Jan. 24, 2025, 2:19 p.m. UTC | #3
On Fri, Jan 24, 2025 at 03:16:05PM +0100, Vlastimil Babka wrote:
> On 1/24/25 13:47, Uladzislau Rezki wrote:
> > On Thu, Jan 23, 2025 at 11:37:20AM +0100, Vlastimil Babka wrote:
> >> RCU has been special-casing callback function pointers that are integers
> >> lower than 4096 as offsets of rcu_head for kvfree() instead. The tree
> >> RCU implementation no longer does that as the batched kvfree_rcu() is
> >> not a simple call_rcu(). The tiny RCU still does, and the plan is also
> >> to make tree RCU use call_rcu() for SLUB_TINY configurations.
> >> 
> >> Instead of teaching tree RCU again to special case the offsets, let's
> >> remove the special casing completely. Since there's no SLOB anymore, it
> >> is possible to create a callback function that can take a pointer to a
> >> middle of slab object with unknown offset and determine the object's
> >> pointer before freeing it, so implement that as kvfree_rcu_cb().
> >> 
> >> Large kmalloc and vmalloc allocations are handled simply by aligning
> >> down to page size. For that we retain the requirement that the offset is
> >> smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely
> >> and instead just opencode the condition in the BUILD_BUG_ON() check.
> >> 
> >> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >> ---
> >>  include/linux/rcupdate.h | 24 +++++++++---------------
> >>  kernel/rcu/tiny.c        | 13 -------------
> >>  mm/slab.h                |  2 ++
> >>  mm/slab_common.c         |  5 +----
> >>  mm/slub.c                | 42 ++++++++++++++++++++++++++++++++++++++++++
> >>  5 files changed, 54 insertions(+), 32 deletions(-)
> >> 
> >> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> >> index 3f70d1c8144426f40553c8c589f07097ece8a706..7ff16a70ca1c0fb1012c4118388f60687c5e5b3f 100644
> >> --- a/include/linux/rcupdate.h
> >> +++ b/include/linux/rcupdate.h
> >> @@ -1025,12 +1025,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> >>  #define RCU_POINTER_INITIALIZER(p, v) \
> >>  		.p = RCU_INITIALIZER(v)
> >>  
> >> -/*
> >> - * Does the specified offset indicate that the corresponding rcu_head
> >> - * structure can be handled by kvfree_rcu()?
> >> - */
> >> -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
> >> -
> >>  /**
> >>   * kfree_rcu() - kfree an object after a grace period.
> >>   * @ptr: pointer to kfree for double-argument invocations.
> >> @@ -1041,11 +1035,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
> >>   * when they are used in a kernel module, that module must invoke the
> >>   * high-latency rcu_barrier() function at module-unload time.
> >>   *
> >> - * The kfree_rcu() function handles this issue.  Rather than encoding a
> >> - * function address in the embedded rcu_head structure, kfree_rcu() instead
> >> - * encodes the offset of the rcu_head structure within the base structure.
> >> - * Because the functions are not allowed in the low-order 4096 bytes of
> >> - * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
> >> + * The kfree_rcu() function handles this issue. In order to have a universal
> >> + * callback function handling different offsets of rcu_head, the callback needs
> >> + * to determine the starting address of the freed object, which can be a large
> >> + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
> >> + * page boundary for those, only offsets up to 4095 bytes can be accommodated.
> >>   * If the offset is larger than 4095 bytes, a compile-time error will
> >>   * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
> >>   * either fall back to use of call_rcu() or rearrange the structure to
> >> @@ -1091,10 +1085,10 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> >>  do {									\
> >>  	typeof (ptr) ___p = (ptr);					\
> >>  									\
> >> -	if (___p) {									\
> >> -		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));	\
> >> -		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));			\
> >> -	}										\
> >> +	if (___p) {							\
> >> +		BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096);	\
> >> +		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));	\
> >> +	}								\
> >>
> > Why removing the macro? At least __is_kvfree_rcu_offset() makes it
> > clear what and why + it has a nice comment what it is used for. 4096
> > looks like hard-coded value.
> 
> Hmm but it's ultimately shorter. We could add a short comment to
> kvfree_rcu_arg_2() referring to the longer explanation in the kfree_rcu()
> comment?
> 
Sounds good to place or keep the comment.

> > Or you do not want that someone else started to use that macro in
> > another places?
> 
> And also that, since this has to be exposed in a "public" header.
> 
> >> diff --git a/mm/slab.h b/mm/slab.h
> >> index e9fd9bf0bfa65b343a4ae0ecd5b4c2a325b04883..2f01c7317988ce036f0b22807403226a59f0f708 100644
> >> --- a/mm/slab.h
> >> +++ b/mm/slab.h
> >> @@ -604,6 +604,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
> >>  			    void **p, int objects, struct slabobj_ext *obj_exts);
> >>  #endif
> >>  
> >> +void kvfree_rcu_cb(struct rcu_head *head);
> >> +
> >>  size_t __ksize(const void *objp);
> >>  
> >>  static inline size_t slab_ksize(const struct kmem_cache *s)
> >> diff --git a/mm/slab_common.c b/mm/slab_common.c
> >> index 330cdd8ebc5380090ee784c58e8ca1d1a52b3758..f13d2c901daf1419993620459fbd5845eecb85f1 100644
> >> --- a/mm/slab_common.c
> >> +++ b/mm/slab_common.c
> >> @@ -1532,9 +1532,6 @@ kvfree_rcu_list(struct rcu_head *head)
> >>  		rcu_lock_acquire(&rcu_callback_map);
> >>  		trace_rcu_invoke_kvfree_callback("slab", head, offset);
> >>  
> >> -		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
> >> -			kvfree(ptr);
> >> -
> > This is not correct unless i miss something. Why do you remove this?
> 
> Oops, meant to remove just the warn check, and unconditionally call
> kvfree(), thanks for noticing :)
> 
> The warning could really only trigger due to a use-after-free corruption of
> the ptr pointer, because the BUILD_BUG_ON() would catch misuse with a too
> large offset, so we don't need to keep it.
> 
> >> +void kvfree_rcu_cb(struct rcu_head *head)
> >> +{
> >> +	void *obj = head;
> >> +	struct folio *folio;
> >> +	struct slab *slab;
> >> +	struct kmem_cache *s;
> >> +	void *slab_addr;
> >> +
> >> +	if (unlikely(is_vmalloc_addr(obj))) {
> >> +		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
> >> +		vfree(obj);
> >> +		return;
> >> +	}
> >> +
> >> +	folio = virt_to_folio(obj);
> >> +	if (unlikely(!folio_test_slab(folio))) {
> >> +		/*
> >> +		 * rcu_head offset can be only less than page size so no need to
> >> +		 * consider folio order
> >> +		 */
> >> +		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
> >> +		free_large_kmalloc(folio, obj);
> >> +		return;
> >> +	}
> >> +
> >> +	slab = folio_slab(folio);
> >> +	s = slab->slab_cache;
> >> +	slab_addr = folio_address(folio);
> >> +
> >> +	if (is_kfence_address(obj)) {
> >> +		obj = kfence_object_start(obj);
> >> +	} else {
> >> +		unsigned int idx = __obj_to_index(s, slab_addr, obj);
> >> +
> >> +		obj = slab_addr + s->size * idx;
> >> +		obj = fixup_red_left(s, obj);
> >> +	}
> >> +
> >> +	slab_free(s, slab, obj, _RET_IP_);
> >> +}
> >> +
> > Tiny computer case. Just small nit, maybe remove unlikely() but you decide :)
> 
> Force of habit :)
> 
:)

--
Uladzislau Rezki
diff mbox series

Patch

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3f70d1c8144426f40553c8c589f07097ece8a706..7ff16a70ca1c0fb1012c4118388f60687c5e5b3f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1025,12 +1025,6 @@  static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define RCU_POINTER_INITIALIZER(p, v) \
 		.p = RCU_INITIALIZER(v)
 
-/*
- * Does the specified offset indicate that the corresponding rcu_head
- * structure can be handled by kvfree_rcu()?
- */
-#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
-
 /**
  * kfree_rcu() - kfree an object after a grace period.
  * @ptr: pointer to kfree for double-argument invocations.
@@ -1041,11 +1035,11 @@  static inline notrace void rcu_read_unlock_sched_notrace(void)
  * when they are used in a kernel module, that module must invoke the
  * high-latency rcu_barrier() function at module-unload time.
  *
- * The kfree_rcu() function handles this issue.  Rather than encoding a
- * function address in the embedded rcu_head structure, kfree_rcu() instead
- * encodes the offset of the rcu_head structure within the base structure.
- * Because the functions are not allowed in the low-order 4096 bytes of
- * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
+ * The kfree_rcu() function handles this issue. In order to have a universal
+ * callback function handling different offsets of rcu_head, the callback needs
+ * to determine the starting address of the freed object, which can be a large
+ * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
+ * page boundary for those, only offsets up to 4095 bytes can be accommodated.
  * If the offset is larger than 4095 bytes, a compile-time error will
  * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
  * either fall back to use of call_rcu() or rearrange the structure to
@@ -1091,10 +1085,10 @@  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 do {									\
 	typeof (ptr) ___p = (ptr);					\
 									\
-	if (___p) {									\
-		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));	\
-		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));			\
-	}										\
+	if (___p) {							\
+		BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096);	\
+		kvfree_call_rcu(&((___p)->rhf), (void *) (___p));	\
+	}								\
 } while (0)
 
 #define kvfree_rcu_arg_1(ptr)					\
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0ec27093d0e14a4b1060ea08932c4ac13f9b0f26..77e0db0221364376a99ebeb17485650879385a6e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -88,12 +88,6 @@  static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 	unsigned long offset = (unsigned long)head->func;
 
 	rcu_lock_acquire(&rcu_callback_map);
-	if (__is_kvfree_rcu_offset(offset)) {
-		trace_rcu_invoke_kvfree_callback("", head, offset);
-		kvfree((void *)head - offset);
-		rcu_lock_release(&rcu_callback_map);
-		return true;
-	}
 
 	trace_rcu_invoke_callback("", head);
 	f = head->func;
@@ -159,10 +153,6 @@  void synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
-static void tiny_rcu_leak_callback(struct rcu_head *rhp)
-{
-}
-
 /*
  * Post an RCU callback to be invoked after the end of an RCU grace
  * period.  But since we have but one CPU, that would be after any
@@ -178,9 +168,6 @@  void call_rcu(struct rcu_head *head, rcu_callback_t func)
 			pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
 			mem_dump_obj(head);
 		}
-
-		if (!__is_kvfree_rcu_offset((unsigned long)head->func))
-			WRITE_ONCE(head->func, tiny_rcu_leak_callback);
 		return;
 	}
 
diff --git a/mm/slab.h b/mm/slab.h
index e9fd9bf0bfa65b343a4ae0ecd5b4c2a325b04883..2f01c7317988ce036f0b22807403226a59f0f708 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -604,6 +604,8 @@  void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
 			    void **p, int objects, struct slabobj_ext *obj_exts);
 #endif
 
+void kvfree_rcu_cb(struct rcu_head *head);
+
 size_t __ksize(const void *objp);
 
 static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 330cdd8ebc5380090ee784c58e8ca1d1a52b3758..f13d2c901daf1419993620459fbd5845eecb85f1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1532,9 +1532,6 @@  kvfree_rcu_list(struct rcu_head *head)
 		rcu_lock_acquire(&rcu_callback_map);
 		trace_rcu_invoke_kvfree_callback("slab", head, offset);
 
-		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
-			kvfree(ptr);
-
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
 	}
@@ -1867,7 +1864,7 @@  void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 {
 	if (head) {
 		kasan_record_aux_stack_noalloc(ptr);
-		call_rcu(head, (rcu_callback_t) ((void *) head - ptr));
+		call_rcu(head, kvfree_rcu_cb);
 		return;
 	}
 
diff --git a/mm/slub.c b/mm/slub.c
index c2151c9fee228d121a9cbcc220c3ae054769dacf..651381bf05566e88de8493e0550f121d23b757a1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,6 +19,7 @@ 
 #include <linux/bitops.h>
 #include <linux/slab.h>
 #include "slab.h"
+#include <linux/vmalloc.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/kasan.h>
@@ -4732,6 +4733,47 @@  static void free_large_kmalloc(struct folio *folio, void *object)
 	folio_put(folio);
 }
 
+void kvfree_rcu_cb(struct rcu_head *head)
+{
+	void *obj = head;
+	struct folio *folio;
+	struct slab *slab;
+	struct kmem_cache *s;
+	void *slab_addr;
+
+	if (unlikely(is_vmalloc_addr(obj))) {
+		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+		vfree(obj);
+		return;
+	}
+
+	folio = virt_to_folio(obj);
+	if (unlikely(!folio_test_slab(folio))) {
+		/*
+		 * rcu_head offset can be only less than page size so no need to
+		 * consider folio order
+		 */
+		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+		free_large_kmalloc(folio, obj);
+		return;
+	}
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	slab_addr = folio_address(folio);
+
+	if (is_kfence_address(obj)) {
+		obj = kfence_object_start(obj);
+	} else {
+		unsigned int idx = __obj_to_index(s, slab_addr, obj);
+
+		obj = slab_addr + s->size * idx;
+		obj = fixup_red_left(s, obj);
+	}
+
+	slab_free(s, slab, obj, _RET_IP_);
+}
+
 /**
  * kfree - free previously allocated memory
  * @object: pointer returned by kmalloc() or kmem_cache_alloc()