[2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator

Message ID 20250126070206.381302-2-surenb@google.com (mailing list archive)
State New
Series [1/3] mm: avoid extra mem_alloc_profiling_enabled() checks

Commit Message

Suren Baghdasaryan Jan. 26, 2025, 7:02 a.m. UTC
When a sizable code section is protected by a disabled static key, that
code still gets pulled into the instruction cache even though it is
never executed, wasting cache space and increasing cache misses. This
can be remedied by moving such code into a separate uninlined function.
The improvement, however, comes at the expense of the configuration in
which the static key is enabled, since there is now an additional
function call.
The default state of the mem_alloc_profiling_key is controlled by
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. Apply this optimization
only if CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, improving the
performance of the default configuration. When
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y, the functions are
inlined and performance does not change.
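
In code, the resulting pattern is roughly the following (a simplified
sketch of the shape of the change, not the exact hunks below):

	/* Slow path; noinline when the key defaults to off, so its body
	 * stays out of the callers' instruction stream.
	 */
	static inline_if_mem_alloc_prof void
	__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object,
					gfp_t flags);

	/* Fast path; with the key disabled, the check below compiles down
	 * to a patched-in nop followed by a return.
	 */
	static inline void
	alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object,
				      gfp_t flags)
	{
		if (need_slab_obj_ext())
			__alloc_tagging_slab_alloc_hook(s, object, flags);
	}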

On a Pixel6 phone, slab allocation profiling overhead measured with
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and profiling disabled:

             baseline             modified
Big          3.31%                0.17%
Medium       3.79%                0.57%
Little       6.68%                1.28%

When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and memory allocation
profiling gets enabled, the difference in performance before and after
this change stays within noise levels.

On x86 this patch does not make a noticeable difference because the
overhead with mem_alloc_profiling_key disabled is much lower (under 1%)
to begin with, so any improvement is less visible and hard to
distinguish from the noise.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/alloc_tag.h |  6 +++++
 mm/slub.c                 | 46 ++++++++++++++++++++++++---------------
 2 files changed, 34 insertions(+), 18 deletions(-)

Comments

Vlastimil Babka Jan. 26, 2025, 4:47 p.m. UTC | #1
On 1/26/25 08:02, Suren Baghdasaryan wrote:
> When a sizable code section is protected by a disabled static key, that
> code gets into the instruction cache even though it's not executed and
> consumes the cache, increasing cache misses. This can be remedied by
> moving such code into a separate uninlined function. The improvement

Weird, I thought the static_branch_likely/unlikely/maybe was already
handling this by the unlikely case being a jump to a block away from the
fast-path stream of instructions, thus making it less likely to get cached.
AFAIU even plain likely()/unlikely() should do this, along with branch
prediction hints.
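
I.e. for a key that defaults to off, the pattern is roughly (with
slow_path() standing in for the real hook):

	DEFINE_STATIC_KEY_FALSE(mem_alloc_profiling_key);

	if (static_branch_unlikely(&mem_alloc_profiling_key))
		slow_path();

where the branch site is a nop until the key gets enabled, and
slow_path() is supposed to be laid out away from the hot instruction
stream.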

> however comes at the expense of the configuration when this static key
> gets enabled since there is now an additional function call.
> The default state of the mem_alloc_profiling_key is controlled by
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. Apply this optimization
> only if CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, improving the
> performance of the default configuration.
> When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y the functions
> are inlined and performance does not change.
> 
> On a Pixel6 phone, slab allocation profiling overhead measured with
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and profiling disabled:
> 
>              baseline             modified
> Big          3.31%                0.17%
> Medium       3.79%                0.57%
> Little       6.68%                1.28%

What does big/medium/little mean here? But indeed that's not a nice
overhead for a disabled static key.

> When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and memory allocation
> profiling gets enabled, the difference in performance before and after
> this change stays within noise levels.
> 
> On x86 this patch does not make noticeable difference because the overhead
> with mem_alloc_profiling_key disabled is much lower (under 1%) to start
> with, so any improvement is less visible and hard to distinguish from the
> noise.

That would be in line with my understanding above. Does the arm64 compiler
not do it as well as x86 (could maybe be found out by disassembling), or does
the Pixel6 CPU somehow cache these out-of-line blocks more aggressively, such
that only a function call stops it?

> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Kinda sad that despite the static key we also have to control a lot via
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.

> ---
>  include/linux/alloc_tag.h |  6 +++++
>  mm/slub.c                 | 46 ++++++++++++++++++++++++---------------
>  2 files changed, 34 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> index a946e0203e6d..c5de2a0c1780 100644
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -116,6 +116,12 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
>  DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
>  			mem_alloc_profiling_key);
>  
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +#define inline_if_mem_alloc_prof	inline
> +#else
> +#define inline_if_mem_alloc_prof	noinline
> +#endif
> +
>  static inline bool mem_alloc_profiling_enabled(void)
>  {
>  	return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> diff --git a/mm/slub.c b/mm/slub.c
> index 996691c137eb..3107d43dfddc 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2000,7 +2000,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
>  	return 0;
>  }
>  
> -static inline void free_slab_obj_exts(struct slab *slab)
> +static inline_if_mem_alloc_prof void free_slab_obj_exts(struct slab *slab)
>  {
>  	struct slabobj_ext *obj_exts;
>  
> @@ -2077,33 +2077,35 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
>  	return slab_obj_exts(slab) + obj_to_index(s, slab, p);
>  }
>  
> -static inline void
> -alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> +static inline_if_mem_alloc_prof void
> +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
>  {
> -	if (need_slab_obj_ext()) {
> -		struct slabobj_ext *obj_exts;
> +	struct slabobj_ext *obj_exts;
>  
> -		obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> -		/*
> -		 * Currently obj_exts is used only for allocation profiling.
> -		 * If other users appear then mem_alloc_profiling_enabled()
> -		 * check should be added before alloc_tag_add().
> -		 */
> -		if (likely(obj_exts))
> -			alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
> -	}
> +	obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> +	/*
> +	 * Currently obj_exts is used only for allocation profiling.
> +	 * If other users appear then mem_alloc_profiling_enabled()
> +	 * check should be added before alloc_tag_add().
> +	 */
> +	if (likely(obj_exts))
> +		alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
>  }
>  
>  static inline void
> -alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> +{
> +	if (need_slab_obj_ext())
> +		__alloc_tagging_slab_alloc_hook(s, object, flags);
> +}
> +
> +static inline_if_mem_alloc_prof void
> +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
>  			     int objects)
>  {
>  	struct slabobj_ext *obj_exts;
>  	int i;
>  
> -	if (!mem_alloc_profiling_enabled())
> -		return;
> -
>  	/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
>  	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
>  		return;
> @@ -2119,6 +2121,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
>  	}
>  }
>  
> +static inline void
> +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> +			     int objects)
> +{
> +	if (mem_alloc_profiling_enabled())
> +		__alloc_tagging_slab_free_hook(s, slab, p, objects);
> +}
> +
>  #else /* CONFIG_MEM_ALLOC_PROFILING */
>  
>  static inline void
Suren Baghdasaryan Jan. 27, 2025, 7:38 p.m. UTC | #2
On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > When a sizable code section is protected by a disabled static key, that
> > code gets into the instruction cache even though it's not executed and
> > consumes the cache, increasing cache misses. This can be remedied by
> > moving such code into a separate uninlined function. The improvement

Sorry, I missed adding Steven Rostedt to the CC list; his advice was
instrumental in finding the way to optimize the static key performance
in this patch. Added now.

>
> Weird, I thought the static_branch_likely/unlikely/maybe was already
> handling this by the unlikely case being a jump to a block away from the
> fast-path stream of instructions, thus making it less likely to get cached.
> AFAIU even plain likely()/unlikely() should do this, along with branch
> prediction hints.

This was indeed an unexpected overhead when I measured it on Android.
Cache pollution was my understanding of the cause of this high
overhead, after Steven told me to try uninlining the protected code. He
has done something similar in the tracing subsystem. But maybe I
misunderstood the real reason. Steven, could you please verify whether
my understanding of the cause of the high overhead is correct here?
Maybe there is something else at play that I missed?

>
> > however comes at the expense of the configuration when this static key
> > gets enabled since there is now an additional function call.
> > The default state of the mem_alloc_profiling_key is controlled by
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. Apply this optimization
> > only if CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, improving the
> > performance of the default configuration.
> > When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y the functions
> > are inlined and performance does not change.
> >
> > On a Pixel6 phone, slab allocation profiling overhead measured with
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and profiling disabled:
> >
> >              baseline             modified
> > Big          3.31%                0.17%
> > Medium       3.79%                0.57%
> > Little       6.68%                1.28%
>
> What does big/medium/little mean here? But indeed not nice overhead for
> disabled static key.

Big/Medium/Little refers to the CPU core size on my ARM64-based Android phone.

>
> > When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and memory allocation
> > profiling gets enabled, the difference in performance before and after
> > this change stays within noise levels.
> >
> > On x86 this patch does not make noticeable difference because the overhead
> > with mem_alloc_profiling_key disabled is much lower (under 1%) to start
> > with, so any improvement is less visible and hard to distinguish from the
> > noise.
>
> That would be in line with my understanding above. Does the arm64 compiler
> not do it as well as x86 (could be maybe found out by disassembling) or the
> Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> a function call stops it?

I'll disassemble the code and will see what it looks like.

>
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> Kinda sad that despite the static key we have to control a lot by the
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.

I agree. If there is a better way to fix this regression I'm open to
changes. Let's wait for Steven to confirm my understanding before
proceeding.
Thanks,
Suren.

>
> > ---
> >  include/linux/alloc_tag.h |  6 +++++
> >  mm/slub.c                 | 46 ++++++++++++++++++++++++---------------
> >  2 files changed, 34 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> > index a946e0203e6d..c5de2a0c1780 100644
> > --- a/include/linux/alloc_tag.h
> > +++ b/include/linux/alloc_tag.h
> > @@ -116,6 +116,12 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
> >  DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> >                       mem_alloc_profiling_key);
> >
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > +#define inline_if_mem_alloc_prof     inline
> > +#else
> > +#define inline_if_mem_alloc_prof     noinline
> > +#endif
> > +
> >  static inline bool mem_alloc_profiling_enabled(void)
> >  {
> >       return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 996691c137eb..3107d43dfddc 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -2000,7 +2000,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
> >       return 0;
> >  }
> >
> > -static inline void free_slab_obj_exts(struct slab *slab)
> > +static inline_if_mem_alloc_prof void free_slab_obj_exts(struct slab *slab)
> >  {
> >       struct slabobj_ext *obj_exts;
> >
> > @@ -2077,33 +2077,35 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
> >       return slab_obj_exts(slab) + obj_to_index(s, slab, p);
> >  }
> >
> > -static inline void
> > -alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> > +static inline_if_mem_alloc_prof void
> > +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> >  {
> > -     if (need_slab_obj_ext()) {
> > -             struct slabobj_ext *obj_exts;
> > +     struct slabobj_ext *obj_exts;
> >
> > -             obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> > -             /*
> > -              * Currently obj_exts is used only for allocation profiling.
> > -              * If other users appear then mem_alloc_profiling_enabled()
> > -              * check should be added before alloc_tag_add().
> > -              */
> > -             if (likely(obj_exts))
> > -                     alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
> > -     }
> > +     obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> > +     /*
> > +      * Currently obj_exts is used only for allocation profiling.
> > +      * If other users appear then mem_alloc_profiling_enabled()
> > +      * check should be added before alloc_tag_add().
> > +      */
> > +     if (likely(obj_exts))
> > +             alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
> >  }
> >
> >  static inline void
> > -alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> > +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> > +{
> > +     if (need_slab_obj_ext())
> > +             __alloc_tagging_slab_alloc_hook(s, object, flags);
> > +}
> > +
> > +static inline_if_mem_alloc_prof void
> > +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> >                            int objects)
> >  {
> >       struct slabobj_ext *obj_exts;
> >       int i;
> >
> > -     if (!mem_alloc_profiling_enabled())
> > -             return;
> > -
> >       /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
> >       if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
> >               return;
> > @@ -2119,6 +2121,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> >       }
> >  }
> >
> > +static inline void
> > +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> > +                          int objects)
> > +{
> > +     if (mem_alloc_profiling_enabled())
> > +             __alloc_tagging_slab_free_hook(s, slab, p, objects);
> > +}
> > +
> >  #else /* CONFIG_MEM_ALLOC_PROFILING */
> >
> >  static inline void
>
Steven Rostedt Jan. 28, 2025, 7:35 p.m. UTC | #3
On Mon, 27 Jan 2025 11:38:32 -0800
Suren Baghdasaryan <surenb@google.com> wrote:

> On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > On 1/26/25 08:02, Suren Baghdasaryan wrote:  
> > > When a sizable code section is protected by a disabled static key, that
> > > code gets into the instruction cache even though it's not executed and
> > > consumes the cache, increasing cache misses. This can be remedied by
> > > moving such code into a separate uninlined function. The improvement  
> 
> Sorry, I missed adding Steven Rostedt into the CC list since his
> advice was instrumental in finding the way to optimize the static key
> performance in this patch. Added now.
> 
> >
> > Weird, I thought the static_branch_likely/unlikely/maybe was already
> > handling this by the unlikely case being a jump to a block away from the
> > fast-path stream of instructions, thus making it less likely to get cached.
> > AFAIU even plain likely()/unlikely() should do this, along with branch
> > prediction hints.  
> 
> This was indeed an unexpected overhead when I measured it on Android.
> Cache pollution was my understanding of the cause for this high
> overhead after Steven told me to try uninlining the protected code. He
> has done something similar in the tracing subsystem. But maybe I
> misunderstood the real reason. Steven, could you please verify if my
> understanding of the high overhead cause is correct here? Maybe there
> is something else at play that I missed?

From what I understand, the compiler will only move code to the end of
a function with unlikely(). But the code after the function could still
be in the control flow path. If you have several functions that are
called together, adding code to the unlikely() cases may not help the
speed.
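
Roughly, with hot_path()/rare_case() as made-up names:

	void hot_path(void)
	{
		a();
		if (unlikely(cond))
			rare_case();	/* emitted after the last return */
		b();
	}

The rare_case() block gets moved past the end of hot_path(), but it
still sits in .text right next to the following function, so when
several such functions are called back to back, those blocks still
occupy cache lines in between them.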

I made an effort to make the tracepoint code call functions instead of
having everything inlined. It actually brought down the size of the
kernel text, though looking in the change logs I never posted
benchmarks. But making the scheduler text section smaller probably did
help.

> > That would be in line with my understanding above. Does the arm64 compiler
> > not do it as well as x86 (could be maybe found out by disassembling) or the
> > Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> > a function call stops it?  
> 
> I'll disassemble the code and will see what it looks like.

I think I asked you to do that too ;-)

> 
> >  
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>  
> >
> > Kinda sad that despite the static key we have to control a lot by the
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.  
> 
> I agree. If there is a better way to fix this regression I'm open to
> changes. Let's wait for Steven to confirm my understanding before
> proceeding.

How slow is it to always do the call instead of inlining?

-- Steve
Peter Zijlstra Jan. 28, 2025, 10:49 p.m. UTC | #4
On Sun, Jan 26, 2025 at 05:47:08PM +0100, Vlastimil Babka wrote:
> On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > When a sizable code section is protected by a disabled static key, that
> > code gets into the instruction cache even though it's not executed and
> > consumes the cache, increasing cache misses. This can be remedied by
> > moving such code into a separate uninlined function. The improvement
> 
> Weird, I thought the static_branch_likely/unlikely/maybe was already
> handling this by the unlikely case being a jump to a block away from the
> fast-path stream of instructions, thus making it less likely to get cached.
> AFAIU even plain likely()/unlikely() should do this, along with branch
> prediction hints.

Very much depends on the compiler :-(

sometimes unlikely just moves it to the end of the function, sometimes
it's moved to .text.unlikely.

Some compilers have label attributes:

l_yes: __attribute__((cold));

but the same compilers utterly ignore it when it's combined with
asm-goto or something -- we could never get it to work reliably.
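
For context, the jump label fast path is an asm goto along these lines
(heavily simplified from the real arch code):

	static __always_inline bool arch_static_branch(struct static_key *key)
	{
		asm goto("1:	nop\n\t"
			 ".pushsection __jump_table, \"aw\"\n\t"
			 /* record 1b and %l[l_yes] for runtime patching */
			 ".popsection"
			 : : : : l_yes);
		return false;
	l_yes:
		return true;
	}

and l_yes is exactly the label we wanted the cold attribute on.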


It's been a while since I looked at this, so I'm not entirely sure what
the current version of compilers do.
Suren Baghdasaryan Jan. 28, 2025, 11:43 p.m. UTC | #5
On Tue, Jan 28, 2025 at 11:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 27 Jan 2025 11:38:32 -0800
> Suren Baghdasaryan <surenb@google.com> wrote:
>
> > On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > >
> > > On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > > > When a sizable code section is protected by a disabled static key, that
> > > > code gets into the instruction cache even though it's not executed and
> > > > consumes the cache, increasing cache misses. This can be remedied by
> > > > moving such code into a separate uninlined function. The improvement
> >
> > Sorry, I missed adding Steven Rostedt into the CC list since his
> > advice was instrumental in finding the way to optimize the static key
> > performance in this patch. Added now.
> >
> > >
> > > Weird, I thought the static_branch_likely/unlikely/maybe was already
> > > handling this by the unlikely case being a jump to a block away from the
> > > fast-path stream of instructions, thus making it less likely to get cached.
> > > AFAIU even plain likely()/unlikely() should do this, along with branch
> > > prediction hints.
> >
> > This was indeed an unexpected overhead when I measured it on Android.
> > Cache pollution was my understanding of the cause for this high
> > overhead after Steven told me to try uninlining the protected code. He
> > has done something similar in the tracing subsystem. But maybe I
> > misunderstood the real reason. Steven, could you please verify if my
> > understanding of the high overhead cause is correct here? Maybe there
> > is something else at play that I missed?
>
> From what I understand, is that the compiler will only move code to the end
> of a function with the unlikely(). But, the code after the function could
> also be in the control flow path. If you have several functions that are
> called together, by adding code to the unlikely() cases may not help the
> speed.
>
> I made an effort to make the tracepoint code call functions instead of
> having everything inlined. It actually brought down the size of the text of
> the kernel, but looking in the change logs I never posted benchmarks. But
> I'm sure making the size of the scheduler text section smaller probably did
> help.
>
> > > That would be in line with my understanding above. Does the arm64 compiler
> > > not do it as well as x86 (could be maybe found out by disassembling) or the
> > > Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> > > a function call stops it?
> >
> > I'll disassemble the code and will see what it looks like.
>
> I think I asked you to do that too ;-)

Yes you did! And I disassembled almost all of these functions during
my investigation, but in my infinite wisdom I did not save any of them.
So now I need to do that again to answer Vlastimil's question. I'll
try to do that today.

>
> >
> > >
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > >
> > > Kinda sad that despite the static key we have to control a lot by the
> > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.
> >
> > I agree. If there is a better way to fix this regression I'm open to
> > changes. Let's wait for Steven to confirm my understanding before
> > proceeding.
>
> How slow is it to always do the call instead of inlining?

Let's see... The additional overhead if we always call is:

Little core: 2.42%
Middle core: 1.23%
Big core: 0.66%

Not a huge deal because the overhead of memory profiling when enabled
is much higher. So, maybe for simplicity I should indeed always call?

>
> -- Steve
Steven Rostedt Jan. 29, 2025, 12:03 a.m. UTC | #6
On Tue, 28 Jan 2025 15:43:13 -0800
Suren Baghdasaryan <surenb@google.com> wrote:

> > How slow is it to always do the call instead of inlining?  
> 
> Let's see... The additional overhead if we always call is:
> 
> Little core: 2.42%
> Middle core: 1.23%
> Big core: 0.66%
> 
> Not a huge deal because the overhead of memory profiling when enabled
> is much higher. So, maybe for simplicity I should indeed always call?

That's what I was thinking, unless the other maintainers are OK with this
special logic.

-- Steve
Suren Baghdasaryan Jan. 29, 2025, 2:54 a.m. UTC | #7
On Tue, Jan 28, 2025 at 3:43 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jan 28, 2025 at 11:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Mon, 27 Jan 2025 11:38:32 -0800
> > Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > > On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > > >
> > > > On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > > > > When a sizable code section is protected by a disabled static key, that
> > > > > code gets into the instruction cache even though it's not executed and
> > > > > consumes the cache, increasing cache misses. This can be remedied by
> > > > > moving such code into a separate uninlined function. The improvement
> > >
> > > Sorry, I missed adding Steven Rostedt into the CC list since his
> > > advice was instrumental in finding the way to optimize the static key
> > > performance in this patch. Added now.
> > >
> > > >
> > > > Weird, I thought the static_branch_likely/unlikely/maybe was already
> > > > handling this by the unlikely case being a jump to a block away from the
> > > > fast-path stream of instructions, thus making it less likely to get cached.
> > > > AFAIU even plain likely()/unlikely() should do this, along with branch
> > > > prediction hints.
> > >
> > > This was indeed an unexpected overhead when I measured it on Android.
> > > Cache pollution was my understanding of the cause for this high
> > > overhead after Steven told me to try uninlining the protected code. He
> > > has done something similar in the tracing subsystem. But maybe I
> > > misunderstood the real reason. Steven, could you please verify if my
> > > understanding of the high overhead cause is correct here? Maybe there
> > > is something else at play that I missed?
> >
> > From what I understand, is that the compiler will only move code to the end
> > of a function with the unlikely(). But, the code after the function could
> > also be in the control flow path. If you have several functions that are
> > called together, by adding code to the unlikely() cases may not help the
> > speed.
> >
> > I made an effort to make the tracepoint code call functions instead of
> > having everything inlined. It actually brought down the size of the text of
> > the kernel, but looking in the change logs I never posted benchmarks. But
> > I'm sure making the size of the scheduler text section smaller probably did
> > help.
> >
> > > > That would be in line with my understanding above. Does the arm64 compiler
> > > > not do it as well as x86 (could be maybe found out by disassembling) or the
> > > > Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> > > > a function call stops it?
> > >
> > > I'll disassemble the code and will see what it looks like.
> >
> > I think I asked you to do that too ;-)
>
> Yes you did! And I disassembled almost each of these functions during
> my investigation but in my infinite wisdom I did not save any of them.
> So, now I need to do that again to answer Vlastimil's question. I'll
> try to do that today.

Yeah, quite a difference. This is alloc_tagging_slab_alloc_hook() with
the outlined version of __alloc_tagging_slab_alloc_hook():

ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
ffffffc0803a2dd8: d503201f      nop
ffffffc0803a2ddc: d65f03c0      ret
ffffffc0803a2de0: d503233f      paciasp
ffffffc0803a2de4: a9bf7bfd      stp x29, x30, [sp, #-0x10]!
ffffffc0803a2de8: 910003fd      mov x29, sp
ffffffc0803a2dec: 94000004      bl 0xffffffc0803a2dfc
<__alloc_tagging_slab_alloc_hook>
ffffffc0803a2df0: a8c17bfd      ldp x29, x30, [sp], #0x10
ffffffc0803a2df4: d50323bf      autiasp
ffffffc0803a2df8: d65f03c0      ret

This is the same function with the inlined version of
__alloc_tagging_slab_alloc_hook():

ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
ffffffc0803a2dd8: d503233f      paciasp
ffffffc0803a2ddc: d10103ff      sub sp, sp, #0x40
ffffffc0803a2de0: a9017bfd      stp x29, x30, [sp, #0x10]
ffffffc0803a2de4: f90013f5      str x21, [sp, #0x20]
ffffffc0803a2de8: a9034ff4      stp x20, x19, [sp, #0x30]
ffffffc0803a2dec: 910043fd      add x29, sp, #0x10
ffffffc0803a2df0: d503201f      nop
ffffffc0803a2df4: a9434ff4      ldp x20, x19, [sp, #0x30]
ffffffc0803a2df8: f94013f5      ldr x21, [sp, #0x20]
ffffffc0803a2dfc: a9417bfd      ldp x29, x30, [sp, #0x10]
ffffffc0803a2e00: 910103ff      add sp, sp, #0x40
ffffffc0803a2e04: d50323bf      autiasp
ffffffc0803a2e08: d65f03c0      ret
ffffffc0803a2e0c: b4ffff41      cbz x1, 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2e10: b9400808      ldr w8, [x0, #0x8]
ffffffc0803a2e14: 12060049      and w9, w2, #0x4000000
ffffffc0803a2e18: 12152108      and w8, w8, #0xff800
ffffffc0803a2e1c: 120d6108      and w8, w8, #0xfff80fff
ffffffc0803a2e20: 2a090108      orr w8, w8, w9
ffffffc0803a2e24: 35fffe88      cbnz w8, 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2e28: d378dc28      lsl x8, x1, #8
ffffffc0803a2e2c: d2c01009      mov x9, #0x8000000000 // =549755813888
ffffffc0803a2e30: f9000fa0      str x0, [x29, #0x18]
ffffffc0803a2e34: f90007e1      str x1, [sp, #0x8]
ffffffc0803a2e38: 8b882128      add x8, x9, x8, asr #8
ffffffc0803a2e3c: b25f7be9      mov x9, #-0x200000000 // =-8589934592
ffffffc0803a2e40: f2b80009      movk x9, #0xc000, lsl #16
ffffffc0803a2e44: d34cfd08      lsr x8, x8, #12
ffffffc0803a2e48: 8b081928      add x8, x9, x8, lsl #6
ffffffc0803a2e4c: f9400509      ldr x9, [x8, #0x8]
ffffffc0803a2e50: d100052a      sub x10, x9, #0x1
ffffffc0803a2e54: 7200013f      tst w9, #0x1
ffffffc0803a2e58: 9a8a0108      csel x8, x8, x10, eq
ffffffc0803a2e5c: 3940cd09      ldrb w9, [x8, #0x33]
ffffffc0803a2e60: 7103d53f      cmp w9, #0xf5
ffffffc0803a2e64: 9a9f0113      csel x19, x8, xzr, eq
ffffffc0803a2e68: f9401e68      ldr x8, [x19, #0x38]
ffffffc0803a2e6c: f1001d1f      cmp x8, #0x7
ffffffc0803a2e70: 540000a8      b.hi 0xffffffc0803a2e84
<alloc_tagging_slab_alloc_hook+0xac>
ffffffc0803a2e74: aa1303e0      mov x0, x19
ffffffc0803a2e78: 2a1f03e3      mov w3, wzr
ffffffc0803a2e7c: 97ffd6a5      bl 0xffffffc080398910 <alloc_slab_obj_exts>
ffffffc0803a2e80: 350009c0      cbnz w0, 0xffffffc0803a2fb8
<alloc_tagging_slab_alloc_hook+0x1e0>
ffffffc0803a2e84: b000f2c8      adrp x8, 0xffffffc0821fb000
<max_load_balance_interval>
ffffffc0803a2e88: f9401e6a      ldr x10, [x19, #0x38]
ffffffc0803a2e8c: f9453909      ldr x9, [x8, #0xa70]
ffffffc0803a2e90: 927df148      and x8, x10, #0xfffffffffffffff8
ffffffc0803a2e94: b40000e9      cbz x9, 0xffffffc0803a2eb0
<alloc_tagging_slab_alloc_hook+0xd8>
ffffffc0803a2e98: f94007ea      ldr x10, [sp, #0x8]
ffffffc0803a2e9c: cb090149      sub x9, x10, x9
ffffffc0803a2ea0: f142013f      cmp x9, #0x80, lsl #12 // =0x80000
ffffffc0803a2ea4: 54000062      b.hs 0xffffffc0803a2eb0
<alloc_tagging_slab_alloc_hook+0xd8>
ffffffc0803a2ea8: aa1f03e9      mov x9, xzr
ffffffc0803a2eac: 14000015      b 0xffffffc0803a2f00
<alloc_tagging_slab_alloc_hook+0x128>
ffffffc0803a2eb0: d2ffe009      mov x9, #-0x100000000000000 //
=-72057594037927936
ffffffc0803a2eb4: 14000002      b 0xffffffc0803a2ebc
<alloc_tagging_slab_alloc_hook+0xe4>
ffffffc0803a2eb8: aa1f03e9      mov x9, xzr
ffffffc0803a2ebc: d2dffa0a      mov x10, #0xffd000000000 // =281268818280448
ffffffc0803a2ec0: f2e01fea      movk x10, #0xff, lsl #48
ffffffc0803a2ec4: 8b13194a      add x10, x10, x19, lsl #6
ffffffc0803a2ec8: 9274ad4a      and x10, x10, #0xfffffffffff000
ffffffc0803a2ecc: aa0a012a      orr x10, x9, x10
ffffffc0803a2ed0: f9400fa9      ldr x9, [x29, #0x18]
ffffffc0803a2ed4: f940112b      ldr x11, [x9, #0x20]
ffffffc0803a2ed8: f94007e9      ldr x9, [sp, #0x8]
ffffffc0803a2edc: cb0a0129      sub x9, x9, x10
ffffffc0803a2ee0: d360fd6c      lsr x12, x11, #32
ffffffc0803a2ee4: 9bab7d2a      umull x10, w9, w11
ffffffc0803a2ee8: d368fd6b      lsr x11, x11, #40
ffffffc0803a2eec: d360fd4a      lsr x10, x10, #32
ffffffc0803a2ef0: 4b0a0129      sub w9, w9, w10
ffffffc0803a2ef4: 1acc2529      lsr w9, w9, w12
ffffffc0803a2ef8: 0b0a0129      add w9, w9, w10
ffffffc0803a2efc: 1acb2529      lsr w9, w9, w11
ffffffc0803a2f00: ab091109      adds x9, x8, x9, lsl #4
ffffffc0803a2f04: f9400fa8      ldr x8, [x29, #0x18]
ffffffc0803a2f08: 54fff760      b.eq 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2f0c: b1002129      adds x9, x9, #0x8
ffffffc0803a2f10: 54fff720      b.eq 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2f14: d5384113      mrs x19, SP_EL0
ffffffc0803a2f18: f9402a74      ldr x20, [x19, #0x50]
ffffffc0803a2f1c: b4fff6d4      cbz x20, 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2f20: b9401915      ldr w21, [x8, #0x18]
ffffffc0803a2f24: f9000134      str x20, [x9]
ffffffc0803a2f28: b9401268      ldr w8, [x19, #0x10]
ffffffc0803a2f2c: 11000508      add w8, w8, #0x1
ffffffc0803a2f30: b9001268      str w8, [x19, #0x10]
ffffffc0803a2f34: f9401288      ldr x8, [x20, #0x20]
ffffffc0803a2f38: d538d089      mrs x9, TPIDR_EL1
ffffffc0803a2f3c: 8b090108      add x8, x8, x9
ffffffc0803a2f40: 52800029      mov w9, #0x1        // =1
ffffffc0803a2f44: 91002108      add x8, x8, #0x8
ffffffc0803a2f48: c85f7d0b      ldxr x11, [x8]
ffffffc0803a2f4c: 8b09016b      add x11, x11, x9
ffffffc0803a2f50: c80a7d0b      stxr w10, x11, [x8]
ffffffc0803a2f54: 35ffffaa      cbnz w10, 0xffffffc0803a2f48
<alloc_tagging_slab_alloc_hook+0x170>
ffffffc0803a2f58: f9400a68      ldr x8, [x19, #0x10]
ffffffc0803a2f5c: f1000508      subs x8, x8, #0x1
ffffffc0803a2f60: b9001268      str w8, [x19, #0x10]
ffffffc0803a2f64: 540003c0      b.eq 0xffffffc0803a2fdc
<alloc_tagging_slab_alloc_hook+0x204>
ffffffc0803a2f68: f9400a68      ldr x8, [x19, #0x10]
ffffffc0803a2f6c: b4000388      cbz x8, 0xffffffc0803a2fdc
<alloc_tagging_slab_alloc_hook+0x204>
ffffffc0803a2f70: b9401268      ldr w8, [x19, #0x10]
ffffffc0803a2f74: 11000508      add w8, w8, #0x1
ffffffc0803a2f78: b9001268      str w8, [x19, #0x10]
ffffffc0803a2f7c: f9401288      ldr x8, [x20, #0x20]
ffffffc0803a2f80: d538d089      mrs x9, TPIDR_EL1
ffffffc0803a2f84: 8b080128      add x8, x9, x8
ffffffc0803a2f88: c85f7d0a      ldxr x10, [x8]
ffffffc0803a2f8c: 8b15014a      add x10, x10, x21
ffffffc0803a2f90: c8097d0a      stxr w9, x10, [x8]
ffffffc0803a2f94: 35ffffa9      cbnz w9, 0xffffffc0803a2f88
<alloc_tagging_slab_alloc_hook+0x1b0>
ffffffc0803a2f98: f9400a68      ldr x8, [x19, #0x10]
ffffffc0803a2f9c: f1000508      subs x8, x8, #0x1
ffffffc0803a2fa0: b9001268      str w8, [x19, #0x10]
ffffffc0803a2fa4: 54000060      b.eq 0xffffffc0803a2fb0
<alloc_tagging_slab_alloc_hook+0x1d8>
ffffffc0803a2fa8: f9400a68      ldr x8, [x19, #0x10]
ffffffc0803a2fac: b5fff248      cbnz x8, 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2fb0: 94344478      bl 0xffffffc0810b4190 <preempt_schedule_notrace>
ffffffc0803a2fb4: 17ffff90      b 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2fb8: f9400fa8      ldr x8, [x29, #0x18]
ffffffc0803a2fbc: f00092c0      adrp x0, 0xffffffc0815fd000
<f_midi_shortname+0x4cf4>
ffffffc0803a2fc0: 910e5400      add x0, x0, #0x395
ffffffc0803a2fc4: d00099c1      adrp x1, 0xffffffc0816dc000 <longname+0x2727d>
ffffffc0803a2fc8: 911d1421      add x1, x1, #0x745
ffffffc0803a2fcc: f9403102      ldr x2, [x8, #0x60]
ffffffc0803a2fd0: 97f46d47      bl 0xffffffc0800be4ec <__warn_printk>
ffffffc0803a2fd4: d4210000      brk #0x800
ffffffc0803a2fd8: 17ffff87      b 0xffffffc0803a2df4
<alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2fdc: 9434446d      bl 0xffffffc0810b4190 <preempt_schedule_notrace>
ffffffc0803a2fe0: 17ffffe4      b 0xffffffc0803a2f70
<alloc_tagging_slab_alloc_hook+0x198>
>
> >
> > >
> > > >
> > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > >
> > > > Kinda sad that despite the static key we have to control a lot by the
> > > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.
> > >
> > > I agree. If there is a better way to fix this regression I'm open to
> > > changes. Let's wait for Steven to confirm my understanding before
> > > proceeding.
> >
> > How slow is it to always do the call instead of inlining?
>
> Let's see... The additional overhead if we always call is:
>
> Little core: 2.42%
> Middle core: 1.23%
> Big core: 0.66%
>
> Not a huge deal because the overhead of memory profiling when enabled
> is much higher. So, maybe for simplicity I should indeed always call?
>
> >
> > -- Steve
Vlastimil Babka Jan. 29, 2025, 9:38 a.m. UTC | #8
On 1/29/25 03:54, Suren Baghdasaryan wrote:
> On Tue, Jan 28, 2025 at 3:43 PM Suren Baghdasaryan <surenb@google.com> wrote:
>>
>> On Tue, Jan 28, 2025 at 11:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>> >
>> > On Mon, 27 Jan 2025 11:38:32 -0800
>> > Suren Baghdasaryan <surenb@google.com> wrote:
>> >
>> > > On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>> > > >
>> > > > On 1/26/25 08:02, Suren Baghdasaryan wrote:
>> > > > > When a sizable code section is protected by a disabled static key, that
>> > > > > code gets into the instruction cache even though it's not executed and
>> > > > > consumes the cache, increasing cache misses. This can be remedied by
>> > > > > moving such code into a separate uninlined function. The improvement
>> > >
>> > > Sorry, I missed adding Steven Rostedt into the CC list since his
>> > > advice was instrumental in finding the way to optimize the static key
>> > > performance in this patch. Added now.
>> > >
>> > > >
>> > > > Weird, I thought the static_branch_likely/unlikely/maybe was already
>> > > > handling this by the unlikely case being a jump to a block away from the
>> > > > fast-path stream of instructions, thus making it less likely to get cached.
>> > > > AFAIU even plain likely()/unlikely() should do this, along with branch
>> > > > prediction hints.
>> > >
>> > > This was indeed an unexpected overhead when I measured it on Android.
>> > > Cache pollution was my understanding of the cause for this high
>> > > overhead after Steven told me to try uninlining the protected code. He
>> > > has done something similar in the tracing subsystem. But maybe I
>> > > misunderstood the real reason. Steven, could you please verify if my
>> > > understanding of the high overhead cause is correct here? Maybe there
>> > > is something else at play that I missed?
>> >
>> > From what I understand, is that the compiler will only move code to the end
>> > of a function with the unlikely(). But, the code after the function could
>> > also be in the control flow path. If you have several functions that are
>> > called together, by adding code to the unlikely() cases may not help the
>> > speed.
>> >
>> > I made an effort to make the tracepoint code call functions instead of
>> > having everything inlined. It actually brought down the size of the text of
>> > the kernel, but looking in the change logs I never posted benchmarks. But
>> > I'm sure making the size of the scheduler text section smaller probably did
>> > help.
>> >
>> > > > That would be in line with my understanding above. Does the arm64 compiler
>> > > > not do it as well as x86 (could be maybe found out by disassembling) or the
>> > > > Pixel6 cpu somhow caches these out of line blocks more aggressively and only
>> > > > a function call stops it?
>> > >
>> > > I'll disassemble the code and will see what it looks like.
>> >
>> > I think I asked you to do that too ;-)
>>
>> Yes you did! And I disassembled almost each of these functions during
>> my investigation but in my infinite wisdom I did not save any of them.
>> So, now I need to do that again to answer Vlastimil's question. I'll
>> try to do that today.
> 
> Yeah, quite a difference. This is alloc_tagging_slab_alloc_hook() with
> outlined version of __alloc_tagging_slab_alloc_hook():

Not fluent in arm64 assembly but let's see...

> ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
> ffffffc0803a2dd8: d503201f      nop
> ffffffc0803a2ddc: d65f03c0      ret

So that's an immediate return unless the static key rewrites the nop.

BTW, I wouldn't expect the alloc_tagging_slab_alloc_hook() to exist as a
separate function in the first place, since it's "static inline". It seems
weird to do a function call to a static key test. We should perhaps force
inline it.
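
Something like (keeping the out-of-line __alloc_tagging_slab_alloc_hook()
from this patch):

	static __always_inline void
	alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object,
				      gfp_t flags)
	{
		if (need_slab_obj_ext())
			__alloc_tagging_slab_alloc_hook(s, object, flags);
	}

so the caller is left with just the patched nop instead of a call into a
two-instruction wrapper.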

> ffffffc0803a2de0: d503233f      paciasp
> ffffffc0803a2de4: a9bf7bfd      stp x29, x30, [sp, #-0x10]!
> ffffffc0803a2de8: 910003fd      mov x29, sp
> ffffffc0803a2dec: 94000004      bl 0xffffffc0803a2dfc
> <__alloc_tagging_slab_alloc_hook>
> ffffffc0803a2df0: a8c17bfd      ldp x29, x30, [sp], #0x10
> ffffffc0803a2df4: d50323bf      autiasp
> ffffffc0803a2df8: d65f03c0      ret
> 
> This is the same function with inlined version of
> __alloc_tagging_slab_alloc_hook():
> 
> ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
> ffffffc0803a2dd8: d503233f      paciasp
> ffffffc0803a2ddc: d10103ff      sub sp, sp, #0x40
> ffffffc0803a2de0: a9017bfd      stp x29, x30, [sp, #0x10]
> ffffffc0803a2de4: f90013f5      str x21, [sp, #0x20]
> ffffffc0803a2de8: a9034ff4      stp x20, x19, [sp, #0x30]
> ffffffc0803a2dec: 910043fd      add x29, sp, #0x10
> ffffffc0803a2df0: d503201f      nop
> ffffffc0803a2df4: a9434ff4      ldp x20, x19, [sp, #0x30]
> ffffffc0803a2df8: f94013f5      ldr x21, [sp, #0x20]
> ffffffc0803a2dfc: a9417bfd      ldp x29, x30, [sp, #0x10]
> ffffffc0803a2e00: 910103ff      add sp, sp, #0x40
> ffffffc0803a2e04: d50323bf      autiasp
> ffffffc0803a2e08: d65f03c0      ret

Seems to me this will also return unless the nop is rewritten, but
instead of making a call reachable there will be a jump to the block below?
Now, is the overhead larger because the code below gets cached, or because
the block above is doing more in the disabled case? It looks quite suboptimal.

> ffffffc0803a2e0c: b4ffff41      cbz x1, 0xffffffc0803a2df4
> <alloc_tagging_slab_alloc_hook+0x1c>
> ffffffc0803a2e10: b9400808      ldr w8, [x0, #0x8]
> ffffffc0803a2e14: 12060049      and w9, w2, #0x4000000
> ffffffc0803a2e18: 12152108      and w8, w8, #0xff800
> ffffffc0803a2e1c: 120d6108      and w8, w8, #0xfff80fff
> ffffffc0803a2e20: 2a090108      orr w8, w8, w9
Vlastimil Babka Jan. 29, 2025, 9:50 a.m. UTC | #9
On 1/29/25 01:03, Steven Rostedt wrote:
> On Tue, 28 Jan 2025 15:43:13 -0800
> Suren Baghdasaryan <surenb@google.com> wrote:
> 
>> > How slow is it to always do the call instead of inlining?  
>> 
>> Let's see... The additional overhead if we always call is:
>> 
>> Little core: 2.42%
>> Middle core: 1.23%
>> Big core: 0.66%
>> 
>> Not a huge deal because the overhead of memory profiling when enabled
>> is much higher. So, maybe for simplicity I should indeed always call?
> 
> That's what I was thinking, unless the other maintainers are OK with this
> special logic.

If it's acceptable, I would prefer to always call. But at the same time make
sure the static key test is really inlined, i.e. force inline
alloc_tagging_slab_alloc_hook() (see my other reply looking at the disassembly).

Well, or rather just open-code the contents of
alloc_tagging_slab_alloc_hook() and alloc_tagging_slab_free_hook() (as
they look after this patch) into the callers. It's just two lines each.
The extra layer is just an unnecessary distraction.
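
I.e. have the call sites do directly:

	/* on the alloc side */
	if (need_slab_obj_ext())
		__alloc_tagging_slab_alloc_hook(s, object, flags);

	/* on the free side */
	if (mem_alloc_profiling_enabled())
		__alloc_tagging_slab_free_hook(s, slab, p, objects);

instead of going through a wrapper whose only job is the static key
check.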

Then it's probably inevitable that the actual hook content after the
static key test should not be inlined even with
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, as the result would be
inlined into too many places. But since we remove one call layer anyway
thanks to the above, even without the full inlining the resulting
performance could hopefully be fine (compared to the state before your
series).

> -- Steve
Suren Baghdasaryan Jan. 29, 2025, 5:26 p.m. UTC | #10
On Wed, Jan 29, 2025 at 1:50 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/29/25 01:03, Steven Rostedt wrote:
> > On Tue, 28 Jan 2025 15:43:13 -0800
> > Suren Baghdasaryan <surenb@google.com> wrote:
> >
> >> > How slow is it to always do the call instead of inlining?
> >>
> >> Let's see... The additional overhead if we always call is:
> >>
> >> Little core: 2.42%
> >> Middle core: 1.23%
> >> Big core: 0.66%
> >>
> >> Not a huge deal because the overhead of memory profiling when enabled
> >> is much higher. So, maybe for simplicity I should indeed always call?
> >
> > That's what I was thinking, unless the other maintainers are OK with this
> > special logic.
>
> If it's acceptable, I would prefer to always call.

Ok, I'll post that version. If this becomes an issue we can reconsider later.

> But at the same time make
> sure the static key test is really inlined, i.e. force inline
> alloc_tagging_slab_alloc_hook() (see my other reply looking at the disassembly).

Sorry, I should have made it clear that I uninlined
alloc_tagging_slab_alloc_hook() only to localize the relevant code. In
reality it is inlined. Since the inlined outputs are quite big, I'm
attaching the disassembly of kmem_cache_alloc_noprof(), which has
alloc_tagging_slab_alloc_hook() inlined into it.

>
> Well or rather just open-code the contents of the
> alloc_tagging_slab_alloc_hook and alloc_tagging_slab_free_hook (as they look
> after this patch) into the callers. It's just two lines. The extra layer is
> just unnecessary distraction.

alloc_tagging_slab_alloc_hook() is inlined, no need to open-code.

>
> Then it's probably inevitable the actual hook content after the static key
> test should not be inline even with
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT as the result would be inlined
> into too many places. But since we remove one call layer anyway thanks to
> above, even without the full inlining the resulting performance could
> hopefully be fine (compared to the state before your series).

Agree. Thanks for the feedback!
I'll prepare v2 where the uninlining (always calling) no longer depends
on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.

>
> > -- Steve
>
ffffffc080398e08 <kmem_cache_alloc_noprof>:
ffffffc080398e08: d503233f     	paciasp
ffffffc080398e0c: d101c3ff     	sub	sp, sp, #0x70
ffffffc080398e10: a9017bfd     	stp	x29, x30, [sp, #0x10]
ffffffc080398e14: a9026ffc     	stp	x28, x27, [sp, #0x20]
ffffffc080398e18: a90367fa     	stp	x26, x25, [sp, #0x30]
ffffffc080398e1c: a9045ff8     	stp	x24, x23, [sp, #0x40]
ffffffc080398e20: a90557f6     	stp	x22, x21, [sp, #0x50]
ffffffc080398e24: a9064ff4     	stp	x20, x19, [sp, #0x60]
ffffffc080398e28: 910043fd     	add	x29, sp, #0x10
ffffffc080398e2c: d5384108     	mrs	x8, SP_EL0
ffffffc080398e30: aa0003f3     	mov	x19, x0
ffffffc080398e34: 2a0103f4     	mov	w20, w1
ffffffc080398e38: f9430908     	ldr	x8, [x8, #0x610]
ffffffc080398e3c: f90007e8     	str	x8, [sp, #0x8]
ffffffc080398e40: d50320ff     	xpaclri
ffffffc080398e44: aa1e03f5     	mov	x21, x30
ffffffc080398e48: b4000d60     	cbz	x0, 0xffffffc080398ff4 <kmem_cache_alloc_noprof+0x1ec>
ffffffc080398e4c: b9401e77     	ldr	w23, [x19, #0x1c]
ffffffc080398e50: d503201f     	nop
ffffffc080398e54: f90003ff     	str	xzr, [sp]
ffffffc080398e58: d538411a     	mrs	x26, SP_EL0
ffffffc080398e5c: f9400268     	ldr	x8, [x19]
ffffffc080398e60: d538d089     	mrs	x9, TPIDR_EL1
ffffffc080398e64: 8b080128     	add	x8, x9, x8
ffffffc080398e68: f9400518     	ldr	x24, [x8, #0x8]
ffffffc080398e6c: f9400116     	ldr	x22, [x8]
ffffffc080398e70: f9400908     	ldr	x8, [x8, #0x10]
ffffffc080398e74: f10002df     	cmp	x22, #0x0
ffffffc080398e78: fa401904     	ccmp	x8, #0x0, #0x4, ne
ffffffc080398e7c: 54000da0     	b.eq	0xffffffc080399030 <kmem_cache_alloc_noprof+0x228>
ffffffc080398e80: d378dec8     	lsl	x8, x22, #8
ffffffc080398e84: b9402a69     	ldr	w9, [x19, #0x28]
ffffffc080398e88: f9405e6a     	ldr	x10, [x19, #0xb8]
ffffffc080398e8c: 91008303     	add	x3, x24, #0x20
ffffffc080398e90: 8b882128     	add	x8, x9, x8, asr #8
ffffffc080398e94: f9400109     	ldr	x9, [x8]
ffffffc080398e98: b940134b     	ldr	w11, [x26, #0x10]
ffffffc080398e9c: dac00d08     	rev	x8, x8
ffffffc080398ea0: ca080148     	eor	x8, x10, x8
ffffffc080398ea4: 1100056b     	add	w11, w11, #0x1
ffffffc080398ea8: ca090119     	eor	x25, x8, x9
ffffffc080398eac: b900134b     	str	w11, [x26, #0x10]
ffffffc080398eb0: f940026b     	ldr	x11, [x19]
ffffffc080398eb4: d538d08c     	mrs	x12, TPIDR_EL1
ffffffc080398eb8: 8b0b0184     	add	x4, x12, x11
ffffffc080398ebc: 14000015     	b	0xffffffc080398f10 <kmem_cache_alloc_noprof+0x108>
ffffffc080398ec0: aa1603e0     	mov	x0, x22
ffffffc080398ec4: aa1803e1     	mov	x1, x24
ffffffc080398ec8: aa1903e2     	mov	x2, x25
ffffffc080398ecc: 48207c82     	casp	x0, x1, x2, x3, [x4]
ffffffc080398ed0: f9400b48     	ldr	x8, [x26, #0x10]
ffffffc080398ed4: f1000508     	subs	x8, x8, #0x1
ffffffc080398ed8: b9001348     	str	w8, [x26, #0x10]
ffffffc080398edc: 540000e0     	b.eq	0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee0: f9400b48     	ldr	x8, [x26, #0x10]
ffffffc080398ee4: b40000a8     	cbz	x8, 0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee8: eb18003f     	cmp	x1, x24
ffffffc080398eec: fa560000     	ccmp	x0, x22, #0x0, eq
ffffffc080398ef0: 54000200     	b.eq	0xffffffc080398f30 <kmem_cache_alloc_noprof+0x128>
ffffffc080398ef4: 17ffffda     	b	0xffffffc080398e5c <kmem_cache_alloc_noprof+0x54>
ffffffc080398ef8: aa0103fb     	mov	x27, x1
ffffffc080398efc: aa0003fc     	mov	x28, x0
ffffffc080398f00: 94346cb4     	bl	0xffffffc0810b41d0 <preempt_schedule_notrace>
ffffffc080398f04: aa1c03e0     	mov	x0, x28
ffffffc080398f08: aa1b03e1     	mov	x1, x27
ffffffc080398f0c: 17fffff7     	b	0xffffffc080398ee8 <kmem_cache_alloc_noprof+0xe0>
ffffffc080398f10: f9800091     	prfm	pstl1strm, [x4]
ffffffc080398f14: c87f0480     	ldxp	x0, x1, [x4]
ffffffc080398f18: eb16001f     	cmp	x0, x22
ffffffc080398f1c: fa580020     	ccmp	x1, x24, #0x0, eq
ffffffc080398f20: 54000061     	b.ne	0xffffffc080398f2c <kmem_cache_alloc_noprof+0x124>
ffffffc080398f24: c8280c99     	stxp	w8, x25, x3, [x4]
ffffffc080398f28: 35ffff68     	cbnz	w8, 0xffffffc080398f14 <kmem_cache_alloc_noprof+0x10c>
ffffffc080398f2c: 17ffffe9     	b	0xffffffc080398ed0 <kmem_cache_alloc_noprof+0xc8>
ffffffc080398f30: b9402a68     	ldr	w8, [x19, #0x28]
ffffffc080398f34: 8b080328     	add	x8, x25, x8
ffffffc080398f38: f9800110     	prfm	pstl1keep, [x8]
ffffffc080398f3c: f90003f6     	str	x22, [sp]
ffffffc080398f40: d503201f     	nop
ffffffc080398f44: d503201f     	nop
ffffffc080398f48: f9402668     	ldr	x8, [x19, #0x48]
ffffffc080398f4c: b4000068     	cbz	x8, 0xffffffc080398f58 <kmem_cache_alloc_noprof+0x150>
ffffffc080398f50: 2a1f03f8     	mov	w24, wzr
ffffffc080398f54: 14000008     	b	0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f58: 79401268     	ldrh	w8, [x19, #0x8]
ffffffc080398f5c: 52804089     	mov	w9, #0x204      // =516
ffffffc080398f60: 6a09011f     	tst	w8, w9
ffffffc080398f64: 54000060     	b.eq	0xffffffc080398f70 <kmem_cache_alloc_noprof+0x168>
ffffffc080398f68: 53082298     	ubfx	w24, w20, #8, #1
ffffffc080398f6c: 14000002     	b	0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f70: 52800038     	mov	w24, #0x1       // =1
ffffffc080398f74: f000f308     	adrp	x8, 0xffffffc0821fb000 <max_load_balance_interval>
ffffffc080398f78: b9401e79     	ldr	w25, [x19, #0x1c]
ffffffc080398f7c: b9495908     	ldr	w8, [x8, #0x958]
ffffffc080398f80: d503201f     	nop
ffffffc080398f84: d503201f     	nop
ffffffc080398f88: 2a1803f7     	mov	w23, w24
ffffffc080398f8c: 14000007     	b	0xffffffc080398fa8 <kmem_cache_alloc_noprof+0x1a0>
ffffffc080398f90: 0a140102     	and	w2, w8, w20
ffffffc080398f94: aa1303e0     	mov	x0, x19
ffffffc080398f98: aa1603e1     	mov	x1, x22
ffffffc080398f9c: 2a1703e3     	mov	w3, w23
ffffffc080398fa0: 940077b3     	bl	0xffffffc0803b6e6c <__kasan_slab_alloc>
ffffffc080398fa4: aa0003f6     	mov	x22, x0
ffffffc080398fa8: f10002df     	cmp	x22, #0x0
ffffffc080398fac: 52000308     	eor	w8, w24, #0x1
ffffffc080398fb0: f90003f6     	str	x22, [sp]
ffffffc080398fb4: 1a9f1508     	csinc	w8, w8, wzr, ne
ffffffc080398fb8: 37000128     	tbnz	w8, #0x0, 0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fbc: 34000077     	cbz	w23, 0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc0: 14000002     	b	0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc4: 14000006     	b	0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fc8: 2a1903e2     	mov	w2, w25
ffffffc080398fcc: aa1603e0     	mov	x0, x22
ffffffc080398fd0: 2a1f03e1     	mov	w1, wzr
ffffffc080398fd4: 94337c7b     	bl	0xffffffc0810781c0 <memset>
ffffffc080398fd8: f94003f6     	ldr	x22, [sp]
ffffffc080398fdc: d503201f     	nop
ffffffc080398fe0: 14000004     	b	0xffffffc080398ff0 <kmem_cache_alloc_noprof+0x1e8>
ffffffc080398fe4: 37b00534     	tbnz	w20, #0x16, 0xffffffc080399088 <kmem_cache_alloc_noprof+0x280>
ffffffc080398fe8: 39402668     	ldrb	w8, [x19, #0x9]
ffffffc080398fec: 372804e8     	tbnz	w8, #0x5, 0xffffffc080399088 <kmem_cache_alloc_noprof+0x280>
ffffffc080398ff0: f94003e0     	ldr	x0, [sp]
ffffffc080398ff4: d503201f     	nop
ffffffc080398ff8: d5384108     	mrs	x8, SP_EL0
ffffffc080398ffc: f9430908     	ldr	x8, [x8, #0x610]
ffffffc080399000: f94007e9     	ldr	x9, [sp, #0x8]
ffffffc080399004: eb09011f     	cmp	x8, x9
ffffffc080399008: 54000581     	b.ne	0xffffffc0803990b8 <kmem_cache_alloc_noprof+0x2b0>
ffffffc08039900c: a9464ff4     	ldp	x20, x19, [sp, #0x60]
ffffffc080399010: a94557f6     	ldp	x22, x21, [sp, #0x50]
ffffffc080399014: a9445ff8     	ldp	x24, x23, [sp, #0x40]
ffffffc080399018: a94367fa     	ldp	x26, x25, [sp, #0x30]
ffffffc08039901c: a9426ffc     	ldp	x28, x27, [sp, #0x20]
ffffffc080399020: a9417bfd     	ldp	x29, x30, [sp, #0x10]
ffffffc080399024: 9101c3ff     	add	sp, sp, #0x70
ffffffc080399028: d50323bf     	autiasp
ffffffc08039902c: d65f03c0     	ret
ffffffc080399030: d5384118     	mrs	x24, SP_EL0
ffffffc080399034: b9401308     	ldr	w8, [x24, #0x10]
ffffffc080399038: aa1303e0     	mov	x0, x19
ffffffc08039903c: 2a1403e1     	mov	w1, w20
ffffffc080399040: 12800002     	mov	w2, #-0x1       // =-1
ffffffc080399044: aa1503e3     	mov	x3, x21
ffffffc080399048: 11000508     	add	w8, w8, #0x1
ffffffc08039904c: 2a1703e5     	mov	w5, w23
ffffffc080399050: b9001308     	str	w8, [x24, #0x10]
ffffffc080399054: f9400268     	ldr	x8, [x19]
ffffffc080399058: d538d089     	mrs	x9, TPIDR_EL1
ffffffc08039905c: 8b080124     	add	x4, x9, x8
ffffffc080399060: 94001600     	bl	0xffffffc08039e860 <___slab_alloc>
ffffffc080399064: aa0003f6     	mov	x22, x0
ffffffc080399068: f9400b08     	ldr	x8, [x24, #0x10]
ffffffc08039906c: f1000508     	subs	x8, x8, #0x1
ffffffc080399070: b9001308     	str	w8, [x24, #0x10]
ffffffc080399074: 54000060     	b.eq	0xffffffc080399080 <kmem_cache_alloc_noprof+0x278>
ffffffc080399078: f9400b08     	ldr	x8, [x24, #0x10]
ffffffc08039907c: b5fff608     	cbnz	x8, 0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc080399080: 94346942     	bl	0xffffffc0810b3588 <preempt_schedule>
ffffffc080399084: 17ffffae     	b	0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc080399088: 910003e4     	mov	x4, sp
ffffffc08039908c: aa1303e0     	mov	x0, x19
ffffffc080399090: aa1f03e1     	mov	x1, xzr
ffffffc080399094: 2a1403e2     	mov	w2, w20
ffffffc080399098: 52800023     	mov	w3, #0x1        // =1
ffffffc08039909c: 940105fd     	bl	0xffffffc0803da890 <__memcg_slab_post_alloc_hook>
ffffffc0803990a0: 3707fa80     	tbnz	w0, #0x0, 0xffffffc080398ff0 <kmem_cache_alloc_noprof+0x1e8>
ffffffc0803990a4: f94003e1     	ldr	x1, [sp]
ffffffc0803990a8: aa1303e0     	mov	x0, x19
ffffffc0803990ac: 940027e0     	bl	0xffffffc0803a302c <memcg_alloc_abort_single>
ffffffc0803990b0: f90003ff     	str	xzr, [sp]
ffffffc0803990b4: 17ffffcf     	b	0xffffffc080398ff0 <kmem_cache_alloc_noprof+0x1e8>
ffffffc0803990b8: 94345d5d     	bl	0xffffffc0810b062c <__stack_chk_fail>
ffffffc0803990bc: d5384117     	mrs	x23, SP_EL0
ffffffc0803990c0: b9402ae8     	ldr	w8, [x23, #0x28]
ffffffc0803990c4: b000f30a     	adrp	x10, 0xffffffc0821fa000 <nf_conntrack_locks+0x500>
ffffffc0803990c8: 913ea14a     	add	x10, x10, #0xfa8
ffffffc0803990cc: d343fd09     	lsr	x9, x8, #3
ffffffc0803990d0: 927d6529     	and	x9, x9, #0x1ffffff8
ffffffc0803990d4: f8696949     	ldr	x9, [x10, x9]
ffffffc0803990d8: 9ac82528     	lsr	x8, x9, x8
ffffffc0803990dc: 3607f8e8     	tbz	w8, #0x0, 0xffffffc080398ff8 <kmem_cache_alloc_noprof+0x1f0>
ffffffc0803990e0: b94012e8     	ldr	w8, [x23, #0x10]
ffffffc0803990e4: aa0003f6     	mov	x22, x0
ffffffc0803990e8: aa1f03e0     	mov	x0, xzr
ffffffc0803990ec: aa1503e1     	mov	x1, x21
ffffffc0803990f0: aa1603e2     	mov	x2, x22
ffffffc0803990f4: aa1303e3     	mov	x3, x19
ffffffc0803990f8: 11000508     	add	w8, w8, #0x1
ffffffc0803990fc: 2a1403e4     	mov	w4, w20
ffffffc080399100: 12800005     	mov	w5, #-0x1       // =-1
ffffffc080399104: b90012e8     	str	w8, [x23, #0x10]
ffffffc080399108: 97fea8c1     	bl	0xffffffc08034340c <__traceiter_kmem_cache_alloc>
ffffffc08039910c: f9400ae8     	ldr	x8, [x23, #0x10]
ffffffc080399110: f1000508     	subs	x8, x8, #0x1
ffffffc080399114: b90012e8     	str	w8, [x23, #0x10]
ffffffc080399118: 54000080     	b.eq	0xffffffc080399128 <kmem_cache_alloc_noprof+0x320>
ffffffc08039911c: f9400ae8     	ldr	x8, [x23, #0x10]
ffffffc080399120: aa1603e0     	mov	x0, x22
ffffffc080399124: b5fff6a8     	cbnz	x8, 0xffffffc080398ff8 <kmem_cache_alloc_noprof+0x1f0>
ffffffc080399128: 94346c2a     	bl	0xffffffc0810b41d0 <preempt_schedule_notrace>
ffffffc08039912c: aa1603e0     	mov	x0, x22
ffffffc080399130: 17ffffb2     	b	0xffffffc080398ff8 <kmem_cache_alloc_noprof+0x1f0>
ffffffc080399134: 9000f9a8     	adrp	x8, 0xffffffc0822cd000 <page_alloc_sysctl_table+0xa8>
ffffffc080399138: b94e8908     	ldr	w8, [x8, #0xe88]
ffffffc08039913c: 7100051f     	cmp	w8, #0x1
ffffffc080399140: 54ffe8aa     	b.ge	0xffffffc080398e54 <kmem_cache_alloc_noprof+0x4c>
ffffffc080399144: aa1303e0     	mov	x0, x19
ffffffc080399148: aa1703e1     	mov	x1, x23
ffffffc08039914c: 2a1403e2     	mov	w2, w20
ffffffc080399150: 940080f0     	bl	0xffffffc0803b9510 <__kfence_alloc>
ffffffc080399154: f90003e0     	str	x0, [sp]
ffffffc080399158: b4ffe800     	cbz	x0, 0xffffffc080398e58 <kmem_cache_alloc_noprof+0x50>
ffffffc08039915c: aa0003f6     	mov	x22, x0
ffffffc080399160: 2a1f03f8     	mov	w24, wzr
ffffffc080399164: 17ffff84     	b	0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080399168: f9402668     	ldr	x8, [x19, #0x48]
ffffffc08039916c: b5ffeec8     	cbnz	x8, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399170: 79401268     	ldrh	w8, [x19, #0x8]
ffffffc080399174: 52804089     	mov	w9, #0x204      // =516
ffffffc080399178: 6a09011f     	tst	w8, w9
ffffffc08039917c: 54ffee41     	b.ne	0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399180: b4ffee36     	cbz	x22, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399184: b9402a68     	ldr	w8, [x19, #0x28]
ffffffc080399188: b9405269     	ldr	w9, [x19, #0x50]
ffffffc08039918c: 6b09011f     	cmp	w8, w9
ffffffc080399190: 54ffeda2     	b.hs	0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399194: 9340dec9     	sbfx	x9, x22, #0, #56
ffffffc080399198: f828693f     	str	xzr, [x9, x8]
ffffffc08039919c: 17ffff6a     	b	0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc0803991a0: b9400a69     	ldr	w9, [x19, #0x8]
ffffffc0803991a4: 5280104a     	mov	w10, #0x82      // =130
ffffffc0803991a8: 6a0a013f     	tst	w9, w10
ffffffc0803991ac: 54ffeec0     	b.eq	0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991b0: 721d013f     	tst	w9, #0x8
ffffffc0803991b4: 1a970339     	csel	w25, w25, w23, eq
ffffffc0803991b8: 17ffff73     	b	0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991bc: 2a1f03f7     	mov	w23, wzr
ffffffc0803991c0: 17ffff73     	b	0xffffffc080398f8c <kmem_cache_alloc_noprof+0x184>
ffffffc0803991c4: aa1303e0     	mov	x0, x19
ffffffc0803991c8: aa1603e1     	mov	x1, x22
ffffffc0803991cc: 2a1403e2     	mov	w2, w20
ffffffc0803991d0: 94002714     	bl	0xffffffc0803a2e20 <__alloc_tagging_slab_alloc_hook>
ffffffc0803991d4: 17ffff83     	b	0xffffffc080398fe0 <kmem_cache_alloc_noprof+0x1d8>
ffffffc0803991d8: 77 48 22 d5  	.word	0xd5224877
ffffffc080398e08 <kmem_cache_alloc_noprof>:
ffffffc080398e08: d503233f     	paciasp
ffffffc080398e0c: d101c3ff     	sub	sp, sp, #0x70
ffffffc080398e10: a9017bfd     	stp	x29, x30, [sp, #0x10]
ffffffc080398e14: a9026ffc     	stp	x28, x27, [sp, #0x20]
ffffffc080398e18: a90367fa     	stp	x26, x25, [sp, #0x30]
ffffffc080398e1c: a9045ff8     	stp	x24, x23, [sp, #0x40]
ffffffc080398e20: a90557f6     	stp	x22, x21, [sp, #0x50]
ffffffc080398e24: a9064ff4     	stp	x20, x19, [sp, #0x60]
ffffffc080398e28: 910043fd     	add	x29, sp, #0x10
ffffffc080398e2c: d5384108     	mrs	x8, SP_EL0
ffffffc080398e30: aa0003f3     	mov	x19, x0
ffffffc080398e34: 2a0103f4     	mov	w20, w1
ffffffc080398e38: f9430908     	ldr	x8, [x8, #0x610]
ffffffc080398e3c: f90007e8     	str	x8, [sp, #0x8]
ffffffc080398e40: d50320ff     	xpaclri
ffffffc080398e44: aa1e03f5     	mov	x21, x30
ffffffc080398e48: b4000dc0     	cbz	x0, 0xffffffc080399000 <kmem_cache_alloc_noprof+0x1f8>
ffffffc080398e4c: b9401e77     	ldr	w23, [x19, #0x1c]
ffffffc080398e50: d503201f     	nop
ffffffc080398e54: f90003ff     	str	xzr, [sp]
ffffffc080398e58: d538411a     	mrs	x26, SP_EL0
ffffffc080398e5c: f9400268     	ldr	x8, [x19]
ffffffc080398e60: d538d089     	mrs	x9, TPIDR_EL1
ffffffc080398e64: 8b080128     	add	x8, x9, x8
ffffffc080398e68: f9400518     	ldr	x24, [x8, #0x8]
ffffffc080398e6c: f9400116     	ldr	x22, [x8]
ffffffc080398e70: f9400908     	ldr	x8, [x8, #0x10]
ffffffc080398e74: f10002df     	cmp	x22, #0x0
ffffffc080398e78: fa401904     	ccmp	x8, #0x0, #0x4, ne
ffffffc080398e7c: 54000e00     	b.eq	0xffffffc08039903c <kmem_cache_alloc_noprof+0x234>
ffffffc080398e80: d378dec8     	lsl	x8, x22, #8
ffffffc080398e84: b9402a69     	ldr	w9, [x19, #0x28]
ffffffc080398e88: f9405e6a     	ldr	x10, [x19, #0xb8]
ffffffc080398e8c: 91008303     	add	x3, x24, #0x20
ffffffc080398e90: 8b882128     	add	x8, x9, x8, asr #8
ffffffc080398e94: f9400109     	ldr	x9, [x8]
ffffffc080398e98: b940134b     	ldr	w11, [x26, #0x10]
ffffffc080398e9c: dac00d08     	rev	x8, x8
ffffffc080398ea0: ca080148     	eor	x8, x10, x8
ffffffc080398ea4: 1100056b     	add	w11, w11, #0x1
ffffffc080398ea8: ca090119     	eor	x25, x8, x9
ffffffc080398eac: b900134b     	str	w11, [x26, #0x10]
ffffffc080398eb0: f940026b     	ldr	x11, [x19]
ffffffc080398eb4: d538d08c     	mrs	x12, TPIDR_EL1
ffffffc080398eb8: 8b0b0184     	add	x4, x12, x11
ffffffc080398ebc: 14000015     	b	0xffffffc080398f10 <kmem_cache_alloc_noprof+0x108>
ffffffc080398ec0: aa1603e0     	mov	x0, x22
ffffffc080398ec4: aa1803e1     	mov	x1, x24
ffffffc080398ec8: aa1903e2     	mov	x2, x25
ffffffc080398ecc: 48207c82     	casp	x0, x1, x2, x3, [x4]
ffffffc080398ed0: f9400b48     	ldr	x8, [x26, #0x10]
ffffffc080398ed4: f1000508     	subs	x8, x8, #0x1
ffffffc080398ed8: b9001348     	str	w8, [x26, #0x10]
ffffffc080398edc: 540000e0     	b.eq	0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee0: f9400b48     	ldr	x8, [x26, #0x10]
ffffffc080398ee4: b40000a8     	cbz	x8, 0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee8: eb18003f     	cmp	x1, x24
ffffffc080398eec: fa560000     	ccmp	x0, x22, #0x0, eq
ffffffc080398ef0: 54000200     	b.eq	0xffffffc080398f30 <kmem_cache_alloc_noprof+0x128>
ffffffc080398ef4: 17ffffda     	b	0xffffffc080398e5c <kmem_cache_alloc_noprof+0x54>
ffffffc080398ef8: aa0103fb     	mov	x27, x1
ffffffc080398efc: aa0003fc     	mov	x28, x0
ffffffc080398f00: 94346d20     	bl	0xffffffc0810b4380 <preempt_schedule_notrace>
ffffffc080398f04: aa1c03e0     	mov	x0, x28
ffffffc080398f08: aa1b03e1     	mov	x1, x27
ffffffc080398f0c: 17fffff7     	b	0xffffffc080398ee8 <kmem_cache_alloc_noprof+0xe0>
ffffffc080398f10: f9800091     	prfm	pstl1strm, [x4]
ffffffc080398f14: c87f0480     	ldxp	x0, x1, [x4]
ffffffc080398f18: eb16001f     	cmp	x0, x22
ffffffc080398f1c: fa580020     	ccmp	x1, x24, #0x0, eq
ffffffc080398f20: 54000061     	b.ne	0xffffffc080398f2c <kmem_cache_alloc_noprof+0x124>
ffffffc080398f24: c8280c99     	stxp	w8, x25, x3, [x4]
ffffffc080398f28: 35ffff68     	cbnz	w8, 0xffffffc080398f14 <kmem_cache_alloc_noprof+0x10c>
ffffffc080398f2c: 17ffffe9     	b	0xffffffc080398ed0 <kmem_cache_alloc_noprof+0xc8>
ffffffc080398f30: b9402a68     	ldr	w8, [x19, #0x28]
ffffffc080398f34: 8b080328     	add	x8, x25, x8
ffffffc080398f38: f9800110     	prfm	pstl1keep, [x8]
ffffffc080398f3c: f90003f6     	str	x22, [sp]
ffffffc080398f40: d503201f     	nop
ffffffc080398f44: d503201f     	nop
ffffffc080398f48: f9402668     	ldr	x8, [x19, #0x48]
ffffffc080398f4c: b4000068     	cbz	x8, 0xffffffc080398f58 <kmem_cache_alloc_noprof+0x150>
ffffffc080398f50: 2a1f03f8     	mov	w24, wzr
ffffffc080398f54: 14000008     	b	0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f58: 79401268     	ldrh	w8, [x19, #0x8]
ffffffc080398f5c: 52804089     	mov	w9, #0x204      // =516
ffffffc080398f60: 6a09011f     	tst	w8, w9
ffffffc080398f64: 54000060     	b.eq	0xffffffc080398f70 <kmem_cache_alloc_noprof+0x168>
ffffffc080398f68: 53082298     	ubfx	w24, w20, #8, #1
ffffffc080398f6c: 14000002     	b	0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f70: 52800038     	mov	w24, #0x1       // =1
ffffffc080398f74: f000f308     	adrp	x8, 0xffffffc0821fb000 <max_load_balance_interval>
ffffffc080398f78: b9401e79     	ldr	w25, [x19, #0x1c]
ffffffc080398f7c: b9495908     	ldr	w8, [x8, #0x958]
ffffffc080398f80: d503201f     	nop
ffffffc080398f84: d503201f     	nop
ffffffc080398f88: 2a1803f7     	mov	w23, w24
ffffffc080398f8c: 14000007     	b	0xffffffc080398fa8 <kmem_cache_alloc_noprof+0x1a0>
ffffffc080398f90: 0a140102     	and	w2, w8, w20
ffffffc080398f94: aa1303e0     	mov	x0, x19
ffffffc080398f98: aa1603e1     	mov	x1, x22
ffffffc080398f9c: 2a1703e3     	mov	w3, w23
ffffffc080398fa0: 9400781d     	bl	0xffffffc0803b7014 <__kasan_slab_alloc>
ffffffc080398fa4: aa0003f6     	mov	x22, x0
ffffffc080398fa8: f10002df     	cmp	x22, #0x0
ffffffc080398fac: 52000308     	eor	w8, w24, #0x1
ffffffc080398fb0: f90003f6     	str	x22, [sp]
ffffffc080398fb4: 1a9f1508     	csinc	w8, w8, wzr, ne
ffffffc080398fb8: 37000128     	tbnz	w8, #0x0, 0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fbc: 34000077     	cbz	w23, 0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc0: 14000002     	b	0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc4: 14000006     	b	0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fc8: 2a1903e2     	mov	w2, w25
ffffffc080398fcc: aa1603e0     	mov	x0, x22
ffffffc080398fd0: 2a1f03e1     	mov	w1, wzr
ffffffc080398fd4: 94337ceb     	bl	0xffffffc081078380 <memset>
ffffffc080398fd8: f94003f6     	ldr	x22, [sp]
ffffffc080398fdc: aa1303e0     	mov	x0, x19
ffffffc080398fe0: aa1603e1     	mov	x1, x22
ffffffc080398fe4: 2a1403e2     	mov	w2, w20
ffffffc080398fe8: 940027d6     	bl	0xffffffc0803a2f40 <alloc_tagging_slab_alloc_hook>
ffffffc080398fec: 14000004     	b	0xffffffc080398ffc <kmem_cache_alloc_noprof+0x1f4>
ffffffc080398ff0: 37b00534     	tbnz	w20, #0x16, 0xffffffc080399094 <kmem_cache_alloc_noprof+0x28c>
ffffffc080398ff4: 39402668     	ldrb	w8, [x19, #0x9]
ffffffc080398ff8: 372804e8     	tbnz	w8, #0x5, 0xffffffc080399094 <kmem_cache_alloc_noprof+0x28c>
ffffffc080398ffc: f94003e0     	ldr	x0, [sp]
ffffffc080399000: d503201f     	nop
ffffffc080399004: d5384108     	mrs	x8, SP_EL0
ffffffc080399008: f9430908     	ldr	x8, [x8, #0x610]
ffffffc08039900c: f94007e9     	ldr	x9, [sp, #0x8]
ffffffc080399010: eb09011f     	cmp	x8, x9
ffffffc080399014: 54000581     	b.ne	0xffffffc0803990c4 <kmem_cache_alloc_noprof+0x2bc>
ffffffc080399018: a9464ff4     	ldp	x20, x19, [sp, #0x60]
ffffffc08039901c: a94557f6     	ldp	x22, x21, [sp, #0x50]
ffffffc080399020: a9445ff8     	ldp	x24, x23, [sp, #0x40]
ffffffc080399024: a94367fa     	ldp	x26, x25, [sp, #0x30]
ffffffc080399028: a9426ffc     	ldp	x28, x27, [sp, #0x20]
ffffffc08039902c: a9417bfd     	ldp	x29, x30, [sp, #0x10]
ffffffc080399030: 9101c3ff     	add	sp, sp, #0x70
ffffffc080399034: d50323bf     	autiasp
ffffffc080399038: d65f03c0     	ret
ffffffc08039903c: d5384118     	mrs	x24, SP_EL0
ffffffc080399040: b9401308     	ldr	w8, [x24, #0x10]
ffffffc080399044: aa1303e0     	mov	x0, x19
ffffffc080399048: 2a1403e1     	mov	w1, w20
ffffffc08039904c: 12800002     	mov	w2, #-0x1       // =-1
ffffffc080399050: aa1503e3     	mov	x3, x21
ffffffc080399054: 11000508     	add	w8, w8, #0x1
ffffffc080399058: 2a1703e5     	mov	w5, w23
ffffffc08039905c: b9001308     	str	w8, [x24, #0x10]
ffffffc080399060: f9400268     	ldr	x8, [x19]
ffffffc080399064: d538d089     	mrs	x9, TPIDR_EL1
ffffffc080399068: 8b080124     	add	x4, x9, x8
ffffffc08039906c: 94001645     	bl	0xffffffc08039e980 <___slab_alloc>
ffffffc080399070: aa0003f6     	mov	x22, x0
ffffffc080399074: f9400b08     	ldr	x8, [x24, #0x10]
ffffffc080399078: f1000508     	subs	x8, x8, #0x1
ffffffc08039907c: b9001308     	str	w8, [x24, #0x10]
ffffffc080399080: 54000060     	b.eq	0xffffffc08039908c <kmem_cache_alloc_noprof+0x284>
ffffffc080399084: f9400b08     	ldr	x8, [x24, #0x10]
ffffffc080399088: b5fff5a8     	cbnz	x8, 0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc08039908c: 943469ab     	bl	0xffffffc0810b3738 <preempt_schedule>
ffffffc080399090: 17ffffab     	b	0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc080399094: 910003e4     	mov	x4, sp
ffffffc080399098: aa1303e0     	mov	x0, x19
ffffffc08039909c: aa1f03e1     	mov	x1, xzr
ffffffc0803990a0: 2a1403e2     	mov	w2, w20
ffffffc0803990a4: 52800023     	mov	w3, #0x1        // =1
ffffffc0803990a8: 94010664     	bl	0xffffffc0803daa38 <__memcg_slab_post_alloc_hook>
ffffffc0803990ac: 3707fa80     	tbnz	w0, #0x0, 0xffffffc080398ffc <kmem_cache_alloc_noprof+0x1f4>
ffffffc0803990b0: f94003e1     	ldr	x1, [sp]
ffffffc0803990b4: aa1303e0     	mov	x0, x19
ffffffc0803990b8: 94002833     	bl	0xffffffc0803a3184 <memcg_alloc_abort_single>
ffffffc0803990bc: f90003ff     	str	xzr, [sp]
ffffffc0803990c0: 17ffffcf     	b	0xffffffc080398ffc <kmem_cache_alloc_noprof+0x1f4>
ffffffc0803990c4: 94345dc6     	bl	0xffffffc0810b07dc <__stack_chk_fail>
ffffffc0803990c8: d5384117     	mrs	x23, SP_EL0
ffffffc0803990cc: b9402ae8     	ldr	w8, [x23, #0x28]
ffffffc0803990d0: b000f30a     	adrp	x10, 0xffffffc0821fa000 <nf_conntrack_locks+0x500>
ffffffc0803990d4: 913ea14a     	add	x10, x10, #0xfa8
ffffffc0803990d8: d343fd09     	lsr	x9, x8, #3
ffffffc0803990dc: 927d6529     	and	x9, x9, #0x1ffffff8
ffffffc0803990e0: f8696949     	ldr	x9, [x10, x9]
ffffffc0803990e4: 9ac82528     	lsr	x8, x9, x8
ffffffc0803990e8: 3607f8e8     	tbz	w8, #0x0, 0xffffffc080399004 <kmem_cache_alloc_noprof+0x1fc>
ffffffc0803990ec: b94012e8     	ldr	w8, [x23, #0x10]
ffffffc0803990f0: aa0003f6     	mov	x22, x0
ffffffc0803990f4: aa1f03e0     	mov	x0, xzr
ffffffc0803990f8: aa1503e1     	mov	x1, x21
ffffffc0803990fc: aa1603e2     	mov	x2, x22
ffffffc080399100: aa1303e3     	mov	x3, x19
ffffffc080399104: 11000508     	add	w8, w8, #0x1
ffffffc080399108: 2a1403e4     	mov	w4, w20
ffffffc08039910c: 12800005     	mov	w5, #-0x1       // =-1
ffffffc080399110: b90012e8     	str	w8, [x23, #0x10]
ffffffc080399114: 97fea8be     	bl	0xffffffc08034340c <__traceiter_kmem_cache_alloc>
ffffffc080399118: f9400ae8     	ldr	x8, [x23, #0x10]
ffffffc08039911c: f1000508     	subs	x8, x8, #0x1
ffffffc080399120: b90012e8     	str	w8, [x23, #0x10]
ffffffc080399124: 54000080     	b.eq	0xffffffc080399134 <kmem_cache_alloc_noprof+0x32c>
ffffffc080399128: f9400ae8     	ldr	x8, [x23, #0x10]
ffffffc08039912c: aa1603e0     	mov	x0, x22
ffffffc080399130: b5fff6a8     	cbnz	x8, 0xffffffc080399004 <kmem_cache_alloc_noprof+0x1fc>
ffffffc080399134: 94346c93     	bl	0xffffffc0810b4380 <preempt_schedule_notrace>
ffffffc080399138: aa1603e0     	mov	x0, x22
ffffffc08039913c: 17ffffb2     	b	0xffffffc080399004 <kmem_cache_alloc_noprof+0x1fc>
ffffffc080399140: 9000f9a8     	adrp	x8, 0xffffffc0822cd000 <page_alloc_sysctl_table+0xa8>
ffffffc080399144: b94e8908     	ldr	w8, [x8, #0xe88]
ffffffc080399148: 7100051f     	cmp	w8, #0x1
ffffffc08039914c: 54ffe84a     	b.ge	0xffffffc080398e54 <kmem_cache_alloc_noprof+0x4c>
ffffffc080399150: aa1303e0     	mov	x0, x19
ffffffc080399154: aa1703e1     	mov	x1, x23
ffffffc080399158: 2a1403e2     	mov	w2, w20
ffffffc08039915c: 94008157     	bl	0xffffffc0803b96b8 <__kfence_alloc>
ffffffc080399160: f90003e0     	str	x0, [sp]
ffffffc080399164: b4ffe7a0     	cbz	x0, 0xffffffc080398e58 <kmem_cache_alloc_noprof+0x50>
ffffffc080399168: aa0003f6     	mov	x22, x0
ffffffc08039916c: 2a1f03f8     	mov	w24, wzr
ffffffc080399170: 17ffff81     	b	0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080399174: f9402668     	ldr	x8, [x19, #0x48]
ffffffc080399178: b5ffee68     	cbnz	x8, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc08039917c: 79401268     	ldrh	w8, [x19, #0x8]
ffffffc080399180: 52804089     	mov	w9, #0x204      // =516
ffffffc080399184: 6a09011f     	tst	w8, w9
ffffffc080399188: 54ffede1     	b.ne	0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc08039918c: b4ffedd6     	cbz	x22, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399190: b9402a68     	ldr	w8, [x19, #0x28]
ffffffc080399194: b9405269     	ldr	w9, [x19, #0x50]
ffffffc080399198: 6b09011f     	cmp	w8, w9
ffffffc08039919c: 54ffed42     	b.hs	0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc0803991a0: 9340dec9     	sbfx	x9, x22, #0, #56
ffffffc0803991a4: f828693f     	str	xzr, [x9, x8]
ffffffc0803991a8: 17ffff67     	b	0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc0803991ac: b9400a69     	ldr	w9, [x19, #0x8]
ffffffc0803991b0: 5280104a     	mov	w10, #0x82      // =130
ffffffc0803991b4: 6a0a013f     	tst	w9, w10
ffffffc0803991b8: 54ffee60     	b.eq	0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991bc: 721d013f     	tst	w9, #0x8
ffffffc0803991c0: 1a970339     	csel	w25, w25, w23, eq
ffffffc0803991c4: 17ffff70     	b	0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991c8: 2a1f03f7     	mov	w23, wzr
ffffffc0803991cc: 17ffff70     	b	0xffffffc080398f8c <kmem_cache_alloc_noprof+0x184>
ffffffc0803991d0: 77 48 22 d5  	.word	0xd5224877
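
To make the pattern easier to spot in the listing above: the standalone
d503201f nops are patchable static-branch sites, left as nops while the
key is disabled, and the out-of-line blocks that marshal arguments and
bl into the hooks are the cold paths this patch moves out of the inlined
fast path. As a reading aid, here is a condensed sketch of the shape of
the change, taken from the patch below (hook bodies elided; only the
inline/noinline split and the static-key gate are shown):

#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
#define inline_if_mem_alloc_prof	inline
#else
#define inline_if_mem_alloc_prof	noinline
#endif

/*
 * Cold half: uninlined when profiling is default-off, so its body stays
 * out of the fast path's instruction-cache footprint.
 */
static inline_if_mem_alloc_prof void
__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab,
			       void **p, int objects)
{
	/* ... obj_exts lookup and per-object alloc_tag_sub() accounting ... */
}

/*
 * Hot half: always inlined. The disabled case costs a static-branch nop;
 * the enabled case pays one extra function call.
 */
static inline void
alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab,
			     void **p, int objects)
{
	if (mem_alloc_profiling_enabled())
		__alloc_tagging_slab_free_hook(s, slab, p, objects);
}

The wrapper keeps the same name and stays inline in both configurations;
only the linkage of the double-underscore helper changes with the macro.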

Patch

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index a946e0203e6d..c5de2a0c1780 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -116,6 +116,12 @@  DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
 			mem_alloc_profiling_key);
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+#define inline_if_mem_alloc_prof	inline
+#else
+#define inline_if_mem_alloc_prof	noinline
+#endif
+
 static inline bool mem_alloc_profiling_enabled(void)
 {
 	return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
diff --git a/mm/slub.c b/mm/slub.c
index 996691c137eb..3107d43dfddc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2000,7 +2000,7 @@  int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 	return 0;
 }
 
-static inline void free_slab_obj_exts(struct slab *slab)
+static inline_if_mem_alloc_prof void free_slab_obj_exts(struct slab *slab)
 {
 	struct slabobj_ext *obj_exts;
 
@@ -2077,33 +2077,35 @@  prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
 	return slab_obj_exts(slab) + obj_to_index(s, slab, p);
 }
 
-static inline void
-alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+static inline_if_mem_alloc_prof void
+__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 {
-	if (need_slab_obj_ext()) {
-		struct slabobj_ext *obj_exts;
+	struct slabobj_ext *obj_exts;
 
-		obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
-		/*
-		 * Currently obj_exts is used only for allocation profiling.
-		 * If other users appear then mem_alloc_profiling_enabled()
-		 * check should be added before alloc_tag_add().
-		 */
-		if (likely(obj_exts))
-			alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
-	}
+	obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
+	/*
+	 * Currently obj_exts is used only for allocation profiling.
+	 * If other users appear then mem_alloc_profiling_enabled()
+	 * check should be added before alloc_tag_add().
+	 */
+	if (likely(obj_exts))
+		alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
 }
 
 static inline void
-alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+{
+	if (need_slab_obj_ext())
+		__alloc_tagging_slab_alloc_hook(s, object, flags);
+}
+
+static inline_if_mem_alloc_prof void
+__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 			     int objects)
 {
 	struct slabobj_ext *obj_exts;
 	int i;
 
-	if (!mem_alloc_profiling_enabled())
-		return;
-
 	/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
 	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
 		return;
@@ -2119,6 +2121,14 @@  alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 	}
 }
 
+static inline void
+alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+			     int objects)
+{
+	if (mem_alloc_profiling_enabled())
+		__alloc_tagging_slab_free_hook(s, slab, p, objects);
+}
+
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void