diff mbox series

NUMA: Early use of cpu_to_node() returns 0 instead of the correct node id

Message ID 20240119033227.14113-1-shijie@os.amperecomputing.com (mailing list archive)
State Superseded
Headers show
Series NUMA: Early use of cpu_to_node() returns 0 instead of the correct node id | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR success PR summary
conchuod/patch-1-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-1-test-2 success .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-1-test-3 success .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-1-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-1-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-1-test-6 warning .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-1-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-1-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-1-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-1-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-1-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-1-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Huang Shijie Jan. 19, 2024, 3:32 a.m. UTC
During the kernel booting, the generic cpu_to_node() is called too early in
arm64, powerpc and riscv when CONFIG_NUMA is enabled.

There are at least four places in the common code where
the generic cpu_to_node() is called before it is initialized:
	   1.) early_trace_init()         in kernel/trace/trace.c
	   2.) sched_init()               in kernel/sched/core.c
	   3.) init_sched_fair_class()    in kernel/sched/fair.c
	   4.) workqueue_init_early()     in kernel/workqueue.c

In order to fix the bug, the patch changes generic cpu_to_node to
function pointer, and export it for kernel modules.
Introduce smp_prepare_boot_cpu_start() to wrap the original
smp_prepare_boot_cpu(), and set cpu_to_node with early_cpu_to_node.
Introduce smp_prepare_cpus_done() to wrap the original smp_prepare_cpus(),
and set the cpu_to_node to formal _cpu_to_node().

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 drivers/base/arch_numa.c | 11 +++++++++++
 include/linux/topology.h |  6 ++----
 init/main.c              | 29 +++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 6 deletions(-)

Comments

Yury Norov Jan. 19, 2024, 4:42 a.m. UTC | #1
On Fri, Jan 19, 2024 at 11:32:27AM +0800, Huang Shijie wrote:
> hZ7bkEvc+Z19RHkS/HVG3KMg
> X-MS-Exchange-Transport-CrossTenantHeadersStamped: DM8PR01MB7144
> Status: O
> Content-Length: 3779
> Lines: 126
> 
> During the kernel booting, the generic cpu_to_node() is called too early in
> arm64, powerpc and riscv when CONFIG_NUMA is enabled.
> 
> There are at least four places in the common code where
> the generic cpu_to_node() is called before it is initialized:
> 	   1.) early_trace_init()         in kernel/trace/trace.c
> 	   2.) sched_init()               in kernel/sched/core.c
> 	   3.) init_sched_fair_class()    in kernel/sched/fair.c
> 	   4.) workqueue_init_early()     in kernel/workqueue.c
> 
> In order to fix the bug, the patch changes generic cpu_to_node to
> function pointer, and export it for kernel modules.
> Introduce smp_prepare_boot_cpu_start() to wrap the original
> smp_prepare_boot_cpu(), and set cpu_to_node with early_cpu_to_node.
> Introduce smp_prepare_cpus_done() to wrap the original smp_prepare_cpus(),
> and set the cpu_to_node to formal _cpu_to_node().

This adds another level of indirection, I think. Currently cpu_to_node
is a simple inliner. After the patch it would be a real function with
all the associate overhead. Can you share a bloat-o-meter output here?

Regardless, I don't think that the approach is correct. As per your
description, some initialization functions erroneously call
cpu_to_node() instead of early_cpu_to_node() which exists specifically
for that case.

If the above correct, it's clearly a caller problem, and the fix is to
simply switch all those callers to use early version.

I would also initialize the numa_node with NUMA_NO_NODE at declaration,
so that if someone calls cpu_to_node() before the variable is properly
initialized at runtime, he'll get NO_NODE, which is obviously an error.

Thanks,
Yury
 
> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
>  drivers/base/arch_numa.c | 11 +++++++++++
>  include/linux/topology.h |  6 ++----
>  init/main.c              | 29 +++++++++++++++++++++++++++--
>  3 files changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
> index 5b59d133b6af..867a477fa975 100644
> --- a/drivers/base/arch_numa.c
> +++ b/drivers/base/arch_numa.c
> @@ -61,6 +61,17 @@ EXPORT_SYMBOL(cpumask_of_node);
>  
>  #endif
>  
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> +int _cpu_to_node(int cpu)
> +{
> +	return per_cpu(numa_node, cpu);
> +}
> +int (*cpu_to_node)(int cpu);
> +EXPORT_SYMBOL(cpu_to_node);
> +#endif
> +#endif
> +
>  static void numa_update_cpu(unsigned int cpu, bool remove)
>  {
>  	int nid = cpu_to_node(cpu);
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 52f5850730b3..e7ce2bae11dd 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -91,10 +91,8 @@ static inline int numa_node_id(void)
>  #endif
>  
>  #ifndef cpu_to_node
> -static inline int cpu_to_node(int cpu)
> -{
> -	return per_cpu(numa_node, cpu);
> -}
> +extern int (*cpu_to_node)(int cpu);
> +extern int _cpu_to_node(int cpu);
>  #endif
>  
>  #ifndef set_numa_node
> diff --git a/init/main.c b/init/main.c
> index e24b0780fdff..b142e9c51161 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -870,6 +870,18 @@ static void __init print_unknown_bootoptions(void)
>  	memblock_free(unknown_options, len);
>  }
>  
> +static void __init smp_prepare_boot_cpu_start(void)
> +{
> +	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
> +
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> +	/* The early_cpu_to_node should be ready now. */
> +	cpu_to_node = early_cpu_to_node;
> +#endif
> +#endif
> +}
> +
>  asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
>  void start_kernel(void)
>  {
> @@ -899,7 +911,7 @@ void start_kernel(void)
>  	setup_command_line(command_line);
>  	setup_nr_cpu_ids();
>  	setup_per_cpu_areas();
> -	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
> +	smp_prepare_boot_cpu_start();
>  	boot_cpu_hotplug_init();
>  
>  	pr_notice("Kernel command line: %s\n", saved_command_line);
> @@ -1519,6 +1531,19 @@ void __init console_on_rootfs(void)
>  	fput(file);
>  }
>  
> +static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
> +{
> +	/* Different ARCHs may override smp_prepare_cpus() */
> +	smp_prepare_cpus(setup_max_cpus);
> +
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> +	/* Change to the formal function. */
> +	cpu_to_node = _cpu_to_node;
> +#endif
> +#endif
> +}
> +
>  static noinline void __init kernel_init_freeable(void)
>  {
>  	/* Now the scheduler is fully set up and can do blocking allocations */
> @@ -1531,7 +1556,7 @@ static noinline void __init kernel_init_freeable(void)
>  
>  	cad_pid = get_pid(t
Greg KH Jan. 19, 2024, 5:35 a.m. UTC | #2
On Fri, Jan 19, 2024 at 11:32:27AM +0800, Huang Shijie wrote:
> During the kernel booting, the generic cpu_to_node() is called too early in
> arm64, powerpc and riscv when CONFIG_NUMA is enabled.
> 
> There are at least four places in the common code where
> the generic cpu_to_node() is called before it is initialized:
> 	   1.) early_trace_init()         in kernel/trace/trace.c
> 	   2.) sched_init()               in kernel/sched/core.c
> 	   3.) init_sched_fair_class()    in kernel/sched/fair.c
> 	   4.) workqueue_init_early()     in kernel/workqueue.c
> 
> In order to fix the bug, the patch changes generic cpu_to_node to
> function pointer, and export it for kernel modules.
> Introduce smp_prepare_boot_cpu_start() to wrap the original
> smp_prepare_boot_cpu(), and set cpu_to_node with early_cpu_to_node.
> Introduce smp_prepare_cpus_done() to wrap the original smp_prepare_cpus(),
> and set the cpu_to_node to formal _cpu_to_node().
> 
> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
>  drivers/base/arch_numa.c | 11 +++++++++++
>  include/linux/topology.h |  6 ++----
>  init/main.c              | 29 +++++++++++++++++++++++++++--
>  3 files changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
> index 5b59d133b6af..867a477fa975 100644
> --- a/drivers/base/arch_numa.c
> +++ b/drivers/base/arch_numa.c
> @@ -61,6 +61,17 @@ EXPORT_SYMBOL(cpumask_of_node);
>  
>  #endif
>  
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> +int _cpu_to_node(int cpu)
> +{
> +	return per_cpu(numa_node, cpu);
> +}
> +int (*cpu_to_node)(int cpu);
> +EXPORT_SYMBOL(cpu_to_node);
> +#endif
> +#endif
> +
>  static void numa_update_cpu(unsigned int cpu, bool remove)
>  {
>  	int nid = cpu_to_node(cpu);
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 52f5850730b3..e7ce2bae11dd 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -91,10 +91,8 @@ static inline int numa_node_id(void)
>  #endif
>  
>  #ifndef cpu_to_node
> -static inline int cpu_to_node(int cpu)
> -{
> -	return per_cpu(numa_node, cpu);
> -}
> +extern int (*cpu_to_node)(int cpu);
> +extern int _cpu_to_node(int cpu);
>  #endif
>  
>  #ifndef set_numa_node
> diff --git a/init/main.c b/init/main.c
> index e24b0780fdff..b142e9c51161 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -870,6 +870,18 @@ static void __init print_unknown_bootoptions(void)
>  	memblock_free(unknown_options, len);
>  }
>  
> +static void __init smp_prepare_boot_cpu_start(void)
> +{
> +	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
> +
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> +	/* The early_cpu_to_node should be ready now. */
> +	cpu_to_node = early_cpu_to_node;
> +#endif
> +#endif
> +}
> +
>  asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
>  void start_kernel(void)
>  {
> @@ -899,7 +911,7 @@ void start_kernel(void)
>  	setup_command_line(command_line);
>  	setup_nr_cpu_ids();
>  	setup_per_cpu_areas();
> -	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
> +	smp_prepare_boot_cpu_start();
>  	boot_cpu_hotplug_init();
>  
>  	pr_notice("Kernel command line: %s\n", saved_command_line);
> @@ -1519,6 +1531,19 @@ void __init console_on_rootfs(void)
>  	fput(file);
>  }
>  
> +static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
> +{
> +	/* Different ARCHs may override smp_prepare_cpus() */
> +	smp_prepare_cpus(setup_max_cpus);
> +
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> +	/* Change to the formal function. */
> +	cpu_to_node = _cpu_to_node;
> +#endif
> +#endif
> +}
> +
>  static noinline void __init kernel_init_freeable(void)
>  {
>  	/* Now the scheduler is fully set up and can do blocking allocations */
> @@ -1531,7 +1556,7 @@ static noinline void __init kernel_init_freeable(void)
>  
>  	cad_pid = get_pid(task_pid(current));
>  
> -	smp_prepare_cpus(setup_max_cpus);
> +	smp_prepare_cpus_done(setup_max_cpus);
>  
>  	workqueue_init();
>  
> -- 
> 2.40.1
> 

Hi,

This is the friendly patch-bot of Greg Kroah-Hartman.  You have sent him
a patch that has triggered this response.  He used to manually respond
to these common problems, but in order to save his sanity (he kept
writing the same thing over and over, yet to different people), I was
created.  Hopefully you will not take offence and will fix the problem
in your patch and resubmit it so that it can be accepted into the Linux
kernel tree.

You are receiving this message because of the following common error(s)
as indicated below:

- This looks like a new version of a previously submitted patch, but you
  did not list below the --- line any changes from the previous version.
  Please read the section entitled "The canonical patch format" in the
  kernel file, Documentation/process/submitting-patches.rst for what
  needs to be done here to properly describe this.

If you wish to discuss this problem further, or you have questions about
how to resolve this issue, please feel free to respond to this email and
Greg will reply once he has dug out from the pending patches received
from other developers.

thanks,

greg k-h's patch email bot
Shijie Huang Jan. 19, 2024, 6:46 a.m. UTC | #3
在 2024/1/19 12:42, Yury Norov 写道:
> This adds another level of indirection, I think. Currently cpu_to_node
> is a simple inliner. After the patch it would be a real function with
> all the associate overhead. Can you share a bloat-o-meter output here?
#./scripts/bloat-o-meter vmlinux vmlinux.new
add/remove: 6/1 grow/shrink: 61/51 up/down: 1168/-588 (580)
Function                                     old     new   delta
numa_update_cpu                              148     244     +96

  ...................................................................................................................................(to many to skip)

Total: Before=32990130, After=32990710, chg +0.00%


>
> Regardless, I don't think that the approach is correct. As per your
> description, some initialization functions erroneously call
> cpu_to_node() instead of early_cpu_to_node() which exists specifically
> for that case.
>
> If the above correct, it's clearly a caller problem, and the fix is to
> simply switch all those callers to use early version.

It is easy to change to early_cpu_to_node() for sched_init(), 
init_sched_fair_class()

and workqueue_init_early(). These three places call the cpu_to_node() in 
the __init function.


But it is a little hard to change the early_trace_init(), since it calls 
cpu_to_node in the deep

function stack:

   early_trace_init() --> ring_buffer_alloc() -->rb_allocate_cpu_buffer()


For early_trace_init(), we need to change more code.


Anyway, If we think it is not a good idea to change the common code, I 
am oaky too.


>
> I would also initialize the numa_node with NUMA_NO_NODE at declaration,
> so that if someone calls cpu_to_node() before the variable is properly
> initialized at runtime, he'll get NO_NODE, which is obviously an error.

Even we set the numa_node with NUMA_NO_NODE, it does not always produce 
error.

Please see the alloc_pages_node().


Thanks

Huang Shijie
Shijie Huang Jan. 19, 2024, 7:02 a.m. UTC | #4
在 2024/1/19 14:46, Shijie Huang 写道:
>
> 在 2024/1/19 12:42, Yury Norov 写道:
>> This adds another level of indirection, I think. Currently cpu_to_node
>> is a simple inliner. After the patch it would be a real function with
>> all the associate overhead. Can you share a bloat-o-meter output here?
> #./scripts/bloat-o-meter vmlinux vmlinux.new
> add/remove: 6/1 grow/shrink: 61/51 up/down: 1168/-588 (580)
> Function                                     old     new   delta
> numa_update_cpu                              148     244     +96
>
>  ...................................................................................................................................(to 
> many to skip)
>
> Total: Before=32990130, After=32990710, chg +0.00%
>
>
>>
>> Regardless, I don't think that the approach is correct. As per your
>> description, some initialization functions erroneously call
>> cpu_to_node() instead of early_cpu_to_node() which exists specifically
>> for that case.

sorry, I missed something.

I am not sure if  the early_cpu_to_node() works on all ARCHs.


Thanks

Huang Shijie

>>
>> If the above correct, it's clearly a caller problem, and the fix is to
>> simply switch all those callers to use early version.
>
> It is easy to change to early_cpu_to_node() for sched_init(), 
> init_sched_fair_class()
>
> and workqueue_init_early(). These three places call the cpu_to_node() 
> in the __init function.
>
>
> But it is a little hard to change the early_trace_init(), since it 
> calls cpu_to_node in the deep
>
> function stack:
>
>   early_trace_init() --> ring_buffer_alloc() -->rb_allocate_cpu_buffer()
>
>
> For early_trace_init(), we need to change more code.
>
>
> Anyway, If we think it is not a good idea to change the common code, I 
> am oaky too.
>
>
>>
>> I would also initialize the numa_node with NUMA_NO_NODE at declaration,
>> so that if someone calls cpu_to_node() before the variable is properly
>> initialized at runtime, he'll get NO_NODE, which is obviously an error.
>
> Even we set the numa_node with NUMA_NO_NODE, it does not always 
> produce error.
>
> Please see the alloc_pages_node().
>
>
> Thanks
>
> Huang Shijie
>
Shijie Huang Jan. 19, 2024, 7:42 a.m. UTC | #5
在 2024/1/19 12:42, Yury Norov 写道:
> Regardless, I don't think that the approach is correct. As per your
> description, some initialization functions erroneously call
> cpu_to_node() instead of early_cpu_to_node() which exists specifically
> for that case.

I checked the code again.

The sparc, mips and s390 (which support the NUMA) do not support 
early_cpu_to_node().

So we cannot use early_cpu_to_node() for these functions.


Thanks

Huang Shijie
Mike Rapoport Jan. 19, 2024, 8:42 a.m. UTC | #6
On Fri, Jan 19, 2024 at 02:46:16PM +0800, Shijie Huang wrote:
> 
> 在 2024/1/19 12:42, Yury Norov 写道:
> > This adds another level of indirection, I think. Currently cpu_to_node
> > is a simple inliner. After the patch it would be a real function with
> > all the associate overhead. Can you share a bloat-o-meter output here?
> #./scripts/bloat-o-meter vmlinux vmlinux.new
> add/remove: 6/1 grow/shrink: 61/51 up/down: 1168/-588 (580)
> Function                                     old     new   delta
> numa_update_cpu                              148     244     +96
> 
>  ...................................................................................................................................(to many to skip)
> 
> Total: Before=32990130, After=32990710, chg +0.00%
 
It's not only about text size, the indirect call also hurts performance
 
> > 
> > Regardless, I don't think that the approach is correct. As per your
> > description, some initialization functions erroneously call
> > cpu_to_node() instead of early_cpu_to_node() which exists specifically
> > for that case.
> > 
> > If the above correct, it's clearly a caller problem, and the fix is to
> > simply switch all those callers to use early version.
> 
> It is easy to change to early_cpu_to_node() for sched_init(),
> init_sched_fair_class()
> 
> and workqueue_init_early(). These three places call the cpu_to_node() in the
> __init function.
> 
> 
> But it is a little hard to change the early_trace_init(), since it calls
> cpu_to_node in the deep
> 
> function stack:
> 
>   early_trace_init() --> ring_buffer_alloc() -->rb_allocate_cpu_buffer()
> 
> 
> For early_trace_init(), we need to change more code.
> 
> 
> Anyway, If we think it is not a good idea to change the common code, I am
> oaky too.
 
Is there a fundamental reason to have early_cpu_to_node() at all?
It seems that all the mappings are known by the end of setup_arch() and the
initialization of numa_node can be moved earlier. 
 
> > I would also initialize the numa_node with NUMA_NO_NODE at declaration,
> > so that if someone calls cpu_to_node() before the variable is properly
> > initialized at runtime, he'll get NO_NODE, which is obviously an error.
> 
> Even we set the numa_node with NUMA_NO_NODE, it does not always produce
> error.
> 
> Please see the alloc_pages_node().
> 
> 
> Thanks
> 
> Huang Shijie
>
Shijie Huang Jan. 19, 2024, 8:50 a.m. UTC | #7
在 2024/1/19 16:42, Mike Rapoport 写道:
> On Fri, Jan 19, 2024 at 02:46:16PM +0800, Shijie Huang wrote:
>> 在 2024/1/19 12:42, Yury Norov 写道:
>>> This adds another level of indirection, I think. Currently cpu_to_node
>>> is a simple inliner. After the patch it would be a real function with
>>> all the associate overhead. Can you share a bloat-o-meter output here?
>> #./scripts/bloat-o-meter vmlinux vmlinux.new
>> add/remove: 6/1 grow/shrink: 61/51 up/down: 1168/-588 (580)
>> Function                                     old     new   delta
>> numa_update_cpu                              148     244     +96
>>
>>   ...................................................................................................................................(to many to skip)
>>
>> Total: Before=32990130, After=32990710, chg +0.00%
>   
> It's not only about text size, the indirect call also hurts performance
>   

The cpu_to_node() is called at very low frequency, most of the times is 
in the kernel booting time.



>>> Regardless, I don't think that the approach is correct. As per your
>>> description, some initialization functions erroneously call
>>> cpu_to_node() instead of early_cpu_to_node() which exists specifically
>>> for that case.
>>>
>>> If the above correct, it's clearly a caller problem, and the fix is to
>>> simply switch all those callers to use early version.
>> It is easy to change to early_cpu_to_node() for sched_init(),
>> init_sched_fair_class()
>>
>> and workqueue_init_early(). These three places call the cpu_to_node() in the
>> __init function.
>>
>>
>> But it is a little hard to change the early_trace_init(), since it calls
>> cpu_to_node in the deep
>>
>> function stack:
>>
>>    early_trace_init() --> ring_buffer_alloc() -->rb_allocate_cpu_buffer()
>>
>>
>> For early_trace_init(), we need to change more code.
>>
>>
>> Anyway, If we think it is not a good idea to change the common code, I am
>> oaky too.
>   
> Is there a fundamental reason to have early_cpu_to_node() at all?

The early_cpu_to_node does not work on some ARCHs (which support the 
NUMA), such

as  SPARC, MIPS and S390.


Thanks

Huang Shijie

> It seems that all the mappings are known by the end of setup_arch() and the
> initialization of numa_node can be moved earlier.
>   
>>> I would also initialize the numa_node with NUMA_NO_NODE at declaration,
>>> so that if someone calls cpu_to_node() before the variable is properly
>>> initialized at runtime, he'll get NO_NODE, which is obviously an error.
>> Even we set the numa_node with NUMA_NO_NODE, it does not always produce
>> error.
>>
>> Please see the alloc_pages_node().
>>
>>
>> Thanks
>>
>> Huang Shijie
>>
kernel test robot Jan. 19, 2024, 4:32 p.m. UTC | #8
Hi Huang,

kernel test robot noticed the following build errors:

[auto build test ERROR on driver-core/driver-core-testing]
[also build test ERROR on driver-core/driver-core-next driver-core/driver-core-linus akpm-mm/mm-everything linus/master v6.7 next-20240119]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Huang-Shijie/NUMA-Early-use-of-cpu_to_node-returns-0-instead-of-the-correct-node-id/20240119-113623
base:   driver-core/driver-core-testing
patch link:    https://lore.kernel.org/r/20240119033227.14113-1-shijie%40os.amperecomputing.com
patch subject: [PATCH] NUMA: Early use of cpu_to_node() returns 0 instead of the correct node id
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20240120/202401200006.wOMN1YgH-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240120/202401200006.wOMN1YgH-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401200006.wOMN1YgH-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: vmlinux.o: in function `rapl_cpu_online':
>> rapl.c:(.text+0x75ec): undefined reference to `cpu_to_node'
   ld: vmlinux.o: in function `amd_pmu_cpu_prepare':
>> core.c:(.text+0x8580): undefined reference to `cpu_to_node'
>> ld: core.c:(.text+0x85d6): undefined reference to `cpu_to_node'
   ld: vmlinux.o: in function `amd_uncore_ctx_init.part.0':
>> uncore.c:(.text+0xc3bd): undefined reference to `cpu_to_node'
   ld: vmlinux.o: in function `intel_cpuc_prepare':
>> (.text+0x129ee): undefined reference to `cpu_to_node'
   ld: vmlinux.o:(.text+0x12a71): more undefined references to `cpu_to_node' follow
   ld: vmlinux.o: in function `kernel_init_freeable':
>> main.c:(.init.text+0x1240): undefined reference to `_cpu_to_node'
   ld: vmlinux.o: in function `check_timer':
>> io_apic.c:(.init.text+0x1ec1d): undefined reference to `cpu_to_node'
   ld: vmlinux.o: in function `kvm_alloc_cpumask':
>> kvm.c:(.init.text+0x21678): undefined reference to `cpu_to_node'
   ld: vmlinux.o: in function `fork_idle':
>> (.init.text+0x28fe2): undefined reference to `cpu_to_node'
   ld: vmlinux.o: in function `cpus_share_numa':
>> workqueue.c:(.init.text+0x2a4c8): undefined reference to `cpu_to_node'
>> ld: workqueue.c:(.init.text+0x2a4d8): undefined reference to `cpu_to_node'
   ld: vmlinux.o:workqueue.c:(.init.text+0x2a7bc): more undefined references to `cpu_to_node' follow
Yury Norov Jan. 19, 2024, 6:02 p.m. UTC | #9
On Fri, Jan 19, 2024 at 04:50:53PM +0800, Shijie Huang wrote:
> 
> 在 2024/1/19 16:42, Mike Rapoport 写道:
> > On Fri, Jan 19, 2024 at 02:46:16PM +0800, Shijie Huang wrote:
> > > 在 2024/1/19 12:42, Yury Norov 写道:
> > > > This adds another level of indirection, I think. Currently cpu_to_node
> > > > is a simple inliner. After the patch it would be a real function with
> > > > all the associate overhead. Can you share a bloat-o-meter output here?
> > > #./scripts/bloat-o-meter vmlinux vmlinux.new
> > > add/remove: 6/1 grow/shrink: 61/51 up/down: 1168/-588 (580)
> > > Function                                     old     new   delta
> > > numa_update_cpu                              148     244     +96
> > > 
> > >   ...................................................................................................................................(to many to skip)
> > > 
> > > Total: Before=32990130, After=32990710, chg +0.00%
> > It's not only about text size, the indirect call also hurts performance
> 
> The cpu_to_node() is called at very low frequency, most of the times is in
> the kernel booting time.
 
That doesn't matter. This function is a simple inliner that dereferences
a pointer, and I believe all of us want to keep it simple. 
 
> > > > Regardless, I don't think that the approach is correct. As per your
> > > > description, some initialization functions erroneously call
> > > > cpu_to_node() instead of early_cpu_to_node() which exists specifically
> > > > for that case.
> > > > 
> > > > If the above correct, it's clearly a caller problem, and the fix is to
> > > > simply switch all those callers to use early version.
> > > It is easy to change to early_cpu_to_node() for sched_init(),
> > > init_sched_fair_class()
> > > 
> > > and workqueue_init_early(). These three places call the cpu_to_node() in the
> > > __init function.
> > > 
> > > 
> > > But it is a little hard to change the early_trace_init(), since it calls
> > > cpu_to_node in the deep
> > > 
> > > function stack:
> > > 
> > >    early_trace_init() --> ring_buffer_alloc() -->rb_allocate_cpu_buffer()
> > > 
> > > 
> > > For early_trace_init(), we need to change more code.
> > > 
> > > 
> > > Anyway, If we think it is not a good idea to change the common code, I am
> > > oaky too.
> > Is there a fundamental reason to have early_cpu_to_node() at all?
> 
> The early_cpu_to_node does not work on some ARCHs (which support the NUMA),
> such
> 
> as  SPARC, MIPS and S390.

So, your approach wouldn't work either, right? I think you've got a
testing bot report on it already...

You can make it like this:

  #ifdef CONFIG_ARCH_NO_EARLY_CPU_TO_NODE
  #define early_cpu_to_node cpu_to_node
  #endif
 
> > It seems that all the mappings are known by the end of setup_arch() and the
> > initialization of numa_node can be moved earlier.
> > > > I would also initialize the numa_node with NUMA_NO_NODE at declaration,
> > > > so that if someone calls cpu_to_node() before the variable is properly
> > > > initialized at runtime, he'll get NO_NODE, which is obviously an error.
> > > Even we set the numa_node with NUMA_NO_NODE, it does not always produce
> > > error.

You can print this error yourself:

  #ifndef cpu_to_node
  static inline int cpu_to_node(int cpu)
  {
        int node = per_cpu(numa_node, cpu);

  #ifdef CONFIG_DEBUG_PER_CPU_MAPS
        if (node == NUMA_NO_NODE)
                pr_err(...);
  #endif

          return node;
  }
  #endif
Shijie Huang Jan. 22, 2024, 7:32 a.m. UTC | #10
在 2024/1/20 2:02, Yury Norov 写道:
> [EXTERNAL EMAIL NOTICE: This email originated from an external sender. Please be mindful of safe email handling and proprietary information protection practices.]
>
>
> On Fri, Jan 19, 2024 at 04:50:53PM +0800, Shijie Huang wrote:
>> 在 2024/1/19 16:42, Mike Rapoport 写道:
>>> On Fri, Jan 19, 2024 at 02:46:16PM +0800, Shijie Huang wrote:
>>>> 在 2024/1/19 12:42, Yury Norov 写道:
>>>>> This adds another level of indirection, I think. Currently cpu_to_node
>>>>> is a simple inliner. After the patch it would be a real function with
>>>>> all the associate overhead. Can you share a bloat-o-meter output here?
>>>> #./scripts/bloat-o-meter vmlinux vmlinux.new
>>>> add/remove: 6/1 grow/shrink: 61/51 up/down: 1168/-588 (580)
>>>> Function                                     old     new   delta
>>>> numa_update_cpu                              148     244     +96
>>>>
>>>>    ...................................................................................................................................(to many to skip)
>>>>
>>>> Total: Before=32990130, After=32990710, chg +0.00%
>>> It's not only about text size, the indirect call also hurts performance
>> The cpu_to_node() is called at very low frequency, most of the times is in
>> the kernel booting time.
> That doesn't matter. This function is a simple inliner that dereferences
> a pointer, and I believe all of us want to keep it simple.

Yes. I agree.

I also want to keep it simple too.


>>>>> Regardless, I don't think that the approach is correct. As per your
>>>>> description, some initialization functions erroneously call
>>>>> cpu_to_node() instead of early_cpu_to_node() which exists specifically
>>>>> for that case.
>>>>>
>>>>> If the above correct, it's clearly a caller problem, and the fix is to
>>>>> simply switch all those callers to use early version.
>>>> It is easy to change to early_cpu_to_node() for sched_init(),
>>>> init_sched_fair_class()
>>>>
>>>> and workqueue_init_early(). These three places call the cpu_to_node() in the
>>>> __init function.
>>>>
>>>>
>>>> But it is a little hard to change the early_trace_init(), since it calls
>>>> cpu_to_node in the deep
>>>>
>>>> function stack:
>>>>
>>>>     early_trace_init() --> ring_buffer_alloc() -->rb_allocate_cpu_buffer()
>>>>
>>>>
>>>> For early_trace_init(), we need to change more code.
>>>>
>>>>
>>>> Anyway, If we think it is not a good idea to change the common code, I am
>>>> oaky too.
>>> Is there a fundamental reason to have early_cpu_to_node() at all?
>> The early_cpu_to_node does not work on some ARCHs (which support the NUMA),
>> such
>>
>> as  SPARC, MIPS and S390.
> So, your approach wouldn't work either, right? I think you've got a
> testing bot report on it already...

IMHO, my patch works fine for them.

They have their own cpu_to_node.


The x86 reported an compiling error, because the x86 does not compile

the driver/base/arch_numa.c.

I have fixed it by moving the cpu_to_node from

      driver/base/arch_numa.c to driver/base/node.c


The driver/base/node.c is  built-in for all the NUMA ARCHs.

> You can make it like this:
>
>    #ifdef CONFIG_ARCH_NO_EARLY_CPU_TO_NODE
>    #define early_cpu_to_node cpu_to_node
>    #endif

Thanks. Add this make it more complicated..


>>> It seems that all the mappings are known by the end of setup_arch() and the
>>> initialization of numa_node can be moved earlier.
>>>>> I would also initialize the numa_node with NUMA_NO_NODE at declaration,
>>>>> so that if someone calls cpu_to_node() before the variable is properly
>>>>> initialized at runtime, he'll get NO_NODE, which is obviously an error.
>>>> Even we set the numa_node with NUMA_NO_NODE, it does not always produce
>>>> error.
> You can print this error yourself:
>
>    #ifndef cpu_to_node
>    static inline int cpu_to_node(int cpu)
>    {
>          int node = per_cpu(numa_node, cpu);
>
>    #ifdef CONFIG_DEBUG_PER_CPU_MAPS
>          if (node == NUMA_NO_NODE)
>                  pr_err(...);
>    #endif
>
>            return node;
>    }
>    #endif

Thanks.  I had a samiliar private to detect it.

After my patch, there is no need to detect the error again.


Thanks

Huang Shijie
Mike Rapoport Jan. 22, 2024, 7:41 a.m. UTC | #11
On Fri, Jan 19, 2024 at 04:50:53PM +0800, Shijie Huang wrote:
> 
> 在 2024/1/19 16:42, Mike Rapoport 写道:
> > Is there a fundamental reason to have early_cpu_to_node() at all?
> 
> The early_cpu_to_node does not work on some ARCHs (which support the NUMA),
> such as  SPARC, MIPS and S390.

My question was why we need early_cpu_to_node() at all and why can't we use
cpu_to_node() early on arches that do have it.
 
> Thanks
> 
> Huang Shijie
> 
> > It seems that all the mappings are known by the end of setup_arch() and the
> > initialization of numa_node can be moved earlier.
> > > > I would also initialize the numa_node with NUMA_NO_NODE at declaration,
> > > > so that if someone calls cpu_to_node() before the variable is properly
> > > > initialized at runtime, he'll get NO_NODE, which is obviously an error.
> > > Even we set the numa_node with NUMA_NO_NODE, it does not always produce
> > > error.
> > > 
> > > Please see the alloc_pages_node().
> > > 
> > > 
> > > Thanks
> > > 
> > > Huang Shijie
> > >
Shijie Huang Jan. 22, 2024, 8:27 a.m. UTC | #12
在 2024/1/22 15:41, Mike Rapoport 写道:
> On Fri, Jan 19, 2024 at 04:50:53PM +0800, Shijie Huang wrote:
>> 在 2024/1/19 16:42, Mike Rapoport 写道:
>>> Is there a fundamental reason to have early_cpu_to_node() at all?
>> The early_cpu_to_node does not work on some ARCHs (which support the NUMA),
>> such as  SPARC, MIPS and S390.
> My question was why we need early_cpu_to_node() at all and why can't we use
> cpu_to_node() early on arches that do have it

As you see, some ARCHs use cpu_to_node() all the time, such as 
SPARC,mips and S390.

They do not use early_cpu_to_node() at all.


In some ARCHs(arm64, powerpc riscv), the cpu_to_node() is ready at:

     start_kernel --> arch_call_rest_init() --> rest_init()
                            --> kernel_init() --> kernel_init_freeable()
                            --> smp_prepare_cpus()


The cpu_to_node() is initialized too late.


I am not sure if we can move "cpu_to_node initialization" to an early place.

Move "cpu_to_node() initization" to an early place is more complicated, 
I guess.


Thanks

Huang Shijie
diff mbox series

Patch

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 5b59d133b6af..867a477fa975 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -61,6 +61,17 @@  EXPORT_SYMBOL(cpumask_of_node);
 
 #endif
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+int _cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+int (*cpu_to_node)(int cpu);
+EXPORT_SYMBOL(cpu_to_node);
+#endif
+#endif
+
 static void numa_update_cpu(unsigned int cpu, bool remove)
 {
 	int nid = cpu_to_node(cpu);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..e7ce2bae11dd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -91,10 +91,8 @@  static inline int numa_node_id(void)
 #endif
 
 #ifndef cpu_to_node
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(numa_node, cpu);
-}
+extern int (*cpu_to_node)(int cpu);
+extern int _cpu_to_node(int cpu);
 #endif
 
 #ifndef set_numa_node
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..b142e9c51161 100644
--- a/init/main.c
+++ b/init/main.c
@@ -870,6 +870,18 @@  static void __init print_unknown_bootoptions(void)
 	memblock_free(unknown_options, len);
 }
 
+static void __init smp_prepare_boot_cpu_start(void)
+{
+	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* The early_cpu_to_node should be ready now. */
+	cpu_to_node = early_cpu_to_node;
+#endif
+#endif
+}
+
 asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
 void start_kernel(void)
 {
@@ -899,7 +911,7 @@  void start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
-	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+	smp_prepare_boot_cpu_start();
 	boot_cpu_hotplug_init();
 
 	pr_notice("Kernel command line: %s\n", saved_command_line);
@@ -1519,6 +1531,19 @@  void __init console_on_rootfs(void)
 	fput(file);
 }
 
+static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
+{
+	/* Different ARCHs may override smp_prepare_cpus() */
+	smp_prepare_cpus(setup_max_cpus);
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* Change to the formal function. */
+	cpu_to_node = _cpu_to_node;
+#endif
+#endif
+}
+
 static noinline void __init kernel_init_freeable(void)
 {
 	/* Now the scheduler is fully set up and can do blocking allocations */
@@ -1531,7 +1556,7 @@  static noinline void __init kernel_init_freeable(void)
 
 	cad_pid = get_pid(task_pid(current));
 
-	smp_prepare_cpus(setup_max_cpus);
+	smp_prepare_cpus_done(setup_max_cpus);
 
 	workqueue_init();