diff mbox series

srcu: Delegate work to the first online cpu if using SRCU_SIZE_SMALL

Message ID 20221026032716.78674-1-kernelfans@gmail.com (mailing list archive)
State New, archived
Headers show
Series srcu: Delegate work to the first online cpu if using SRCU_SIZE_SMALL | expand

Commit Message

Pingfan Liu Oct. 26, 2022, 3:27 a.m. UTC
commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
snp_node array") assumes that cpu 0 is always online, but that is not
the truth when using maxcpus=1 in the command line for the kdump kernel.

On a PowerPC, the following kdump kernel hanging is observed:
...
[    1.740036] systemd[1]: Hostname set to <xyz.com>
[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
[  243.686264]       Not tainted 6.1.0-rc1 #1
[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
[  243.686296] Call Trace:
[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
[  243.686572] IRQMASK: 0
[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
[  243.686691] LR [0000000000000000] 0x0
[  243.686698] --- interrupt: 3000
[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
[  243.686717]       Not tainted 6.1.0-rc1 #1
[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
[  243.686758] Call Trace:
[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
[  366.566298]       Not tainted 6.1.0-rc1 #1
[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
[  366.566329] Call Trace:
...

In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
kernel command line on the PowerPC platform. Consequently, the crash cpu
is the only onlined cpu in the kdump kernel, but with its logical id not
necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
thread is created, so sdp->work will be never executed and
__synchronize_srcu() can not be completed.

Tackle this issue by queueing sdp->work on the first onlined cpu.

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
To: rcu@vger.kernel.org
---
 kernel/rcu/srcutree.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

Comments

Zqiang Oct. 26, 2022, 4:36 a.m. UTC | #1
>
>commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
>snp_node array") assumes that cpu 0 is always online, but that is not
>the truth when using maxcpus=1 in the command line for the kdump kernel.
>
>On a PowerPC, the following kdump kernel hanging is observed:
>...
>[    1.740036] systemd[1]: Hostname set to <xyz.com>
>[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
>[  243.686264]       Not tainted 6.1.0-rc1 #1
>[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
>[  243.686296] Call Trace:
>[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
>[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
>[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
>[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
>[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
>[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
>[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
>[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
>[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
>[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
>[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
>[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
>[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
>[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
>[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
>[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
>[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
>[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
>[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
>[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
>[  243.686572] IRQMASK: 0
>[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
>[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
>[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
>[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
>[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
>[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
>[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
>[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
>[  243.686691] LR [0000000000000000] 0x0
>[  243.686698] --- interrupt: 3000
>[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
>[  243.686717]       Not tainted 6.1.0-rc1 #1
>[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
>[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
>[  243.686758] Call Trace:
>[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
>[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
>[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
>[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
>[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
>[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
>[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
>[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
>[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
>[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
>[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
>[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
>[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
>[  366.566298]       Not tainted 6.1.0-rc1 #1
>[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
>[  366.566329] Call Trace:
>...
>
>In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
>kernel command line on the PowerPC platform. Consequently, the crash cpu
>is the only onlined cpu in the kdump kernel, but with its logical id not
>necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
>thread is created, so sdp->work will be never executed and
>__synchronize_srcu() can not be completed.
>
>Tackle this issue by queueing sdp->work on the first onlined cpu.
>
>Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
>Cc: "Paul E. McKenney" <paulmck@kernel.org>
>Cc: Lai Jiangshan <jiangshanlai@gmail.com>
>Cc: Josh Triplett <josh@joshtriplett.org>
>Cc: Steven Rostedt <rostedt@goodmis.org>
>Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>To: rcu@vger.kernel.org
>---
> kernel/rcu/srcutree.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
>diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>index 1c304fec89c0..f2df561e0995 100644
>--- a/kernel/rcu/srcutree.c
>+++ b/kernel/rcu/srcutree.c
>@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> 	int state;
> 
> 	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
>-		sdp = per_cpu_ptr(ssp->sda, 0);
>+		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> 	else
> 		sdp = this_cpu_ptr(ssp->sda);
> 	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
>@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> 	/* Initiate callback invocation as needed. */
> 	ss_state = smp_load_acquire(&ssp->srcu_size_state);
> 	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
>-		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
>+		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
>+					cpumask_first(cpu_online_mask)), cbdelay);
> 	} else {
> 		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> 		srcu_for_each_node_breadth_first(ssp, snp) {
>@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> 	idx = srcu_read_lock(ssp);
> 	ss_state = smp_load_acquire(&ssp->srcu_size_state);
> 	if (ss_state < SRCU_SIZE_WAIT_CALL)
>-		sdp = per_cpu_ptr(ssp->sda, 0);
>+		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));

Hi Pingfan

If CPU0 support hotplug.
For example , in this time cpumask_first(cpu_online_mask) return 0

> 	else
> 		sdp = raw_cpu_ptr(ssp->sda);
> 	spin_lock_irqsave_sdp_contention(sdp, &flags);
>@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> 
> 	idx = srcu_read_lock(ssp);
> 	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
>-		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
>+		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
>+					cpumask_first(cpu_online_mask)));

Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.

Thanks
Zqiang


> 	else
> 		for_each_possible_cpu(cpu)
> 			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
>-- 
>2.31.1
Pingfan Liu Oct. 26, 2022, 8:21 a.m. UTC | #2
On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
>
> >
> >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> >snp_node array") assumes that cpu 0 is always online, but that is not
> >the truth when using maxcpus=1 in the command line for the kdump kernel.
> >
> >On a PowerPC, the following kdump kernel hanging is observed:
> >...
> >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> >[  243.686264]       Not tainted 6.1.0-rc1 #1
> >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> >[  243.686296] Call Trace:
> >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> >[  243.686572] IRQMASK: 0
> >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> >[  243.686691] LR [0000000000000000] 0x0
> >[  243.686698] --- interrupt: 3000
> >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> >[  243.686717]       Not tainted 6.1.0-rc1 #1
> >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> >[  243.686758] Call Trace:
> >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> >[  366.566298]       Not tainted 6.1.0-rc1 #1
> >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> >[  366.566329] Call Trace:
> >...
> >
> >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> >kernel command line on the PowerPC platform. Consequently, the crash cpu
> >is the only onlined cpu in the kdump kernel, but with its logical id not
> >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> >thread is created, so sdp->work will be never executed and
> >__synchronize_srcu() can not be completed.
> >
> >Tackle this issue by queueing sdp->work on the first onlined cpu.
> >
> >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> >Cc: Josh Triplett <josh@joshtriplett.org>
> >Cc: Steven Rostedt <rostedt@goodmis.org>
> >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> >To: rcu@vger.kernel.org
> >---
> > kernel/rcu/srcutree.c | 10 ++++++----
> > 1 file changed, 6 insertions(+), 4 deletions(-)
> >
> >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> >index 1c304fec89c0..f2df561e0995 100644
> >--- a/kernel/rcu/srcutree.c
> >+++ b/kernel/rcu/srcutree.c
> >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> >       int state;
> >
> >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> >-              sdp = per_cpu_ptr(ssp->sda, 0);
> >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> >       else
> >               sdp = this_cpu_ptr(ssp->sda);
> >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> >       /* Initiate callback invocation as needed. */
> >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> >       } else {
> >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> >               srcu_for_each_node_breadth_first(ssp, snp) {
> >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> >       idx = srcu_read_lock(ssp);
> >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> >-              sdp = per_cpu_ptr(ssp->sda, 0);
> >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
>
> Hi Pingfan
>
> If CPU0 support hotplug.
> For example , in this time cpumask_first(cpu_online_mask) return 0
>
> >       else
> >               sdp = raw_cpu_ptr(ssp->sda);
> >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> >
> >       idx = srcu_read_lock(ssp);
> >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> >+                                      cpumask_first(cpu_online_mask)));
>
> Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
>

Thanks for catching this bug.

It seems that get_boot_cpu_id() can tackle the whole issue.  I will
wait for more comments and send out V2.

Thanks,

    Pingfan
Paul E. McKenney Oct. 26, 2022, 1:52 p.m. UTC | #3
On Wed, Oct 26, 2022 at 11:27:16AM +0800, Pingfan Liu wrote:
> commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> snp_node array") assumes that cpu 0 is always online, but that is not
> the truth when using maxcpus=1 in the command line for the kdump kernel.
> 
> On a PowerPC, the following kdump kernel hanging is observed:

Adding a few PowerPC folks on CC for their thoughts systems booting with
some CPU other than CPU 0 as the boot CPU.  Is this intended/supported?


> ...
> [    1.740036] systemd[1]: Hostname set to <xyz.com>
> [  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> [  243.686264]       Not tainted 6.1.0-rc1 #1
> [  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> [  243.686296] Call Trace:
> [  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> [  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> [  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> [  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> [  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> [  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> [  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> [  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> [  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> [  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> [  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> [  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> [  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> [  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> [  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> [  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> [  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> [  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> [  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> [  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> [  243.686572] IRQMASK: 0
> [  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> [  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> [  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> [  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> [  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> [  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> [  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> [  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> [  243.686691] LR [0000000000000000] 0x0
> [  243.686698] --- interrupt: 3000
> [  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> [  243.686717]       Not tainted 6.1.0-rc1 #1
> [  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> [  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> [  243.686758] Call Trace:
> [  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> [  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> [  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> [  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> [  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> [  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> [  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> [  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> [  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> [  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> [  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> [  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> [  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> [  366.566298]       Not tainted 6.1.0-rc1 #1
> [  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> [  366.566329] Call Trace:
> ...
> 
> In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> kernel command line on the PowerPC platform. Consequently, the crash cpu
> is the only onlined cpu in the kdump kernel, but with its logical id not
> necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> thread is created, so sdp->work will be never executed and
> __synchronize_srcu() can not be completed.
> 
> Tackle this issue by queueing sdp->work on the first onlined cpu.
> 
> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>

Good catch!!!  New one on me!  ;-)

But a few questions...

1.	As noted above, is booting without CPU 0 an intentional and
	supported feature of PowerPC?  If not, perhaps a better approach
	would be to rule out this configuration.

2.	When booting without CPU 0, is it guaranteed that CPU 0 will
	never come online?  If not, then isn't the patch below subject
	to failure modes when that happens?

3.	More generally, when CPU N is the boot CPU, is it guaranteed
	that CPU M, M < N, will never come online?  Same as above on
	failure modes.

Thoughts?

							Thanx, Paul

> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> Cc: Josh Triplett <josh@joshtriplett.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> To: rcu@vger.kernel.org
> ---
>  kernel/rcu/srcutree.c | 10 ++++++----
>  1 file changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index 1c304fec89c0..f2df561e0995 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
>  	int state;
>  
>  	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> -		sdp = per_cpu_ptr(ssp->sda, 0);
> +		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
>  	else
>  		sdp = this_cpu_ptr(ssp->sda);
>  	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> @@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
>  	/* Initiate callback invocation as needed. */
>  	ss_state = smp_load_acquire(&ssp->srcu_size_state);
>  	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> -		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> +		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> +					cpumask_first(cpu_online_mask)), cbdelay);
>  	} else {
>  		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
>  		srcu_for_each_node_breadth_first(ssp, snp) {
> @@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
>  	idx = srcu_read_lock(ssp);
>  	ss_state = smp_load_acquire(&ssp->srcu_size_state);
>  	if (ss_state < SRCU_SIZE_WAIT_CALL)
> -		sdp = per_cpu_ptr(ssp->sda, 0);
> +		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
>  	else
>  		sdp = raw_cpu_ptr(ssp->sda);
>  	spin_lock_irqsave_sdp_contention(sdp, &flags);
> @@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
>  
>  	idx = srcu_read_lock(ssp);
>  	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> -		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> +		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> +					cpumask_first(cpu_online_mask)));
>  	else
>  		for_each_possible_cpu(cpu)
>  			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
> -- 
> 2.31.1
>
Paul E. McKenney Oct. 26, 2022, 1:55 p.m. UTC | #4
On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> >
> > >
> > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > >snp_node array") assumes that cpu 0 is always online, but that is not
> > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > >
> > >On a PowerPC, the following kdump kernel hanging is observed:
> > >...
> > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > >[  243.686296] Call Trace:
> > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > >[  243.686572] IRQMASK: 0
> > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > >[  243.686691] LR [0000000000000000] 0x0
> > >[  243.686698] --- interrupt: 3000
> > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > >[  243.686758] Call Trace:
> > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > >[  366.566329] Call Trace:
> > >...
> > >
> > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > >thread is created, so sdp->work will be never executed and
> > >__synchronize_srcu() can not be completed.
> > >
> > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > >
> > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > >Cc: Josh Triplett <josh@joshtriplett.org>
> > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > >To: rcu@vger.kernel.org
> > >---
> > > kernel/rcu/srcutree.c | 10 ++++++----
> > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > >
> > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > >index 1c304fec89c0..f2df561e0995 100644
> > >--- a/kernel/rcu/srcutree.c
> > >+++ b/kernel/rcu/srcutree.c
> > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > >       int state;
> > >
> > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > >       else
> > >               sdp = this_cpu_ptr(ssp->sda);
> > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > >       /* Initiate callback invocation as needed. */
> > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > >       } else {
> > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > >       idx = srcu_read_lock(ssp);
> > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> >
> > Hi Pingfan
> >
> > If CPU0 support hotplug.
> > For example , in this time cpumask_first(cpu_online_mask) return 0
> >
> > >       else
> > >               sdp = raw_cpu_ptr(ssp->sda);
> > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > >
> > >       idx = srcu_read_lock(ssp);
> > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > >+                                      cpumask_first(cpu_online_mask)));
> >
> > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> 
> Thanks for catching this bug.

Indeed, good catch!

> It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> wait for more comments and send out V2.

Does your patch require the CPU indicated by get_boot_cpu_id() to always
stay online?  If so, is that CPU guaranteed to always stay online?

I am still asking my question about PowerPC booting with a non-zero CPU.
After all, if all remaining architectures always boot with CPU 0, then
perhaps some simplification is in order.

							Thanx, Paul
Pingfan Liu Oct. 27, 2022, 3:03 a.m. UTC | #5
On Wed, Oct 26, 2022 at 06:55:52AM -0700, Paul E. McKenney wrote:
> On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> > On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> > >
> > > >
> > > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > > >snp_node array") assumes that cpu 0 is always online, but that is not
> > > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > > >
> > > >On a PowerPC, the following kdump kernel hanging is observed:
> > > >...
> > > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > >[  243.686296] Call Trace:
> > > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > > >[  243.686572] IRQMASK: 0
> > > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > > >[  243.686691] LR [0000000000000000] 0x0
> > > >[  243.686698] --- interrupt: 3000
> > > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > > >[  243.686758] Call Trace:
> > > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > >[  366.566329] Call Trace:
> > > >...
> > > >
> > > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > > >thread is created, so sdp->work will be never executed and
> > > >__synchronize_srcu() can not be completed.
> > > >
> > > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > > >
> > > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > > >Cc: Josh Triplett <josh@joshtriplett.org>
> > > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > > >To: rcu@vger.kernel.org
> > > >---
> > > > kernel/rcu/srcutree.c | 10 ++++++----
> > > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > > >
> > > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > >index 1c304fec89c0..f2df561e0995 100644
> > > >--- a/kernel/rcu/srcutree.c
> > > >+++ b/kernel/rcu/srcutree.c
> > > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > > >       int state;
> > > >
> > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > >       else
> > > >               sdp = this_cpu_ptr(ssp->sda);
> > > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > > >       /* Initiate callback invocation as needed. */
> > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > > >       } else {
> > > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > > >       idx = srcu_read_lock(ssp);
> > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > >
> > > Hi Pingfan
> > >
> > > If CPU0 support hotplug.
> > > For example , in this time cpumask_first(cpu_online_mask) return 0
> > >
> > > >       else
> > > >               sdp = raw_cpu_ptr(ssp->sda);
> > > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > > >
> > > >       idx = srcu_read_lock(ssp);
> > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > > >+                                      cpumask_first(cpu_online_mask)));
> > >
> > > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> > 
> > Thanks for catching this bug.
> 
> Indeed, good catch!
> 
> > It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> > wait for more comments and send out V2.
> 
> Does your patch require the CPU indicated by get_boot_cpu_id() to always
> stay online?  If so, is that CPU guaranteed to always stay online?
> 

No, it is just a replacement for cpu 0.

A qualified candidate should meet two requirements:
-1. worker_pool is initialized, so it has worker thread to dispatch
work. This is satisfied by once the cpu was onlined.  

-2. this cpu id is persistent as Zqiang pointed out.

For hotplug, it is the same as cpu 0, which is taken care of by the
workqueue infrastructure.

> I am still asking my question about PowerPC booting with a non-zero CPU.
> After all, if all remaining architectures always boot with CPU 0, then
> perhaps some simplification is in order.
> 

As my understanding, it is not easy. Let us unfold the discussion in
another thread post by you.


Thanks,

	Pingfan
Pingfan Liu Oct. 27, 2022, 3:40 a.m. UTC | #6
On Wed, Oct 26, 2022 at 06:52:11AM -0700, Paul E. McKenney wrote:
> On Wed, Oct 26, 2022 at 11:27:16AM +0800, Pingfan Liu wrote:
> > commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > snp_node array") assumes that cpu 0 is always online, but that is not
> > the truth when using maxcpus=1 in the command line for the kdump kernel.
> > 
> > On a PowerPC, the following kdump kernel hanging is observed:
> 
> Adding a few PowerPC folks on CC for their thoughts systems booting with
> some CPU other than CPU 0 as the boot CPU.  Is this intended/supported?
> 
> 
> > ...
> > [    1.740036] systemd[1]: Hostname set to <xyz.com>
> > [  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > [  243.686264]       Not tainted 6.1.0-rc1 #1
> > [  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > [  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > [  243.686296] Call Trace:
> > [  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > [  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > [  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > [  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > [  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > [  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > [  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > [  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > [  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > [  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > [  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > [  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > [  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > [  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > [  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > [  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > [  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > [  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > [  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > [  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > [  243.686572] IRQMASK: 0
> > [  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > [  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > [  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > [  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > [  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > [  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > [  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > [  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > [  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > [  243.686691] LR [0000000000000000] 0x0
> > [  243.686698] --- interrupt: 3000
> > [  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > [  243.686717]       Not tainted 6.1.0-rc1 #1
> > [  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > [  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > [  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > [  243.686758] Call Trace:
> > [  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > [  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > [  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > [  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > [  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > [  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > [  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > [  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > [  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > [  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > [  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > [  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > [  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > [  366.566298]       Not tainted 6.1.0-rc1 #1
> > [  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > [  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > [  366.566329] Call Trace:
> > ...
> > 
> > In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > kernel command line on the PowerPC platform. Consequently, the crash cpu
> > is the only onlined cpu in the kdump kernel, but with its logical id not
> > necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > thread is created, so sdp->work will be never executed and
> > __synchronize_srcu() can not be completed.
> > 
> > Tackle this issue by queueing sdp->work on the first onlined cpu.
> > 
> > Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> 
> Good catch!!!  New one on me!  ;-)
> 
> But a few questions...
> 
> 1.	As noted above, is booting without CPU 0 an intentional and
> 	supported feature of PowerPC?  If not, perhaps a better approach
> 	would be to rule out this configuration.
> 

Try to answer based on my limited knowledge about PowerPC. Hope that PowerPC
experts can correct me if I am wrong.

The boot cpu logical id is decided in
arch/powerpc/kernel/prom.c:static int __init early_init_dt_scan_cpus()
and stored in boot_cpuid.

Why it does not start from 0? I think it is intentional, since on the
PowerPC platform the logical cpu id also implies some cpu hardware
topology info.  E.g.
static inline int cpu_thread_in_core(int cpu)
{
	return cpu & (threads_per_core - 1);
}

static inline int cpu_thread_in_subcore(int cpu)
{
	return cpu & (threads_per_subcore - 1);
}

static inline int cpu_first_thread_sibling(int cpu)
{
	return cpu & ~(threads_per_core - 1);
}

static inline int cpu_last_thread_sibling(int cpu)
{
	return cpu | (threads_per_core - 1);
}


But that may be not the whole story.

> 2.	When booting without CPU 0, is it guaranteed that CPU 0 will
> 	never come online?  If not, then isn't the patch below subject
> 	to failure modes when that happens?
> 

Whether cpu 0 comes online or not, it depends. If not limiting the
maxcpus plus cpu 0 is present, it is ture. But the kdump kernel caps the
max online cpu number with the param 'maxcpus=1' to save the memory
usage, cpu 0 will never be online if not the boot cpu. (The boot cpu is
the crashed cpu)

> 3.	More generally, when CPU N is the boot CPU, is it guaranteed
> 	that CPU M, M < N, will never come online?  Same as above on
> 	failure modes.
> 

I think that the answer to the 2nd question also stands here. Or I miss
some nuance?

> Thoughts?
> 
> 							Thanx, Paul
> 
> > Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > Cc: Josh Triplett <josh@joshtriplett.org>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > To: rcu@vger.kernel.org
> > ---
> >  kernel/rcu/srcutree.c | 10 ++++++----
> >  1 file changed, 6 insertions(+), 4 deletions(-)
> > 
> > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > index 1c304fec89c0..f2df561e0995 100644
> > --- a/kernel/rcu/srcutree.c
> > +++ b/kernel/rcu/srcutree.c
> > @@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> >  	int state;
> >  
> >  	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > -		sdp = per_cpu_ptr(ssp->sda, 0);
> > +		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> >  	else
> >  		sdp = this_cpu_ptr(ssp->sda);
> >  	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > @@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> >  	/* Initiate callback invocation as needed. */
> >  	ss_state = smp_load_acquire(&ssp->srcu_size_state);
> >  	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > -		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > +		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > +					cpumask_first(cpu_online_mask)), cbdelay);
> >  	} else {
> >  		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> >  		srcu_for_each_node_breadth_first(ssp, snp) {
> > @@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> >  	idx = srcu_read_lock(ssp);
> >  	ss_state = smp_load_acquire(&ssp->srcu_size_state);
> >  	if (ss_state < SRCU_SIZE_WAIT_CALL)
> > -		sdp = per_cpu_ptr(ssp->sda, 0);
> > +		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> >  	else
> >  		sdp = raw_cpu_ptr(ssp->sda);
> >  	spin_lock_irqsave_sdp_contention(sdp, &flags);
> > @@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> >  
> >  	idx = srcu_read_lock(ssp);
> >  	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > -		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > +		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > +					cpumask_first(cpu_online_mask)));

Collect all info here.

As discussed in another thread, due to cpu hotplug,
cpumask_first(cpu_online_mask)) may return another cpu's id. That is a
bug. I will change cpumask_first(cpu_online_mask) to get_boot_cpu_id().


Thanks,

	Pingfan


(Keep the rest for easy reference)
> >  	else
> >  		for_each_possible_cpu(cpu)
> >  			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
> > -- 
> > 2.31.1
> >
Paul E. McKenney Oct. 27, 2022, 4:52 p.m. UTC | #7
On Thu, Oct 27, 2022 at 11:03:53AM +0800, Pingfan Liu wrote:
> On Wed, Oct 26, 2022 at 06:55:52AM -0700, Paul E. McKenney wrote:
> > On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> > > On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> > > >
> > > > >
> > > > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > > > >snp_node array") assumes that cpu 0 is always online, but that is not
> > > > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > > > >
> > > > >On a PowerPC, the following kdump kernel hanging is observed:
> > > > >...
> > > > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > > > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > > > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > > > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > >[  243.686296] Call Trace:
> > > > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > > > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > > > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > > > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > > > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > > > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > > > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > > > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > > > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > > > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > > > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > > > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > > > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > > > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > > > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > > > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > > > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > > > >[  243.686572] IRQMASK: 0
> > > > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > > > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > > > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > > > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > > > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > > > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > > > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > > > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > > > >[  243.686691] LR [0000000000000000] 0x0
> > > > >[  243.686698] --- interrupt: 3000
> > > > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > > > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > > > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > > > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > > > >[  243.686758] Call Trace:
> > > > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > > > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > > > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > > > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > > > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > > > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > > > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > > > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > > > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > > > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > > > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > > > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > >[  366.566329] Call Trace:
> > > > >...
> > > > >
> > > > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > > > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > > > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > > > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > > > >thread is created, so sdp->work will be never executed and
> > > > >__synchronize_srcu() can not be completed.
> > > > >
> > > > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > > > >
> > > > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > > > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > > > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > > > >Cc: Josh Triplett <josh@joshtriplett.org>
> > > > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > > > >To: rcu@vger.kernel.org
> > > > >---
> > > > > kernel/rcu/srcutree.c | 10 ++++++----
> > > > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > > > >
> > > > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > > >index 1c304fec89c0..f2df561e0995 100644
> > > > >--- a/kernel/rcu/srcutree.c
> > > > >+++ b/kernel/rcu/srcutree.c
> > > > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > > > >       int state;
> > > > >
> > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > >       else
> > > > >               sdp = this_cpu_ptr(ssp->sda);
> > > > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > > > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > > > >       /* Initiate callback invocation as needed. */
> > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > > > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > > > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > > > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > > > >       } else {
> > > > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > > > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > > > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > > > >       idx = srcu_read_lock(ssp);
> > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > >
> > > > Hi Pingfan
> > > >
> > > > If CPU0 support hotplug.
> > > > For example , in this time cpumask_first(cpu_online_mask) return 0
> > > >
> > > > >       else
> > > > >               sdp = raw_cpu_ptr(ssp->sda);
> > > > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > > > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > > > >
> > > > >       idx = srcu_read_lock(ssp);
> > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > > > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > > > >+                                      cpumask_first(cpu_online_mask)));
> > > >
> > > > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > > > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> > > 
> > > Thanks for catching this bug.
> > 
> > Indeed, good catch!
> > 
> > > It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> > > wait for more comments and send out V2.
> > 
> > Does your patch require the CPU indicated by get_boot_cpu_id() to always
> > stay online?  If so, is that CPU guaranteed to always stay online?
> 
> No, it is just a replacement for cpu 0.
> 
> A qualified candidate should meet two requirements:
> -1. worker_pool is initialized, so it has worker thread to dispatch
> work. This is satisfied by once the cpu was onlined.  
> 
> -2. this cpu id is persistent as Zqiang pointed out.
> 
> For hotplug, it is the same as cpu 0, which is taken care of by the
> workqueue infrastructure.
> 
> > I am still asking my question about PowerPC booting with a non-zero CPU.
> > After all, if all remaining architectures always boot with CPU 0, then
> > perhaps some simplification is in order.
> 
> As my understanding, it is not easy. Let us unfold the discussion in
> another thread post by you.

As you wish...

First, let me see if I understand the options.

1.	Take your initial patch, but fixing the issue pointed out by
	Zhang Qiang.  One issue is that cpu_online_mask is not initialized
	until boot_cpu_init() is invoked.  Given that SRCU is used by
	tracing, we must correctly handle early boot usage.

2.	Update your initial patch to fix the issue pointed out by Zhang
	Qiang and also to use get_boot_cpu_id() as you suggested earlier.
	Except that the __boot_cpu_id variable read by get_boot_cpu_id()
	is also initialized by boot_cpu_init(), and is zero up to
	that point.  So this has the same problem as #1 above.

3.	As in #2 above, but modify srcu_init() to check for SRCU callbacks
	queued on CPU 0 when boot_cpu_init() returns non-zero.  In this
	case, move those SRCU callbacks to the srcu_data structure
	indicated by boot_cpu_init().

4.	Leave the SRCU code alone, but boot your crash kernel with
	srcutree.big_cpu_lim=0.  This would switch all srcu_struct
	structures to big mode at initialization time.	Once this switch
	completed, CPUs would queue SRCU callbacks on their own srcu_data
	structure's ->srcu_cblist.

	I am not optimistic about this one, but could you please try it?

5.	Start over with the current SRCU code.	Make srcu_delay_timer()
	and srcu_queue_delayed_work_on() check to see if sdp->cpu is
	online, and if not, use queue_work() instead of queue_work_on().
	There might be other adjustments required, but I am not
	immediately seeing them.

At the moment, #5 looks by far the best to me.

But first, are there any options that I am missing?  Or, for that matter,
am I confused about any of the options that I called out?

							Thanx, Paul
Pingfan Liu Oct. 28, 2022, 10:23 a.m. UTC | #8
Some stuff is PowerPC specific, CC PowerPC experts, please correct me if
I make a mistake.

On Thu, Oct 27, 2022 at 09:52:00AM -0700, Paul E. McKenney wrote:
> On Thu, Oct 27, 2022 at 11:03:53AM +0800, Pingfan Liu wrote:
> > On Wed, Oct 26, 2022 at 06:55:52AM -0700, Paul E. McKenney wrote:
> > > On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> > > > On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> > > > >
> > > > > >
> > > > > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > > > > >snp_node array") assumes that cpu 0 is always online, but that is not
> > > > > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > > > > >
> > > > > >On a PowerPC, the following kdump kernel hanging is observed:
> > > > > >...
> > > > > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > > > > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > > > > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > > > > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > >[  243.686296] Call Trace:
> > > > > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > > > > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > > > > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > > > > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > > > > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > > > > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > > > > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > > > > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > > > > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > > > > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > > > > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > > > > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > > > > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > > > > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > > > > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > > > > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > > > > >[  243.686572] IRQMASK: 0
> > > > > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > > > > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > > > > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > > > > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > > > > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > > > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > > > > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > > > > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > > > > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > > > > >[  243.686691] LR [0000000000000000] 0x0
> > > > > >[  243.686698] --- interrupt: 3000
> > > > > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > > > > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > > > > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > > > > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > > > > >[  243.686758] Call Trace:
> > > > > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > > > > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > > > > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > > > > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > > > > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > > > > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > > > > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > > > > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > > > > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > > > > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > > > > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > >[  366.566329] Call Trace:
> > > > > >...
> > > > > >
> > > > > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > > > > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > > > > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > > > > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > > > > >thread is created, so sdp->work will be never executed and
> > > > > >__synchronize_srcu() can not be completed.
> > > > > >
> > > > > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > > > > >
> > > > > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > > > > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > > > > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > > > > >Cc: Josh Triplett <josh@joshtriplett.org>
> > > > > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > > > > >To: rcu@vger.kernel.org
> > > > > >---
> > > > > > kernel/rcu/srcutree.c | 10 ++++++----
> > > > > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > > > > >
> > > > > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > > > >index 1c304fec89c0..f2df561e0995 100644
> > > > > >--- a/kernel/rcu/srcutree.c
> > > > > >+++ b/kernel/rcu/srcutree.c
> > > > > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > > > > >       int state;
> > > > > >
> > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > >       else
> > > > > >               sdp = this_cpu_ptr(ssp->sda);
> > > > > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > > > > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > > > > >       /* Initiate callback invocation as needed. */
> > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > > > > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > > > > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > > > > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > > > > >       } else {
> > > > > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > > > > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > > > > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > > > > >       idx = srcu_read_lock(ssp);
> > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > >
> > > > > Hi Pingfan
> > > > >
> > > > > If CPU0 support hotplug.
> > > > > For example , in this time cpumask_first(cpu_online_mask) return 0
> > > > >
> > > > > >       else
> > > > > >               sdp = raw_cpu_ptr(ssp->sda);
> > > > > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > > > > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > > > > >
> > > > > >       idx = srcu_read_lock(ssp);
> > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > > > > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > > > > >+                                      cpumask_first(cpu_online_mask)));
> > > > >
> > > > > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > > > > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> > > > 
> > > > Thanks for catching this bug.
> > > 
> > > Indeed, good catch!
> > > 
> > > > It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> > > > wait for more comments and send out V2.
> > > 
> > > Does your patch require the CPU indicated by get_boot_cpu_id() to always
> > > stay online?  If so, is that CPU guaranteed to always stay online?
> > 
> > No, it is just a replacement for cpu 0.
> > 
> > A qualified candidate should meet two requirements:
> > -1. worker_pool is initialized, so it has worker thread to dispatch
> > work. This is satisfied by once the cpu was onlined.  
> > 
> > -2. this cpu id is persistent as Zqiang pointed out.
> > 
> > For hotplug, it is the same as cpu 0, which is taken care of by the
> > workqueue infrastructure.
> > 
> > > I am still asking my question about PowerPC booting with a non-zero CPU.
> > > After all, if all remaining architectures always boot with CPU 0, then
> > > perhaps some simplification is in order.
> > 
> > As my understanding, it is not easy. Let us unfold the discussion in
> > another thread post by you.
> 
> As you wish...
> 
> First, let me see if I understand the options.
> 
> 1.	Take your initial patch, but fixing the issue pointed out by
> 	Zhang Qiang.  One issue is that cpu_online_mask is not initialized
> 	until boot_cpu_init() is invoked.  Given that SRCU is used by
> 	tracing, we must correctly handle early boot usage.
> 

The hotplug issue takes some effort to fix, so if there is a better way
to go, it should be avoided.

But you make me aware that I miss the early boot usage.
I think it carefully and hope I can learn from this lesson. Please
correct me if I am wrong in the following assumption.

Since ftrace_init() comes after srcu_init(), so before srcu_init(), only
_mcount has the opportunity to use SRCU, right?

If that part does not use the SRCU function like srcu_barrier(), it will
not a problem. Right?

> 2.	Update your initial patch to fix the issue pointed out by Zhang
> 	Qiang and also to use get_boot_cpu_id() as you suggested earlier.
> 	Except that the __boot_cpu_id variable read by get_boot_cpu_id()
> 	is also initialized by boot_cpu_init(), and is zero up to
> 	that point.  So this has the same problem as #1 above.
> 

This is the way that I want to take.

On the PowerPC, #define raw_smp_processor_id() (local_paca->paca_index),
instead of (*raw_cpu_ptr(&cpu_number)).
So in fact, boot_cpu_init()->smp_processor_id() gets the "boot_cpuid",
which is assigned to __boot_cpu_id() later.

For boot_cpuid, early_init_devtree()->early_init_dt_scan_cpus() decides
it. Then early_init_devtree()->allocate_paca(boot_cpuid)->initialise_paca(),
where new_paca->paca_index = cpu;

All these are done before start_kernel().

I have tested a patch adopting this method. It works.

> 3.	As in #2 above, but modify srcu_init() to check for SRCU callbacks
> 	queued on CPU 0 when boot_cpu_init() returns non-zero.  In this
> 	case, move those SRCU callbacks to the srcu_data structure
> 	indicated by boot_cpu_init().
> 
> 4.	Leave the SRCU code alone, but boot your crash kernel with
> 	srcutree.big_cpu_lim=0.  This would switch all srcu_struct
> 	structures to big mode at initialization time.	Once this switch
> 	completed, CPUs would queue SRCU callbacks on their own srcu_data
> 	structure's ->srcu_cblist.
> 
> 	I am not optimistic about this one, but could you please try it?
> 

I have tested it five times. It works.

> 5.	Start over with the current SRCU code.	Make srcu_delay_timer()
> 	and srcu_queue_delayed_work_on() check to see if sdp->cpu is
> 	online, and if not, use queue_work() instead of queue_work_on().
> 	There might be other adjustments required, but I am not
> 	immediately seeing them.
> 
> At the moment, #5 looks by far the best to me.
> 

The boot cpu is guaranteed to be online once. That condition relaxes the
code. But if you do not like #2, I am happy to have a shot on #5.


Thanks for your detailed suggestion.

Regards,

	Pingfan


> But first, are there any options that I am missing?  Or, for that matter,
> am I confused about any of the options that I called out?
> 
> 							Thanx, Paul
Michael Ellerman Oct. 28, 2022, 11:28 a.m. UTC | #9
"Paul E. McKenney" <paulmck@kernel.org> writes:
> On Wed, Oct 26, 2022 at 11:27:16AM +0800, Pingfan Liu wrote:
>> commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
>> snp_node array") assumes that cpu 0 is always online, but that is not
>> the truth when using maxcpus=1 in the command line for the kdump kernel.
>> 
>> On a PowerPC, the following kdump kernel hanging is observed:
>
> Adding a few PowerPC folks on CC for their thoughts systems booting with
> some CPU other than CPU 0 as the boot CPU.  Is this intended/supported?

Yes, unfortunately.

It comes as part of kdump, where a random CPU crashes and kexec's into a
new kernel, so we can end up booting the 2nd kernel on any CPU.

I have objections to the distro practice of booting the kdump kernel
with maxcpus=1, but no one has ever listened to me :)

>> ...
>> [    1.740036] systemd[1]: Hostname set to <xyz.com>
>> [  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
>> [  243.686264]       Not tainted 6.1.0-rc1 #1
>> [  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
>> [  243.686296] Call Trace:
>> [  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
>> [  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
>> [  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
>> [  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
>> [  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
>> [  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
>> [  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
>> [  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
>> [  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
>> [  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
>> [  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
>> [  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
>> [  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
>> [  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
>> [  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
>> [  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
>> [  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
>> [  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
>> [  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
>> [  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
>> [  243.686572] IRQMASK: 0
>> [  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
>> [  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
>> [  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
>> [  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
>> [  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>> [  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
>> [  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
>> [  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
>> [  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
>> [  243.686691] LR [0000000000000000] 0x0
>> [  243.686698] --- interrupt: 3000
>> [  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
>> [  243.686717]       Not tainted 6.1.0-rc1 #1
>> [  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
>> [  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
>> [  243.686758] Call Trace:
>> [  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
>> [  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
>> [  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
>> [  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
>> [  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
>> [  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
>> [  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
>> [  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
>> [  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
>> [  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
>> [  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
>> [  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
>> [  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
>> [  366.566298]       Not tainted 6.1.0-rc1 #1
>> [  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
>> [  366.566329] Call Trace:
>> ...
>> 
>> In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
>> kernel command line on the PowerPC platform. Consequently, the crash cpu
>> is the only onlined cpu in the kdump kernel, but with its logical id not
>> necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
>> thread is created, so sdp->work will be never executed and
>> __synchronize_srcu() can not be completed.
>> 
>> Tackle this issue by queueing sdp->work on the first onlined cpu.
>> 
>> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
>
> Good catch!!!  New one on me!  ;-)
>
> But a few questions...
>
> 1.	As noted above, is booting without CPU 0 an intentional and
> 	supported feature of PowerPC?  If not, perhaps a better approach
> 	would be to rule out this configuration.
>
> 2.	When booting without CPU 0, is it guaranteed that CPU 0 will
> 	never come online?  If not, then isn't the patch below subject
> 	to failure modes when that happens?

In the general case no it's not guaranteed.

You could boot on CPU N and then later CPU0 could be brought online via
hotplug.

> 3.	More generally, when CPU N is the boot CPU, is it guaranteed
> 	that CPU M, M < N, will never come online?  Same as above on
> 	failure modes.

No.

cheers
Paul E. McKenney Oct. 28, 2022, 5:38 p.m. UTC | #10
On Fri, Oct 28, 2022 at 10:28:27PM +1100, Michael Ellerman wrote:
> "Paul E. McKenney" <paulmck@kernel.org> writes:
> > On Wed, Oct 26, 2022 at 11:27:16AM +0800, Pingfan Liu wrote:
> >> commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> >> snp_node array") assumes that cpu 0 is always online, but that is not
> >> the truth when using maxcpus=1 in the command line for the kdump kernel.
> >> 
> >> On a PowerPC, the following kdump kernel hanging is observed:
> >
> > Adding a few PowerPC folks on CC for their thoughts systems booting with
> > some CPU other than CPU 0 as the boot CPU.  Is this intended/supported?
> 
> Yes, unfortunately.
> 
> It comes as part of kdump, where a random CPU crashes and kexec's into a
> new kernel, so we can end up booting the 2nd kernel on any CPU.
> 
> I have objections to the distro practice of booting the kdump kernel
> with maxcpus=1, but no one has ever listened to me :)

I know that feeling...  :-/

And thank you for your answers!

And one more that I should have asked to begin with...

Is it possible for the system booting on CPU N to not have CPU 0 in the
cpu_possible_mask?  As in is it possible for CPU 0's per-CPU variables
to be unmapped?

							Thanx, Paul

> >> ...
> >> [    1.740036] systemd[1]: Hostname set to <xyz.com>
> >> [  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> >> [  243.686264]       Not tainted 6.1.0-rc1 #1
> >> [  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >> [  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> >> [  243.686296] Call Trace:
> >> [  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> >> [  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> >> [  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> >> [  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> >> [  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> >> [  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> >> [  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> >> [  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> >> [  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> >> [  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> >> [  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> >> [  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> >> [  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> >> [  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> >> [  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> >> [  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> >> [  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> >> [  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> >> [  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> >> [  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> >> [  243.686572] IRQMASK: 0
> >> [  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> >> [  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> >> [  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> >> [  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> >> [  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> >> [  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> >> [  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> >> [  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> >> [  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> >> [  243.686691] LR [0000000000000000] 0x0
> >> [  243.686698] --- interrupt: 3000
> >> [  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> >> [  243.686717]       Not tainted 6.1.0-rc1 #1
> >> [  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >> [  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> >> [  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> >> [  243.686758] Call Trace:
> >> [  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> >> [  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> >> [  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> >> [  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> >> [  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> >> [  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> >> [  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> >> [  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> >> [  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> >> [  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> >> [  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> >> [  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> >> [  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> >> [  366.566298]       Not tainted 6.1.0-rc1 #1
> >> [  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >> [  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> >> [  366.566329] Call Trace:
> >> ...
> >> 
> >> In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> >> kernel command line on the PowerPC platform. Consequently, the crash cpu
> >> is the only onlined cpu in the kdump kernel, but with its logical id not
> >> necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> >> thread is created, so sdp->work will be never executed and
> >> __synchronize_srcu() can not be completed.
> >> 
> >> Tackle this issue by queueing sdp->work on the first onlined cpu.
> >> 
> >> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> >
> > Good catch!!!  New one on me!  ;-)
> >
> > But a few questions...
> >
> > 1.	As noted above, is booting without CPU 0 an intentional and
> > 	supported feature of PowerPC?  If not, perhaps a better approach
> > 	would be to rule out this configuration.
> >
> > 2.	When booting without CPU 0, is it guaranteed that CPU 0 will
> > 	never come online?  If not, then isn't the patch below subject
> > 	to failure modes when that happens?
> 
> In the general case no it's not guaranteed.
> 
> You could boot on CPU N and then later CPU0 could be brought online via
> hotplug.
> 
> > 3.	More generally, when CPU N is the boot CPU, is it guaranteed
> > 	that CPU M, M < N, will never come online?  Same as above on
> > 	failure modes.
> 
> No.
> 
> cheers
Paul E. McKenney Oct. 28, 2022, 6:13 p.m. UTC | #11
On Fri, Oct 28, 2022 at 06:23:46PM +0800, Pingfan Liu wrote:
> Some stuff is PowerPC specific, CC PowerPC experts, please correct me if
> I make a mistake.
> 
> On Thu, Oct 27, 2022 at 09:52:00AM -0700, Paul E. McKenney wrote:
> > On Thu, Oct 27, 2022 at 11:03:53AM +0800, Pingfan Liu wrote:
> > > On Wed, Oct 26, 2022 at 06:55:52AM -0700, Paul E. McKenney wrote:
> > > > On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> > > > > On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> > > > > >
> > > > > > >
> > > > > > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > > > > > >snp_node array") assumes that cpu 0 is always online, but that is not
> > > > > > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > > > > > >
> > > > > > >On a PowerPC, the following kdump kernel hanging is observed:
> > > > > > >...
> > > > > > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > > > > > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > > > > > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > > > > > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > >[  243.686296] Call Trace:
> > > > > > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > > > > > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > > > > > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > > > > > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > > > > > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > > > > > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > > > > > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > > > > > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > > > > > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > > > > > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > > > > > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > > > > > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > > > > > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > > > > > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > > > > > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > > > > > >[  243.686572] IRQMASK: 0
> > > > > > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > > > > > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > > > > > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > > > > > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > > > > > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > > > > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > > > > > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > > > > > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > > > > > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > > > > > >[  243.686691] LR [0000000000000000] 0x0
> > > > > > >[  243.686698] --- interrupt: 3000
> > > > > > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > > > > > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > > > > > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > > > > > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > > > > > >[  243.686758] Call Trace:
> > > > > > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > > > > > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > > > > > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > > > > > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > > > > > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > > > > > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > > > > > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > > > > > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > > > > > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > > > > > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > >[  366.566329] Call Trace:
> > > > > > >...
> > > > > > >
> > > > > > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > > > > > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > > > > > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > > > > > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > > > > > >thread is created, so sdp->work will be never executed and
> > > > > > >__synchronize_srcu() can not be completed.
> > > > > > >
> > > > > > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > > > > > >
> > > > > > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > > > > > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > > > > > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > > > > > >Cc: Josh Triplett <josh@joshtriplett.org>
> > > > > > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > > > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > > > > > >To: rcu@vger.kernel.org
> > > > > > >---
> > > > > > > kernel/rcu/srcutree.c | 10 ++++++----
> > > > > > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > > > > > >
> > > > > > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > > > > >index 1c304fec89c0..f2df561e0995 100644
> > > > > > >--- a/kernel/rcu/srcutree.c
> > > > > > >+++ b/kernel/rcu/srcutree.c
> > > > > > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > > > > > >       int state;
> > > > > > >
> > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > > >       else
> > > > > > >               sdp = this_cpu_ptr(ssp->sda);
> > > > > > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > > > > > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > > > > > >       /* Initiate callback invocation as needed. */
> > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > > > > > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > > > > > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > > > > > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > > > > > >       } else {
> > > > > > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > > > > > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > > > > > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > >
> > > > > > Hi Pingfan
> > > > > >
> > > > > > If CPU0 support hotplug.
> > > > > > For example , in this time cpumask_first(cpu_online_mask) return 0
> > > > > >
> > > > > > >       else
> > > > > > >               sdp = raw_cpu_ptr(ssp->sda);
> > > > > > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > > > > > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > > > > > >
> > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > > > > > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > > > > > >+                                      cpumask_first(cpu_online_mask)));
> > > > > >
> > > > > > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > > > > > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> > > > > 
> > > > > Thanks for catching this bug.
> > > > 
> > > > Indeed, good catch!
> > > > 
> > > > > It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> > > > > wait for more comments and send out V2.
> > > > 
> > > > Does your patch require the CPU indicated by get_boot_cpu_id() to always
> > > > stay online?  If so, is that CPU guaranteed to always stay online?
> > > 
> > > No, it is just a replacement for cpu 0.
> > > 
> > > A qualified candidate should meet two requirements:
> > > -1. worker_pool is initialized, so it has worker thread to dispatch
> > > work. This is satisfied by once the cpu was onlined.  
> > > 
> > > -2. this cpu id is persistent as Zqiang pointed out.
> > > 
> > > For hotplug, it is the same as cpu 0, which is taken care of by the
> > > workqueue infrastructure.
> > > 
> > > > I am still asking my question about PowerPC booting with a non-zero CPU.
> > > > After all, if all remaining architectures always boot with CPU 0, then
> > > > perhaps some simplification is in order.
> > > 
> > > As my understanding, it is not easy. Let us unfold the discussion in
> > > another thread post by you.
> > 
> > As you wish...
> > 
> > First, let me see if I understand the options.
> > 
> > 1.	Take your initial patch, but fixing the issue pointed out by
> > 	Zhang Qiang.  One issue is that cpu_online_mask is not initialized
> > 	until boot_cpu_init() is invoked.  Given that SRCU is used by
> > 	tracing, we must correctly handle early boot usage.
> 
> The hotplug issue takes some effort to fix, so if there is a better way
> to go, it should be avoided.
> 
> But you make me aware that I miss the early boot usage.
> I think it carefully and hope I can learn from this lesson. Please
> correct me if I am wrong in the following assumption.
> 
> Since ftrace_init() comes after srcu_init(), so before srcu_init(), only
> _mcount has the opportunity to use SRCU, right?
> 
> If that part does not use the SRCU function like srcu_barrier(), it will
> not a problem. Right?

We would need to ask the ftrace and bpf people to make sure.  Except that
in the past, my fond assumptions about things not happening before
some point in boot have almost always been rudely invalidated, just so
you know.  ;-)

> > 2.	Update your initial patch to fix the issue pointed out by Zhang
> > 	Qiang and also to use get_boot_cpu_id() as you suggested earlier.
> > 	Except that the __boot_cpu_id variable read by get_boot_cpu_id()
> > 	is also initialized by boot_cpu_init(), and is zero up to
> > 	that point.  So this has the same problem as #1 above.
> 
> This is the way that I want to take.
> 
> On the PowerPC, #define raw_smp_processor_id() (local_paca->paca_index),
> instead of (*raw_cpu_ptr(&cpu_number)).
> So in fact, boot_cpu_init()->smp_processor_id() gets the "boot_cpuid",
> which is assigned to __boot_cpu_id() later.

I have to ask...  What does "boot_cpu_init()->smp_processor_id()" mean?

> For boot_cpuid, early_init_devtree()->early_init_dt_scan_cpus() decides
> it. Then early_init_devtree()->allocate_paca(boot_cpuid)->initialise_paca(),
> where new_paca->paca_index = cpu;
> 
> All these are done before start_kernel().
> 
> I have tested a patch adopting this method. It works.

Could you please send the actual patch?

> > 3.	As in #2 above, but modify srcu_init() to check for SRCU callbacks
> > 	queued on CPU 0 when boot_cpu_init() returns non-zero.  In this
> > 	case, move those SRCU callbacks to the srcu_data structure
> > 	indicated by boot_cpu_init().
> > 
> > 4.	Leave the SRCU code alone, but boot your crash kernel with
> > 	srcutree.big_cpu_lim=0.  This would switch all srcu_struct
> > 	structures to big mode at initialization time.	Once this switch
> > 	completed, CPUs would queue SRCU callbacks on their own srcu_data
> > 	structure's ->srcu_cblist.
> > 
> > 	I am not optimistic about this one, but could you please try it?
> 
> I have tested it five times. It works.

Given that it seems to work, I find this approach extremely attractive.
Hence my question about CPU 0's per-CPU variables in my recent email to
Michael Ellerman.

> > 5.	Start over with the current SRCU code.	Make srcu_delay_timer()
> > 	and srcu_queue_delayed_work_on() check to see if sdp->cpu is
> > 	online, and if not, use queue_work() instead of queue_work_on().
> > 	There might be other adjustments required, but I am not
> > 	immediately seeing them.
> > 
> > At the moment, #5 looks by far the best to me.
> 
> The boot cpu is guaranteed to be online once. That condition relaxes the
> code. But if you do not like #2, I am happy to have a shot on #5.

I need to see the patch for your #2 before I make any decision between
#2 and #5.  But #4, if it works for real rather than by accident, looks
even better.

> Thanks for your detailed suggestion.

And thank you for finding this issue and your continued attention to it.

							Thanx, Paul

> Regards,
> 
> 	Pingfan
> 
> 
> > But first, are there any options that I am missing?  Or, for that matter,
> > am I confused about any of the options that I called out?
> > 
> > 							Thanx, Paul
Pingfan Liu Oct. 31, 2022, 1:45 a.m. UTC | #12
On Fri, Oct 28, 2022 at 11:13:05AM -0700, Paul E. McKenney wrote:
> On Fri, Oct 28, 2022 at 06:23:46PM +0800, Pingfan Liu wrote:
> > Some stuff is PowerPC specific, CC PowerPC experts, please correct me if
> > I make a mistake.
> > 
> > On Thu, Oct 27, 2022 at 09:52:00AM -0700, Paul E. McKenney wrote:
> > > On Thu, Oct 27, 2022 at 11:03:53AM +0800, Pingfan Liu wrote:
> > > > On Wed, Oct 26, 2022 at 06:55:52AM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> > > > > > On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> > > > > > >
> > > > > > > >
> > > > > > > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > > > > > > >snp_node array") assumes that cpu 0 is always online, but that is not
> > > > > > > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > > > > > > >
> > > > > > > >On a PowerPC, the following kdump kernel hanging is observed:
> > > > > > > >...
> > > > > > > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > > > > > > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > > > > > > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > > > > > > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > > >[  243.686296] Call Trace:
> > > > > > > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > > > > > > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > > > > > > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > > > > > > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > > > > > > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > > > > > > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > > > > > > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > > > > > > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > > > > > > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > > > > > > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > > > > > > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > > > > > > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > > > > > > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > > > > > > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > > > > > > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > > > > > > >[  243.686572] IRQMASK: 0
> > > > > > > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > > > > > > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > > > > > > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > > > > > > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > > > > > > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > > > > > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > > > > > > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > > > > > > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > > > > > > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > > > > > > >[  243.686691] LR [0000000000000000] 0x0
> > > > > > > >[  243.686698] --- interrupt: 3000
> > > > > > > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > > > > > > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > > > > > > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > > > > > > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > > > > > > >[  243.686758] Call Trace:
> > > > > > > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > > > > > > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > > > > > > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > > > > > > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > > > > > > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > > > > > > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > > > > > > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > > > > > > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > > > > > > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > > > > > > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > > >[  366.566329] Call Trace:
> > > > > > > >...
> > > > > > > >
> > > > > > > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > > > > > > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > > > > > > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > > > > > > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > > > > > > >thread is created, so sdp->work will be never executed and
> > > > > > > >__synchronize_srcu() can not be completed.
> > > > > > > >
> > > > > > > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > > > > > > >
> > > > > > > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > > > > > > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > > > > > > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > > > > > > >Cc: Josh Triplett <josh@joshtriplett.org>
> > > > > > > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > > > > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > > > > > > >To: rcu@vger.kernel.org
> > > > > > > >---
> > > > > > > > kernel/rcu/srcutree.c | 10 ++++++----
> > > > > > > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > > > > > > >
> > > > > > > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > > > > > >index 1c304fec89c0..f2df561e0995 100644
> > > > > > > >--- a/kernel/rcu/srcutree.c
> > > > > > > >+++ b/kernel/rcu/srcutree.c
> > > > > > > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > > > > > > >       int state;
> > > > > > > >
> > > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > > > >       else
> > > > > > > >               sdp = this_cpu_ptr(ssp->sda);
> > > > > > > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > > > > > > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > > > > > > >       /* Initiate callback invocation as needed. */
> > > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > > > > > > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > > > > > > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > > > > > > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > > > > > > >       } else {
> > > > > > > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > > > > > > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > > > > > > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > > >
> > > > > > > Hi Pingfan
> > > > > > >
> > > > > > > If CPU0 support hotplug.
> > > > > > > For example , in this time cpumask_first(cpu_online_mask) return 0
> > > > > > >
> > > > > > > >       else
> > > > > > > >               sdp = raw_cpu_ptr(ssp->sda);
> > > > > > > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > > > > > > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > > > > > > >
> > > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > > > > > > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > > > > > > >+                                      cpumask_first(cpu_online_mask)));
> > > > > > >
> > > > > > > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > > > > > > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> > > > > > 
> > > > > > Thanks for catching this bug.
> > > > > 
> > > > > Indeed, good catch!
> > > > > 
> > > > > > It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> > > > > > wait for more comments and send out V2.
> > > > > 
> > > > > Does your patch require the CPU indicated by get_boot_cpu_id() to always
> > > > > stay online?  If so, is that CPU guaranteed to always stay online?
> > > > 
> > > > No, it is just a replacement for cpu 0.
> > > > 
> > > > A qualified candidate should meet two requirements:
> > > > -1. worker_pool is initialized, so it has worker thread to dispatch
> > > > work. This is satisfied by once the cpu was onlined.  
> > > > 
> > > > -2. this cpu id is persistent as Zqiang pointed out.
> > > > 
> > > > For hotplug, it is the same as cpu 0, which is taken care of by the
> > > > workqueue infrastructure.
> > > > 
> > > > > I am still asking my question about PowerPC booting with a non-zero CPU.
> > > > > After all, if all remaining architectures always boot with CPU 0, then
> > > > > perhaps some simplification is in order.
> > > > 
> > > > As my understanding, it is not easy. Let us unfold the discussion in
> > > > another thread post by you.
> > > 
> > > As you wish...
> > > 
> > > First, let me see if I understand the options.
> > > 
> > > 1.	Take your initial patch, but fixing the issue pointed out by
> > > 	Zhang Qiang.  One issue is that cpu_online_mask is not initialized
> > > 	until boot_cpu_init() is invoked.  Given that SRCU is used by
> > > 	tracing, we must correctly handle early boot usage.
> > 
> > The hotplug issue takes some effort to fix, so if there is a better way
> > to go, it should be avoided.
> > 
> > But you make me aware that I miss the early boot usage.
> > I think it carefully and hope I can learn from this lesson. Please
> > correct me if I am wrong in the following assumption.
> > 
> > Since ftrace_init() comes after srcu_init(), so before srcu_init(), only
> > _mcount has the opportunity to use SRCU, right?
> > 
> > If that part does not use the SRCU function like srcu_barrier(), it will
> > not a problem. Right?
> 
> We would need to ask the ftrace and bpf people to make sure.  Except that
> in the past, my fond assumptions about things not happening before
> some point in boot have almost always been rudely invalidated, just so
> you know.  ;-)
> 
> > > 2.	Update your initial patch to fix the issue pointed out by Zhang
> > > 	Qiang and also to use get_boot_cpu_id() as you suggested earlier.
> > > 	Except that the __boot_cpu_id variable read by get_boot_cpu_id()
> > > 	is also initialized by boot_cpu_init(), and is zero up to
> > > 	that point.  So this has the same problem as #1 above.
> > 
> > This is the way that I want to take.
> > 
> > On the PowerPC, #define raw_smp_processor_id() (local_paca->paca_index),
> > instead of (*raw_cpu_ptr(&cpu_number)).
> > So in fact, boot_cpu_init()->smp_processor_id() gets the "boot_cpuid",
> > which is assigned to __boot_cpu_id() later.
> 
> I have to ask...  What does "boot_cpu_init()->smp_processor_id()" mean?
> 

void __init boot_cpu_init(void)
{
        int cpu = smp_processor_id();
                  ^^^ This is specific on PowerPC, which is finally implemented by "#define raw_smp_processor_id() (local_paca->paca_index)"
		      And it is not zero up at this point. For the initialization of ->paca_index, please refer to the old comment just right after this function.


        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
        set_cpu_online(cpu, true);
        set_cpu_active(cpu, true);
        set_cpu_present(cpu, true);
        set_cpu_possible(cpu, true);

#ifdef CONFIG_SMP
        __boot_cpu_id = cpu;
	                ^^^ Actually, cpu will not be zero up
#endif
}


> > For boot_cpuid, early_init_devtree()->early_init_dt_scan_cpus() decides
> > it. Then early_init_devtree()->allocate_paca(boot_cpuid)->initialise_paca(),
> > where new_paca->paca_index = cpu;
> > 
> > All these are done before start_kernel().
> > 

More accurate info:
arch/powerpc/kernel/head_64.S:973:   LOAD_REG_ADDR(r12, DOTSYM(early_setup))  results in the calling to early_setup()
->early_init_devtree(), which finally sets up the local_paca->paca_index

Then 
arch/powerpc/kernel/head_64.S:1002:  bl      start_kernel

So before the common kernel starts, PowerPC has made its effort so that
get_boot_cpu_id() can keep its semantic.

> > I have tested a patch adopting this method. It works.
> 
> Could you please send the actual patch?
> 

Sure, I will sent out it right after finishing this mail.

> > > 3.	As in #2 above, but modify srcu_init() to check for SRCU callbacks
> > > 	queued on CPU 0 when boot_cpu_init() returns non-zero.  In this
> > > 	case, move those SRCU callbacks to the srcu_data structure
> > > 	indicated by boot_cpu_init().
> > > 
> > > 4.	Leave the SRCU code alone, but boot your crash kernel with
> > > 	srcutree.big_cpu_lim=0.  This would switch all srcu_struct
> > > 	structures to big mode at initialization time.	Once this switch
> > > 	completed, CPUs would queue SRCU callbacks on their own srcu_data
> > > 	structure's ->srcu_cblist.
> > > 
> > > 	I am not optimistic about this one, but could you please try it?
> > 
> > I have tested it five times. It works.
> 
> Given that it seems to work, I find this approach extremely attractive.
> Hence my question about CPU 0's per-CPU variables in my recent email to
> Michael Ellerman.
> 

Sorry that I can not get your point. Are you suggesting to append an extra
kernel param to the kdump kernel?


Thanks,

	Pingfan


> > > 5.	Start over with the current SRCU code.	Make srcu_delay_timer()
> > > 	and srcu_queue_delayed_work_on() check to see if sdp->cpu is
> > > 	online, and if not, use queue_work() instead of queue_work_on().
> > > 	There might be other adjustments required, but I am not
> > > 	immediately seeing them.
> > > 
> > > At the moment, #5 looks by far the best to me.
> > 
> > The boot cpu is guaranteed to be online once. That condition relaxes the
> > code. But if you do not like #2, I am happy to have a shot on #5.
> 
> I need to see the patch for your #2 before I make any decision between
> #2 and #5.  But #4, if it works for real rather than by accident, looks
> even better.
> 
> > Thanks for your detailed suggestion.
> 
> And thank you for finding this issue and your continued attention to it.
> 
> 							Thanx, Paul
> 
> > Regards,
> > 
> > 	Pingfan
> > 
> > 
> > > But first, are there any options that I am missing?  Or, for that matter,
> > > am I confused about any of the options that I called out?
> > > 
> > > 							Thanx, Paul
Zqiang Nov. 2, 2022, 6:07 a.m. UTC | #13
On Fri, Oct 28, 2022 at 11:13:05AM -0700, Paul E. McKenney wrote:
> On Fri, Oct 28, 2022 at 06:23:46PM +0800, Pingfan Liu wrote:
> > Some stuff is PowerPC specific, CC PowerPC experts, please correct me if
> > I make a mistake.
> > 
> > On Thu, Oct 27, 2022 at 09:52:00AM -0700, Paul E. McKenney wrote:
> > > On Thu, Oct 27, 2022 at 11:03:53AM +0800, Pingfan Liu wrote:
> > > > On Wed, Oct 26, 2022 at 06:55:52AM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> > > > > > On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> > > > > > >
> > > > > > > >
> > > > > > > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > > > > > > >snp_node array") assumes that cpu 0 is always online, but that is not
> > > > > > > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > > > > > > >
> > > > > > > >On a PowerPC, the following kdump kernel hanging is observed:
> > > > > > > >...
> > > > > > > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > > > > > > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > > > > > > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > > > > > > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > > >[  243.686296] Call Trace:
> > > > > > > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > > > > > > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > > > > > > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > > > > > > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > > > > > > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > > > > > > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > > > > > > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > > > > > > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > > > > > > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > > > > > > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > > > > > > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > > > > > > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > > > > > > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > > > > > > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > > > > > > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > > > > > > >[  243.686572] IRQMASK: 0
> > > > > > > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > > > > > > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > > > > > > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > > > > > > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > > > > > > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > > > > > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > > > > > > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > > > > > > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > > > > > > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > > > > > > >[  243.686691] LR [0000000000000000] 0x0
> > > > > > > >[  243.686698] --- interrupt: 3000
> > > > > > > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > > > > > > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > > > > > > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > > > > > > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > > > > > > >[  243.686758] Call Trace:
> > > > > > > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > > > > > > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > > > > > > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > > > > > > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > > > > > > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > > > > > > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > > > > > > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > > > > > > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > > > > > > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > > > > > > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > > >[  366.566329] Call Trace:
> > > > > > > >...
> > > > > > > >
> > > > > > > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > > > > > > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > > > > > > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > > > > > > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > > > > > > >thread is created, so sdp->work will be never executed and
> > > > > > > >__synchronize_srcu() can not be completed.
> > > > > > > >
> > > > > > > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > > > > > > >
> > > > > > > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > > > > > > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > > > > > > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > > > > > > >Cc: Josh Triplett <josh@joshtriplett.org>
> > > > > > > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > > > > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > > > > > > >To: rcu@vger.kernel.org
> > > > > > > >---
> > > > > > > > kernel/rcu/srcutree.c | 10 ++++++----
> > > > > > > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > > > > > > >
> > > > > > > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > > > > > >index 1c304fec89c0..f2df561e0995 100644
> > > > > > > >--- a/kernel/rcu/srcutree.c
> > > > > > > >+++ b/kernel/rcu/srcutree.c
> > > > > > > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > > > > > > >       int state;
> > > > > > > >
> > > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > > > >       else
> > > > > > > >               sdp = this_cpu_ptr(ssp->sda);
> > > > > > > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > > > > > > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > > > > > > >       /* Initiate callback invocation as needed. */
> > > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > > > > > > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > > > > > > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > > > > > > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > > > > > > >       } else {
> > > > > > > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > > > > > > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > > > > > > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > > >
> > > > > > > Hi Pingfan
> > > > > > >
> > > > > > > If CPU0 support hotplug.
> > > > > > > For example , in this time cpumask_first(cpu_online_mask) return 0
> > > > > > >
> > > > > > > >       else
> > > > > > > >               sdp = raw_cpu_ptr(ssp->sda);
> > > > > > > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > > > > > > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > > > > > > >
> > > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > > > > > > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > > > > > > >+                                      cpumask_first(cpu_online_mask)));
> > > > > > >
> > > > > > > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > > > > > > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> > > > > > 
> > > > > > Thanks for catching this bug.
> > > > > 
> > > > > Indeed, good catch!
> > > > > 
> > > > > > It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> > > > > > wait for more comments and send out V2.
> > > > > 
> > > > > Does your patch require the CPU indicated by get_boot_cpu_id() to always
> > > > > stay online?  If so, is that CPU guaranteed to always stay online?
> > > > 
> > > > No, it is just a replacement for cpu 0.
> > > > 
> > > > A qualified candidate should meet two requirements:
> > > > -1. worker_pool is initialized, so it has worker thread to dispatch
> > > > work. This is satisfied by once the cpu was onlined.  
> > > > 
> > > > -2. this cpu id is persistent as Zqiang pointed out.
> > > > 
> > > > For hotplug, it is the same as cpu 0, which is taken care of by the
> > > > workqueue infrastructure.
> > > > 
> > > > > I am still asking my question about PowerPC booting with a non-zero CPU.
> > > > > After all, if all remaining architectures always boot with CPU 0, then
> > > > > perhaps some simplification is in order.
> > > > 
> > > > As my understanding, it is not easy. Let us unfold the discussion in
> > > > another thread post by you.
> > > 
> > > As you wish...
> > > 
> > > First, let me see if I understand the options.
> > > 
> > > 1.	Take your initial patch, but fixing the issue pointed out by
> > > 	Zhang Qiang.  One issue is that cpu_online_mask is not initialized
> > > 	until boot_cpu_init() is invoked.  Given that SRCU is used by
> > > 	tracing, we must correctly handle early boot usage.
> > 
> > The hotplug issue takes some effort to fix, so if there is a better way
> > to go, it should be avoided.
> > 
> > But you make me aware that I miss the early boot usage.
> > I think it carefully and hope I can learn from this lesson. Please
> > correct me if I am wrong in the following assumption.
> > 
> > Since ftrace_init() comes after srcu_init(), so before srcu_init(), only
> > _mcount has the opportunity to use SRCU, right?
> > 
> > If that part does not use the SRCU function like srcu_barrier(), it will
> > not a problem. Right?
> 
> We would need to ask the ftrace and bpf people to make sure.  Except that
> in the past, my fond assumptions about things not happening before
> some point in boot have almost always been rudely invalidated, just so
> you know.  ;-)
> 
> > > 2.	Update your initial patch to fix the issue pointed out by Zhang
> > > 	Qiang and also to use get_boot_cpu_id() as you suggested earlier.
> > > 	Except that the __boot_cpu_id variable read by get_boot_cpu_id()
> > > 	is also initialized by boot_cpu_init(), and is zero up to
> > > 	that point.  So this has the same problem as #1 above.
> > 
> > This is the way that I want to take.
> > 
> > On the PowerPC, #define raw_smp_processor_id() (local_paca->paca_index),
> > instead of (*raw_cpu_ptr(&cpu_number)).
> > So in fact, boot_cpu_init()->smp_processor_id() gets the "boot_cpuid",
> > which is assigned to __boot_cpu_id() later.
> 
> I have to ask...  What does "boot_cpu_init()->smp_processor_id()" mean?
> 
>
>void __init boot_cpu_init(void)
>{
>        int cpu = smp_processor_id();
>                  ^^^ This is specific on PowerPC, which is finally implemented by "#define raw_smp_processor_id() (local_paca->paca_index)"
>		      And it is not zero up at this point. For the initialization of ->paca_index, please refer to the old comment just right after this function.
>
>
>        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
>        set_cpu_online(cpu, true);
>        set_cpu_active(cpu, true);
>        set_cpu_present(cpu, true);
>        set_cpu_possible(cpu, true);
>
>#ifdef CONFIG_SMP
>        __boot_cpu_id = cpu;
>	                ^^^ Actually, cpu will not be zero up
>#endif
>}
>

> > For boot_cpuid, early_init_devtree()->early_init_dt_scan_cpus() decides
> > it. Then early_init_devtree()->allocate_paca(boot_cpuid)->initialise_paca(),
> > where new_paca->paca_index = cpu;
> > 
> > All these are done before start_kernel().
> > 
>
>More accurate info:
>arch/powerpc/kernel/head_64.S:973:   LOAD_REG_ADDR(r12, DOTSYM(early_setup))  results in the calling to early_setup()
->early_init_devtree(), which finally sets up the local_paca->paca_index
>
>Then 
>arch/powerpc/kernel/head_64.S:1002:  bl      start_kernel
>
>So before the common kernel starts, PowerPC has made its effort so that
>get_boot_cpu_id() can keep its semantic.
>
> > I have tested a patch adopting this method. It works.
> 
> Could you please send the actual patch?
> 
>
>Sure, I will sent out it right after finishing this mail.
>
> > > 3.	As in #2 above, but modify srcu_init() to check for SRCU callbacks
> > > 	queued on CPU 0 when boot_cpu_init() returns non-zero.  In this
> > > 	case, move those SRCU callbacks to the srcu_data structure
> > > 	indicated by boot_cpu_init().
> > > 
> > > 4.	Leave the SRCU code alone, but boot your crash kernel with
> > > 	srcutree.big_cpu_lim=0.  This would switch all srcu_struct
> > > 	structures to big mode at initialization time.	Once this switch
> > > 	completed, CPUs would queue SRCU callbacks on their own srcu_data
> > > 	structure's ->srcu_cblist.
> > > 
> > > 	I am not optimistic about this one, but could you please try it?
> > 
> > I have tested it five times. It works.
> 
> Given that it seems to work, I find this approach extremely attractive.
> Hence my question about CPU 0's per-CPU variables in my recent email to
> Michael Ellerman.
> 
>
>Sorry that I can not get your point. Are you suggesting to append an extra
>kernel param to the kdump kernel?
>
>
>Thanks,
>
>	Pingfan
>
>
> > > 5.	Start over with the current SRCU code.	Make srcu_delay_timer()
> > > 	and srcu_queue_delayed_work_on() check to see if sdp->cpu is
> > > 	online, and if not, use queue_work() instead of queue_work_on().
> > > 	There might be other adjustments required, but I am not
> > > 	immediately seeing them.
> > > 

Hi Pingfan, does this change look clearer?

--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -745,19 +745,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)
        WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
 }

+static inline int sdp_cpu(struct srcu_data *sdp)
+{
+       return cpu_online(sdp->cpu) ? sdp->cpu : raw_smp_processor_id();
+}

 static void srcu_delay_timer(struct timer_list *t)
 {
        struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);

-       queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
+       queue_work_on(sdp_cpu(sdp), rcu_gp_wq, &sdp->work);
 }

 static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
                                       unsigned long delay)
 {
        if (!delay) {
-               queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
+               queue_work_on(sdp_cpu(sdp), rcu_gp_wq, &sdp->work);
                return;
        }

Thanks
Zqiang

> > > At the moment, #5 looks by far the best to me.
> > 
> > The boot cpu is guaranteed to be online once. That condition relaxes the
> > code. But if you do not like #2, I am happy to have a shot on #5.
> 
> I need to see the patch for your #2 before I make any decision between
> #2 and #5.  But #4, if it works for real rather than by accident, looks
> even better.
> 
> > Thanks for your detailed suggestion.
> 
> And thank you for finding this issue and your continued attention to it.
> 
> 							Thanx, Paul
> 
> > Regards,
> > 
> > 	Pingfan
> > 
> > 
> > > But first, are there any options that I am missing?  Or, for that matter,
> > > am I confused about any of the options that I called out?
> > > 
> > > 							Thanx, Paul
Pingfan Liu Nov. 2, 2022, 2:06 p.m. UTC | #14
Hi Zqiang,

On Wed, Nov 2, 2022 at 2:07 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
>
> On Fri, Oct 28, 2022 at 11:13:05AM -0700, Paul E. McKenney wrote:
> > On Fri, Oct 28, 2022 at 06:23:46PM +0800, Pingfan Liu wrote:
> > > Some stuff is PowerPC specific, CC PowerPC experts, please correct me if
> > > I make a mistake.
> > >
> > > On Thu, Oct 27, 2022 at 09:52:00AM -0700, Paul E. McKenney wrote:
> > > > On Thu, Oct 27, 2022 at 11:03:53AM +0800, Pingfan Liu wrote:
> > > > > On Wed, Oct 26, 2022 at 06:55:52AM -0700, Paul E. McKenney wrote:
> > > > > > On Wed, Oct 26, 2022 at 04:21:26PM +0800, Pingfan Liu wrote:
> > > > > > > On Wed, Oct 26, 2022 at 12:36 PM Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
> > > > > > > >
> > > > > > > > >
> > > > > > > > >commit 994f706872e6 ("srcu: Make Tree SRCU able to operate without
> > > > > > > > >snp_node array") assumes that cpu 0 is always online, but that is not
> > > > > > > > >the truth when using maxcpus=1 in the command line for the kdump kernel.
> > > > > > > > >
> > > > > > > > >On a PowerPC, the following kdump kernel hanging is observed:
> > > > > > > > >...
> > > > > > > > >[    1.740036] systemd[1]: Hostname set to <xyz.com>
> > > > > > > > >[  243.686240] INFO: task systemd:1 blocked for more than 122 seconds.
> > > > > > > > >[  243.686264]       Not tainted 6.1.0-rc1 #1
> > > > > > > > >[  243.686272] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > > >[  243.686281] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > > > >[  243.686296] Call Trace:
> > > > > > > > >[  243.686301] [c000000016657640] [c000000016657670] 0xc000000016657670 (unreliable)
> > > > > > > > >[  243.686317] [c000000016657830] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > > > >[  243.686333] [c000000016657890] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > > > >[  243.686347] [c000000016657940] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > > > >[  243.686361] [c0000000166579b0] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > > > >[  243.686374] [c000000016657a80] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > > > >[  243.686387] [c000000016657b20] [c000000010176bb0] __flush_work.isra.0+0x1c0/0x3d0
> > > > > > > > >[  243.686401] [c000000016657bb0] [c0000000105f2768] fsnotify_wait_marks_destroyed+0x28/0x40
> > > > > > > > >[  243.686415] [c000000016657bd0] [c0000000105f21b8] fsnotify_destroy_group+0x68/0x160
> > > > > > > > >[  243.686428] [c000000016657c40] [c0000000105f6500] inotify_release+0x30/0xa0
> > > > > > > > >[  243.686440] [c000000016657cb0] [c0000000105751a8] __fput+0xc8/0x350
> > > > > > > > >[  243.686452] [c000000016657d00] [c00000001017d524] task_work_run+0xe4/0x170
> > > > > > > > >[  243.686464] [c000000016657d50] [c000000010020e94] do_notify_resume+0x134/0x140
> > > > > > > > >[  243.686478] [c000000016657d80] [c00000001002eb18] interrupt_exit_user_prepare_main+0x198/0x270
> > > > > > > > >[  243.686493] [c000000016657de0] [c00000001002ec60] syscall_exit_prepare+0x70/0x180
> > > > > > > > >[  243.686505] [c000000016657e10] [c00000001000bf7c] system_call_vectored_common+0xfc/0x280
> > > > > > > > >[  243.686520] --- interrupt: 3000 at 0x7fffa47d5ba4
> > > > > > > > >[  243.686528] NIP:  00007fffa47d5ba4 LR: 0000000000000000 CTR: 0000000000000000
> > > > > > > > >[  243.686538] REGS: c000000016657e80 TRAP: 3000   Not tainted  (6.1.0-rc1)
> > > > > > > > >[  243.686548] MSR:  800000000000d033 <SF,EE,PR,ME,IR,DR,RI,LE>  CR: 42044440  XER: 00000000
> > > > > > > > >[  243.686572] IRQMASK: 0
> > > > > > > > >[  243.686572] GPR00: 0000000000000006 00007ffffa606710 00007fffa48e7200 0000000000000000
> > > > > > > > >[  243.686572] GPR04: 0000000000000002 000000000000000a 0000000000000000 0000000000000001
> > > > > > > > >[  243.686572] GPR08: 000001000c172dd0 0000000000000000 0000000000000000 0000000000000000
> > > > > > > > >[  243.686572] GPR12: 0000000000000000 00007fffa4ff4bc0 0000000000000000 0000000000000000
> > > > > > > > >[  243.686572] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > > > > > > >[  243.686572] GPR20: 0000000132dfdc50 000000000000000e 0000000000189375 0000000000000000
> > > > > > > > >[  243.686572] GPR24: 00007ffffa606ae0 0000000000000005 000001000c185490 000001000c172570
> > > > > > > > >[  243.686572] GPR28: 000001000c172990 000001000c184850 000001000c172e00 00007fffa4fedd98
> > > > > > > > >[  243.686683] NIP [00007fffa47d5ba4] 0x7fffa47d5ba4
> > > > > > > > >[  243.686691] LR [0000000000000000] 0x0
> > > > > > > > >[  243.686698] --- interrupt: 3000
> > > > > > > > >[  243.686708] INFO: task kworker/u16:1:24 blocked for more than 122 seconds.
> > > > > > > > >[  243.686717]       Not tainted 6.1.0-rc1 #1
> > > > > > > > >[  243.686724] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > > >[  243.686733] task:kworker/u16:1   state:D stack:0     pid:24    ppid:2      flags:0x00000800
> > > > > > > > >[  243.686747] Workqueue: events_unbound fsnotify_mark_destroy_workfn
> > > > > > > > >[  243.686758] Call Trace:
> > > > > > > > >[  243.686762] [c0000000166736e0] [c00000004fd91000] 0xc00000004fd91000 (unreliable)
> > > > > > > > >[  243.686775] [c0000000166738d0] [c00000001001dec0] __switch_to+0x130/0x220
> > > > > > > > >[  243.686788] [c000000016673930] [c000000010f607b8] __schedule+0x1f8/0x580
> > > > > > > > >[  243.686801] [c0000000166739e0] [c000000010f60bb4] schedule+0x74/0x140
> > > > > > > > >[  243.686814] [c000000016673a50] [c000000010f699b8] schedule_timeout+0x168/0x1c0
> > > > > > > > >[  243.686827] [c000000016673b20] [c000000010f61de8] __wait_for_common+0x148/0x360
> > > > > > > > >[  243.686840] [c000000016673bc0] [c000000010210840] __synchronize_srcu.part.0+0xa0/0xe0
> > > > > > > > >[  243.686855] [c000000016673c30] [c0000000105f2c64] fsnotify_mark_destroy_workfn+0xc4/0x1a0
> > > > > > > > >[  243.686868] [c000000016673ca0] [c000000010174ea8] process_one_work+0x2a8/0x570
> > > > > > > > >[  243.686882] [c000000016673d40] [c000000010175208] worker_thread+0x98/0x5e0
> > > > > > > > >[  243.686895] [c000000016673dc0] [c0000000101828d4] kthread+0x124/0x130
> > > > > > > > >[  243.686908] [c000000016673e10] [c00000001000cd40] ret_from_kernel_thread+0x5c/0x64
> > > > > > > > >[  366.566274] INFO: task systemd:1 blocked for more than 245 seconds.
> > > > > > > > >[  366.566298]       Not tainted 6.1.0-rc1 #1
> > > > > > > > >[  366.566305] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > > > > > >[  366.566314] task:systemd         state:D stack:0     pid:1     ppid:0      flags:0x00042000
> > > > > > > > >[  366.566329] Call Trace:
> > > > > > > > >...
> > > > > > > > >
> > > > > > > > >In that case, note that maxcpus=1 instead of nr_cpus=1 is used in the
> > > > > > > > >kernel command line on the PowerPC platform. Consequently, the crash cpu
> > > > > > > > >is the only onlined cpu in the kdump kernel, but with its logical id not
> > > > > > > > >necessary 0. While SRCU queues a sdp->work on cpu 0, on which no worker
> > > > > > > > >thread is created, so sdp->work will be never executed and
> > > > > > > > >__synchronize_srcu() can not be completed.
> > > > > > > > >
> > > > > > > > >Tackle this issue by queueing sdp->work on the first onlined cpu.
> > > > > > > > >
> > > > > > > > >Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > > > > > > > >Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > > > > > > > >Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > > > > > > > >Cc: Josh Triplett <josh@joshtriplett.org>
> > > > > > > > >Cc: Steven Rostedt <rostedt@goodmis.org>
> > > > > > > > >Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > > > > > > > >To: rcu@vger.kernel.org
> > > > > > > > >---
> > > > > > > > > kernel/rcu/srcutree.c | 10 ++++++----
> > > > > > > > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > > > > > > > >
> > > > > > > > >diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > > > > > > >index 1c304fec89c0..f2df561e0995 100644
> > > > > > > > >--- a/kernel/rcu/srcutree.c
> > > > > > > > >+++ b/kernel/rcu/srcutree.c
> > > > > > > > >@@ -663,7 +663,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
> > > > > > > > >       int state;
> > > > > > > > >
> > > > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > > > > >       else
> > > > > > > > >               sdp = this_cpu_ptr(ssp->sda);
> > > > > > > > >       lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
> > > > > > > > >@@ -774,7 +774,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
> > > > > > > > >       /* Initiate callback invocation as needed. */
> > > > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > > > >       if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
> > > > > > > > >-              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
> > > > > > > > >+              srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
> > > > > > > > >+                                      cpumask_first(cpu_online_mask)), cbdelay);
> > > > > > > > >       } else {
> > > > > > > > >               idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
> > > > > > > > >               srcu_for_each_node_breadth_first(ssp, snp) {
> > > > > > > > >@@ -1093,7 +1094,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
> > > > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > > > >       ss_state = smp_load_acquire(&ssp->srcu_size_state);
> > > > > > > > >       if (ss_state < SRCU_SIZE_WAIT_CALL)
> > > > > > > > >-              sdp = per_cpu_ptr(ssp->sda, 0);
> > > > > > > > >+              sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
> > > > > > > >
> > > > > > > > Hi Pingfan
> > > > > > > >
> > > > > > > > If CPU0 support hotplug.
> > > > > > > > For example , in this time cpumask_first(cpu_online_mask) return 0
> > > > > > > >
> > > > > > > > >       else
> > > > > > > > >               sdp = raw_cpu_ptr(ssp->sda);
> > > > > > > > >       spin_lock_irqsave_sdp_contention(sdp, &flags);
> > > > > > > > >@@ -1429,7 +1430,8 @@ void srcu_barrier(struct srcu_struct *ssp)
> > > > > > > > >
> > > > > > > > >       idx = srcu_read_lock(ssp);
> > > > > > > > >       if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
> > > > > > > > >-              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
> > > > > > > > >+              srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
> > > > > > > > >+                                      cpumask_first(cpu_online_mask)));
> > > > > > > >
> > > > > > > > Due to CPU hotplug, CPU0 have been offline, assume srcu_size_state is still SRCU_SIZE_WAIT_CALL.
> > > > > > > > in this time, cpumask_first(cpu_online_mask) not return 0,  this will cause the srcu barrier to return directly.
> > > > > > >
> > > > > > > Thanks for catching this bug.
> > > > > >
> > > > > > Indeed, good catch!
> > > > > >
> > > > > > > It seems that get_boot_cpu_id() can tackle the whole issue.  I will
> > > > > > > wait for more comments and send out V2.
> > > > > >
> > > > > > Does your patch require the CPU indicated by get_boot_cpu_id() to always
> > > > > > stay online?  If so, is that CPU guaranteed to always stay online?
> > > > >
> > > > > No, it is just a replacement for cpu 0.
> > > > >
> > > > > A qualified candidate should meet two requirements:
> > > > > -1. worker_pool is initialized, so it has worker thread to dispatch
> > > > > work. This is satisfied by once the cpu was onlined.
> > > > >
> > > > > -2. this cpu id is persistent as Zqiang pointed out.
> > > > >
> > > > > For hotplug, it is the same as cpu 0, which is taken care of by the
> > > > > workqueue infrastructure.
> > > > >
> > > > > > I am still asking my question about PowerPC booting with a non-zero CPU.
> > > > > > After all, if all remaining architectures always boot with CPU 0, then
> > > > > > perhaps some simplification is in order.
> > > > >
> > > > > As my understanding, it is not easy. Let us unfold the discussion in
> > > > > another thread post by you.
> > > >
> > > > As you wish...
> > > >
> > > > First, let me see if I understand the options.
> > > >
> > > > 1.        Take your initial patch, but fixing the issue pointed out by
> > > >   Zhang Qiang.  One issue is that cpu_online_mask is not initialized
> > > >   until boot_cpu_init() is invoked.  Given that SRCU is used by
> > > >   tracing, we must correctly handle early boot usage.
> > >
> > > The hotplug issue takes some effort to fix, so if there is a better way
> > > to go, it should be avoided.
> > >
> > > But you make me aware that I miss the early boot usage.
> > > I think it carefully and hope I can learn from this lesson. Please
> > > correct me if I am wrong in the following assumption.
> > >
> > > Since ftrace_init() comes after srcu_init(), so before srcu_init(), only
> > > _mcount has the opportunity to use SRCU, right?
> > >
> > > If that part does not use the SRCU function like srcu_barrier(), it will
> > > not a problem. Right?
> >
> > We would need to ask the ftrace and bpf people to make sure.  Except that
> > in the past, my fond assumptions about things not happening before
> > some point in boot have almost always been rudely invalidated, just so
> > you know.  ;-)
> >
> > > > 2.        Update your initial patch to fix the issue pointed out by Zhang
> > > >   Qiang and also to use get_boot_cpu_id() as you suggested earlier.
> > > >   Except that the __boot_cpu_id variable read by get_boot_cpu_id()
> > > >   is also initialized by boot_cpu_init(), and is zero up to
> > > >   that point.  So this has the same problem as #1 above.
> > >
> > > This is the way that I want to take.
> > >
> > > On the PowerPC, #define raw_smp_processor_id() (local_paca->paca_index),
> > > instead of (*raw_cpu_ptr(&cpu_number)).
> > > So in fact, boot_cpu_init()->smp_processor_id() gets the "boot_cpuid",
> > > which is assigned to __boot_cpu_id() later.
> >
> > I have to ask...  What does "boot_cpu_init()->smp_processor_id()" mean?
> >
> >
> >void __init boot_cpu_init(void)
> >{
> >        int cpu = smp_processor_id();
> >                  ^^^ This is specific on PowerPC, which is finally implemented by "#define raw_smp_processor_id() (local_paca->paca_index)"
> >                     And it is not zero up at this point. For the initialization of ->paca_index, please refer to the old comment just right after this function.
> >
> >
> >        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
> >        set_cpu_online(cpu, true);
> >        set_cpu_active(cpu, true);
> >        set_cpu_present(cpu, true);
> >        set_cpu_possible(cpu, true);
> >
> >#ifdef CONFIG_SMP
> >        __boot_cpu_id = cpu;
> >                       ^^^ Actually, cpu will not be zero up
> >#endif
> >}
> >
>
> > > For boot_cpuid, early_init_devtree()->early_init_dt_scan_cpus() decides
> > > it. Then early_init_devtree()->allocate_paca(boot_cpuid)->initialise_paca(),
> > > where new_paca->paca_index = cpu;
> > >
> > > All these are done before start_kernel().
> > >
> >
> >More accurate info:
> >arch/powerpc/kernel/head_64.S:973:   LOAD_REG_ADDR(r12, DOTSYM(early_setup))  results in the calling to early_setup()
> ->early_init_devtree(), which finally sets up the local_paca->paca_index
> >
> >Then
> >arch/powerpc/kernel/head_64.S:1002:  bl      start_kernel
> >
> >So before the common kernel starts, PowerPC has made its effort so that
> >get_boot_cpu_id() can keep its semantic.
> >
> > > I have tested a patch adopting this method. It works.
> >
> > Could you please send the actual patch?
> >
> >
> >Sure, I will sent out it right after finishing this mail.
> >
> > > > 3.        As in #2 above, but modify srcu_init() to check for SRCU callbacks
> > > >   queued on CPU 0 when boot_cpu_init() returns non-zero.  In this
> > > >   case, move those SRCU callbacks to the srcu_data structure
> > > >   indicated by boot_cpu_init().
> > > >
> > > > 4.        Leave the SRCU code alone, but boot your crash kernel with
> > > >   srcutree.big_cpu_lim=0.  This would switch all srcu_struct
> > > >   structures to big mode at initialization time.  Once this switch
> > > >   completed, CPUs would queue SRCU callbacks on their own srcu_data
> > > >   structure's ->srcu_cblist.
> > > >
> > > >   I am not optimistic about this one, but could you please try it?
> > >
> > > I have tested it five times. It works.
> >
> > Given that it seems to work, I find this approach extremely attractive.
> > Hence my question about CPU 0's per-CPU variables in my recent email to
> > Michael Ellerman.
> >
> >
> >Sorry that I can not get your point. Are you suggesting to append an extra
> >kernel param to the kdump kernel?
> >
> >
> >Thanks,
> >
> >       Pingfan
> >
> >
> > > > 5.        Start over with the current SRCU code.  Make srcu_delay_timer()
> > > >   and srcu_queue_delayed_work_on() check to see if sdp->cpu is
> > > >   online, and if not, use queue_work() instead of queue_work_on().
> > > >   There might be other adjustments required, but I am not
> > > >   immediately seeing them.
> > > >
>
> Hi Pingfan, does this change look clearer?
>
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -745,19 +745,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)
>         WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
>  }
>
> +static inline int sdp_cpu(struct srcu_data *sdp)
> +{
> +       return cpu_online(sdp->cpu) ? sdp->cpu : raw_smp_processor_id();
> +}
>
>  static void srcu_delay_timer(struct timer_list *t)
>  {
>         struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);
>
> -       queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
> +       queue_work_on(sdp_cpu(sdp), rcu_gp_wq, &sdp->work);
>  }
>
>  static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
>                                        unsigned long delay)
>  {
>         if (!delay) {
> -               queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
> +               queue_work_on(sdp_cpu(sdp), rcu_gp_wq, &sdp->work);

I think that in this way, srcu_barrier() can not work, since it
expects a fixed cpu. But now srcu_queue_delayed_work_on() queues work
on a dynamic cpu.

And I post V2: https://lore.kernel.org/all/20221031015237.10294-1-kernelfans@gmail.com/

It replaces cpu 0 with boot cpu id, so it meets both the fix and
once-onlined requirement.


Thanks,

    Pingfan
diff mbox series

Patch

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 1c304fec89c0..f2df561e0995 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -663,7 +663,7 @@  static void srcu_gp_start(struct srcu_struct *ssp)
 	int state;
 
 	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
-		sdp = per_cpu_ptr(ssp->sda, 0);
+		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
 	else
 		sdp = this_cpu_ptr(ssp->sda);
 	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
@@ -774,7 +774,8 @@  static void srcu_gp_end(struct srcu_struct *ssp)
 	/* Initiate callback invocation as needed. */
 	ss_state = smp_load_acquire(&ssp->srcu_size_state);
 	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
-		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
+		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda,
+					cpumask_first(cpu_online_mask)), cbdelay);
 	} else {
 		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
 		srcu_for_each_node_breadth_first(ssp, snp) {
@@ -1093,7 +1094,7 @@  static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	idx = srcu_read_lock(ssp);
 	ss_state = smp_load_acquire(&ssp->srcu_size_state);
 	if (ss_state < SRCU_SIZE_WAIT_CALL)
-		sdp = per_cpu_ptr(ssp->sda, 0);
+		sdp = per_cpu_ptr(ssp->sda, cpumask_first(cpu_online_mask));
 	else
 		sdp = raw_cpu_ptr(ssp->sda);
 	spin_lock_irqsave_sdp_contention(sdp, &flags);
@@ -1429,7 +1430,8 @@  void srcu_barrier(struct srcu_struct *ssp)
 
 	idx = srcu_read_lock(ssp);
 	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
-		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
+		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,
+					cpumask_first(cpu_online_mask)));
 	else
 		for_each_possible_cpu(cpu)
 			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));