diff mbox series

[v2] rcu: Dump memory object info if callback function is invalid

Message ID 20221111102822.224-1-thunder.leizhen@huawei.com (mailing list archive)
State New, archived
Headers show
Series [v2] rcu: Dump memory object info if callback function is invalid | expand

Commit Message

Leizhen (ThunderTown) Nov. 11, 2022, 10:28 a.m. UTC
When a structure containing an RCU callback rhp is (incorrectly) freed
and reallocated after rhp is passed to call_rcu(), it is not unusual for
rhp->func to be set to NULL. This defeats the debugging prints used by
__call_rcu_common() in kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y,
which expect to identify the offending code using the identity of this
function.

And in kernels built without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things
are even worse, as can be seen from this splat:

Unable to handle kernel NULL pointer dereference at virtual address 0
... ...
PC is at 0x0
LR is at rcu_do_batch+0x1c0/0x3b8
... ...
 (rcu_do_batch) from (rcu_core+0x1d4/0x284)
 (rcu_core) from (__do_softirq+0x24c/0x344)
 (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
 (__irq_exit_rcu) from (irq_exit+0x8/0x10)
 (irq_exit) from (__handle_domain_irq+0x74/0x9c)
 (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
 (gic_handle_irq) from (__irq_svc+0x5c/0x94)
 (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
 (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
 (default_idle_call) from (do_idle+0xf8/0x150)
 (do_idle) from (cpu_startup_entry+0x18/0x20)
 (cpu_startup_entry) from (0xc01530)

This commit therefore adds calls to mem_dump_obj(rhp) to output some
information, for example:

  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256

This provides the rough size of the memory block and the offset of the
rcu_head structure, which at least provides a few clues to help
locate the problem. If the problem is reproducible, additional slab
debugging can be enabled, for example, CONFIG_DEBUG_SLAB=y, which can
provide significantly more information.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu.h      | 7 +++++++
 kernel/rcu/srcutiny.c | 1 +
 kernel/rcu/srcutree.c | 1 +
 kernel/rcu/tasks.h    | 1 +
 kernel/rcu/tiny.c     | 1 +
 kernel/rcu/tree.c     | 1 +
 6 files changed, 12 insertions(+)

v1 --> v2:
1. Remove condition "(unsigned long)rhp->func & 0x3", it has problems on x86.
2. Paul E. McKenney helped me update the commit message, thanks.

Comments

Zqiang Nov. 11, 2022, 11:54 a.m. UTC | #1
>When a structure containing an RCU callback rhp is (incorrectly) freed
>and reallocated after rhp is passed to call_rcu(), it is not unusual for
>rhp->func to be set to NULL. This defeats the debugging prints used by
>__call_rcu_common() in kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y,
>which expect to identify the offending code using the identity of this
>function.
>
>And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things
>are even worse, as can be seen from this splat:
>
>Unable to handle kernel NULL pointer dereference at virtual address 0
>... ...
>PC is at 0x0
>LR is at rcu_do_batch+0x1c0/0x3b8
>... ...
> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> (rcu_core) from (__do_softirq+0x24c/0x344)
> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> (default_idle_call) from (do_idle+0xf8/0x150)
> (do_idle) from (cpu_startup_entry+0x18/0x20)
> (cpu_startup_entry) from (0xc01530)
>
>This commit therefore adds calls to mem_dump_obj(rhp) to output some
>information, for example:
>
>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>
>This provides the rough size of the memory block and the offset of the
>rcu_head structure, which as least provides at least a few clues to help
>locate the problem. If the problem is reproducible, additional slab
>debugging can be enabled, for example, CONFIG_DEBUG_SLAB=y, which can
>provide significantly more information.
>
>Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>---
> kernel/rcu/rcu.h      | 7 +++++++
> kernel/rcu/srcutiny.c | 1 +
> kernel/rcu/srcutree.c | 1 +
> kernel/rcu/tasks.h    | 1 +
> kernel/rcu/tiny.c     | 1 +
> kernel/rcu/tree.c     | 1 +
> 6 files changed, 12 insertions(+)
>
>v1 --> v2:
>1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
>2. Paul E. McKenney helped me update the commit message, thanks.
>

Hi, Zhen Lei

Maybe the following scenarios should be considered:

                CPU 0
tasks context
   spin_lock(&vmap_area_lock)
          Interrupt 
	 RCU softirq
	      rcu_do_batch
		mem_dump_obj
                                  vmalloc_dump_obj
                                       spin_lock(&vmap_area_lock)   <--  deadlock     

Thanks
Zqiang


>diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
>index 65704cbc9df7b3d..32ab45fabf8eebf 100644
>--- a/kernel/rcu/rcu.h
>+++ b/kernel/rcu/rcu.h
>@@ -10,6 +10,7 @@
> #ifndef __LINUX_RCU_H
> #define __LINUX_RCU_H
> 
>+#include <linux/mm.h>
> #include <trace/events/rcu.h>
> 
> /*
>@@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
> }
> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
> 
>+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
>+{
>+	if (unlikely(!rhp->func))
>+		mem_dump_obj(rhp);
>+}
>+
> extern int rcu_cpu_stall_suppress_at_boot;
> 
> static inline bool rcu_stall_is_suppressed_at_boot(void)
>diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
>index 33adafdad261389..5e7f336baa06ae0 100644
>--- a/kernel/rcu/srcutiny.c
>+++ b/kernel/rcu/srcutiny.c
>@@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
> 	while (lh) {
> 		rhp = lh;
> 		lh = lh->next;
>+		debug_rcu_head_callback(rhp);
> 		local_bh_disable();
> 		rhp->func(rhp);
> 		local_bh_enable();
>diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>index ca4b5dcec675bac..294972e66b31863 100644
>--- a/kernel/rcu/srcutree.c
>+++ b/kernel/rcu/srcutree.c
>@@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
> 	rhp = rcu_cblist_dequeue(&ready_cbs);
> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
> 		debug_rcu_head_unqueue(rhp);
>+		debug_rcu_head_callback(rhp);
> 		local_bh_disable();
> 		rhp->func(rhp);
> 		local_bh_enable();
>diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
>index b0b885e071fa8dc..b7f8c67c586cdc4 100644
>--- a/kernel/rcu/tasks.h
>+++ b/kernel/rcu/tasks.h
>@@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
> 	len = rcl.len;
> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
>+		debug_rcu_head_callback(rhp);
> 		local_bh_disable();
> 		rhp->func(rhp);
> 		local_bh_enable();
>diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
>index bb8f7d270f01747..56e9a5d91d97ec5 100644
>--- a/kernel/rcu/tiny.c
>+++ b/kernel/rcu/tiny.c
>@@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
> 
> 	trace_rcu_invoke_callback("", head);
> 	f = head->func;
>+	debug_rcu_head_callback(head);
> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
> 	f(head);
> 	rcu_lock_release(&rcu_callback_map);
>diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>index 15aaff3203bf2d0..ed93ddb8203d42c 100644
>--- a/kernel/rcu/tree.c
>+++ b/kernel/rcu/tree.c
>@@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
> 
> 		f = rhp->func;
>+		debug_rcu_head_callback(rhp);
> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
> 		f(rhp);
> 
>-- 
>2.25.1
Leizhen (ThunderTown) Nov. 11, 2022, 12:34 p.m. UTC | #2
On 2022/11/11 19:54, Zhang, Qiang1 wrote:
>> When a structure containing an RCU callback rhp is (incorrectly) freed
>> and reallocated after rhp is passed to call_rcu(), it is not unusual for
>> rhp->func to be set to NULL. This defeats the debugging prints used by
>> __call_rcu_common() in kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y,
>> which expect to identify the offending code using the identity of this
>> function.
>>
>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things
>> are even worse, as can be seen from this splat:
>>
>> Unable to handle kernel NULL pointer dereference at virtual address 0
>> ... ...
>> PC is at 0x0
>> LR is at rcu_do_batch+0x1c0/0x3b8
>> ... ...
>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
>> (rcu_core) from (__do_softirq+0x24c/0x344)
>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
>> (default_idle_call) from (do_idle+0xf8/0x150)
>> (do_idle) from (cpu_startup_entry+0x18/0x20)
>> (cpu_startup_entry) from (0xc01530)
>>
>> This commit therefore adds calls to mem_dump_obj(rhp) to output some
>> information, for example:
>>
>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>>
>> This provides the rough size of the memory block and the offset of the
>> rcu_head structure, which as least provides at least a few clues to help
>> locate the problem. If the problem is reproducible, additional slab
>> debugging can be enabled, for example, CONFIG_DEBUG_SLAB=y, which can
>> provide significantly more information.
>>
>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>> ---
>> kernel/rcu/rcu.h      | 7 +++++++
>> kernel/rcu/srcutiny.c | 1 +
>> kernel/rcu/srcutree.c | 1 +
>> kernel/rcu/tasks.h    | 1 +
>> kernel/rcu/tiny.c     | 1 +
>> kernel/rcu/tree.c     | 1 +
>> 6 files changed, 12 insertions(+)
>>
>> v1 --> v2:
>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
>> 2. Paul E. McKenney helped me update the commit message, thanks.
>>
> 
> Hi, Zhen Lei
> 
> Maybe the following scenarios should be considered:
> 
>                 CPU 0
> tasks context
>    spin_lock(&vmap_area_lock)
>           Interrupt 
> 	 RCU softirq
> 	      rcu_do_batch
> 		mem_dump_obj
>                                   vmalloc_dump_obj
>                                        spin_lock(&vmap_area_lock)   <--  deadlock     

Right, thanks. I just saw the robot's report. So this patch should be dropped.
I'll try to add a helper in mm, where I can check whether the lock has been
held, and dump the content of the memory object.

> 
> Thanks
> Zqiang
> 
> 
>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
>> index 65704cbc9df7b3d..32ab45fabf8eebf 100644
>> --- a/kernel/rcu/rcu.h
>> +++ b/kernel/rcu/rcu.h
>> @@ -10,6 +10,7 @@
>> #ifndef __LINUX_RCU_H
>> #define __LINUX_RCU_H
>>
>> +#include <linux/mm.h>
>> #include <trace/events/rcu.h>
>>
>> /*
>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
>> }
>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
>>
>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp)
>> +{
>> +	if (unlikely(!rhp->func))
>> +		mem_dump_obj(rhp);
>> +}
>> +
>> extern int rcu_cpu_stall_suppress_at_boot;
>>
>> static inline bool rcu_stall_is_suppressed_at_boot(void)
>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
>> index 33adafdad261389..5e7f336baa06ae0 100644
>> --- a/kernel/rcu/srcutiny.c
>> +++ b/kernel/rcu/srcutiny.c
>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
>> 	while (lh) {
>> 		rhp = lh;
>> 		lh = lh->next;
>> +		debug_rcu_head_callback(rhp);
>> 		local_bh_disable();
>> 		rhp->func(rhp);
>> 		local_bh_enable();
>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>> index ca4b5dcec675bac..294972e66b31863 100644
>> --- a/kernel/rcu/srcutree.c
>> +++ b/kernel/rcu/srcutree.c
>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
>> 		debug_rcu_head_unqueue(rhp);
>> +		debug_rcu_head_callback(rhp);
>> 		local_bh_disable();
>> 		rhp->func(rhp);
>> 		local_bh_enable();
>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
>> index b0b885e071fa8dc..b7f8c67c586cdc4 100644
>> --- a/kernel/rcu/tasks.h
>> +++ b/kernel/rcu/tasks.h
>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
>> 	len = rcl.len;
>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
>> +		debug_rcu_head_callback(rhp);
>> 		local_bh_disable();
>> 		rhp->func(rhp);
>> 		local_bh_enable();
>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
>> index bb8f7d270f01747..56e9a5d91d97ec5 100644
>> --- a/kernel/rcu/tiny.c
>> +++ b/kernel/rcu/tiny.c
>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
>>
>> 	trace_rcu_invoke_callback("", head);
>> 	f = head->func;
>> +	debug_rcu_head_callback(head);
>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
>> 	f(head);
>> 	rcu_lock_release(&rcu_callback_map);
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index 15aaff3203bf2d0..ed93ddb8203d42c 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
>>
>> 		f = rhp->func;
>> +		debug_rcu_head_callback(rhp);
>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
>> 		f(rhp);
>>
>> -- 
>> 2.25.1
> 
> .
>
Zqiang Nov. 11, 2022, 1:05 p.m. UTC | #3
On 2022/11/11 19:54, Zhang, Qiang1 wrote:
>> When a structure containing an RCU callback rhp is (incorrectly) 
>> freed and reallocated after rhp is passed to call_rcu(), it is not 
>> unusual for
>> rhp->func to be set to NULL. This defeats the debugging prints used 
>> rhp->by
>> __call_rcu_common() in kernels built with 
>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
>> offending code using the identity of this function.
>>
>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
>> are even worse, as can be seen from this splat:
>>
>> Unable to handle kernel NULL pointer dereference at virtual address 0 
>> ... ...
>> PC is at 0x0
>> LR is at rcu_do_batch+0x1c0/0x3b8
>> ... ...
>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
>> (rcu_core) from (__do_softirq+0x24c/0x344)
>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
>> (default_idle_call) from (do_idle+0xf8/0x150)
>> (do_idle) from (cpu_startup_entry+0x18/0x20)
>> (cpu_startup_entry) from (0xc01530)
>>
>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
>> information, for example:
>>
>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>>
>> This provides the rough size of the memory block and the offset of 
>> the rcu_head structure, which as least provides at least a few clues 
>> to help locate the problem. If the problem is reproducible, 
>> additional slab debugging can be enabled, for example, 
>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
>>
>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>> ---
>> kernel/rcu/rcu.h      | 7 +++++++
>> kernel/rcu/srcutiny.c | 1 +
>> kernel/rcu/srcutree.c | 1 +
>> kernel/rcu/tasks.h    | 1 +
>> kernel/rcu/tiny.c     | 1 +
>> kernel/rcu/tree.c     | 1 +
>> 6 files changed, 12 insertions(+)
>>
>> v1 --> v2:
>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
>> 2. Paul E. McKenney helped me update the commit message, thanks.
>>
> 
> Hi, Zhen Lei
> 
> Maybe the following scenarios should be considered:
> 
>                 CPU 0
> tasks context
>    spin_lock(&vmap_area_lock)
>           Interrupt 
> 	 RCU softirq
> 	      rcu_do_batch
> 		mem_dump_obj
>                                   vmalloc_dump_obj
>                                        spin_lock(&vmap_area_lock)   <--  deadlock     

>Right, thanks. I just saw the robot's report. So this patch should be dropped.
>I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.

This is a workaround, or maybe try a modification like the following, 
of course, need to ask Paul's opinion.

diff --git a/mm/util.c b/mm/util.c
index 12984e76767e..86da0739fe5d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
 {
        const char *type;

+       if (is_vmalloc_addr(object)) {
+               if (in_task() && vmalloc_dump_obj(object))
+                       return;
+               type = "vmalloc memory";
+               goto end;
+       }
+
        if (kmem_valid_obj(object)) {
                kmem_dump_obj(object);
                return;
        }

-       if (vmalloc_dump_obj(object))
-               return;
-
        if (virt_addr_valid(object))
                type = "non-slab/vmalloc memory";
        else if (object == NULL)
@@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
                type = "zero-size pointer";
        else
                type = "non-paged memory";
-
+end:
        pr_cont(" %s\n", type);
 }
 EXPORT_SYMBOL_GPL(mem_dump_obj);

Thanks
Zqiang


> 
> Thanks
> Zqiang
> 
> 
>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
>> --- a/kernel/rcu/rcu.h
>> +++ b/kernel/rcu/rcu.h
>> @@ -10,6 +10,7 @@
>> #ifndef __LINUX_RCU_H
>> #define __LINUX_RCU_H
>>
>> +#include <linux/mm.h>
>> #include <trace/events/rcu.h>
>>
>> /*
>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
>> rcu_head *head) }
>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
>>
>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
>> +	if (unlikely(!rhp->func))
>> +		mem_dump_obj(rhp);
>> +}
>> +
>> extern int rcu_cpu_stall_suppress_at_boot;
>>
>> static inline bool rcu_stall_is_suppressed_at_boot(void)
>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
>> 33adafdad261389..5e7f336baa06ae0 100644
>> --- a/kernel/rcu/srcutiny.c
>> +++ b/kernel/rcu/srcutiny.c
>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
>> 	while (lh) {
>> 		rhp = lh;
>> 		lh = lh->next;
>> +		debug_rcu_head_callback(rhp);
>> 		local_bh_disable();
>> 		rhp->func(rhp);
>> 		local_bh_enable();
>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
>> ca4b5dcec675bac..294972e66b31863 100644
>> --- a/kernel/rcu/srcutree.c
>> +++ b/kernel/rcu/srcutree.c
>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
>> 		debug_rcu_head_unqueue(rhp);
>> +		debug_rcu_head_callback(rhp);
>> 		local_bh_disable();
>> 		rhp->func(rhp);
>> 		local_bh_enable();
>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
>> --- a/kernel/rcu/tasks.h
>> +++ b/kernel/rcu/tasks.h
>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
>> 	len = rcl.len;
>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
>> rcu_cblist_dequeue(&rcl)) {
>> +		debug_rcu_head_callback(rhp);
>> 		local_bh_disable();
>> 		rhp->func(rhp);
>> 		local_bh_enable();
>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
>> bb8f7d270f01747..56e9a5d91d97ec5 100644
>> --- a/kernel/rcu/tiny.c
>> +++ b/kernel/rcu/tiny.c
>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
>> *head)
>>
>> 	trace_rcu_invoke_callback("", head);
>> 	f = head->func;
>> +	debug_rcu_head_callback(head);
>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
>> 	f(head);
>> 	rcu_lock_release(&rcu_callback_map);
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
>>
>> 		f = rhp->func;
>> +		debug_rcu_head_callback(rhp);
>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
>> 		f(rhp);
>>
>> --
>> 2.25.1
> 
> .
> 

--
Regards,
  Zhen Lei
Elliott, Robert (Servers) Nov. 11, 2022, 3:50 p.m. UTC | #4
> +static inline void debug_rcu_head_callback(struct rcu_head *rhp)
> +{
> +	if (unlikely(!rhp->func))
> +		mem_dump_obj(rhp);
> +}
> +

The mm/util.c definition of mem_dump_object() says:
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.

so this needs to call pr_alert() or pr_err() before that to explain what
is being printed (with no \n), like these:

kernel/rcu/rcutorture.c: pr_alert("mem_dump_obj(%px):", &rhp);
kernel/rcu/rcutorture.c: mem_dump_obj(&rhp);
...
kernel/rcu/tree.c: pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
kernel/rcu/tree.c: mem_dump_obj(head);
Paul E. McKenney Nov. 11, 2022, 6:42 p.m. UTC | #5
On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
> >> When a structure containing an RCU callback rhp is (incorrectly) 
> >> freed and reallocated after rhp is passed to call_rcu(), it is not 
> >> unusual for
> >> rhp->func to be set to NULL. This defeats the debugging prints used 
> >> rhp->by
> >> __call_rcu_common() in kernels built with 
> >> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
> >> offending code using the identity of this function.
> >>
> >> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
> >> are even worse, as can be seen from this splat:
> >>
> >> Unable to handle kernel NULL pointer dereference at virtual address 0 
> >> ... ...
> >> PC is at 0x0
> >> LR is at rcu_do_batch+0x1c0/0x3b8
> >> ... ...
> >> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> >> (rcu_core) from (__do_softirq+0x24c/0x344)
> >> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> >> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> >> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> >> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> >> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> >> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> >> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> >> (default_idle_call) from (do_idle+0xf8/0x150)
> >> (do_idle) from (cpu_startup_entry+0x18/0x20)
> >> (cpu_startup_entry) from (0xc01530)
> >>
> >> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
> >> information, for example:
> >>
> >>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
> >>
> >> This provides the rough size of the memory block and the offset of 
> >> the rcu_head structure, which as least provides at least a few clues 
> >> to help locate the problem. If the problem is reproducible, 
> >> additional slab debugging can be enabled, for example, 
> >> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
> >>
> >> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >> ---
> >> kernel/rcu/rcu.h      | 7 +++++++
> >> kernel/rcu/srcutiny.c | 1 +
> >> kernel/rcu/srcutree.c | 1 +
> >> kernel/rcu/tasks.h    | 1 +
> >> kernel/rcu/tiny.c     | 1 +
> >> kernel/rcu/tree.c     | 1 +
> >> 6 files changed, 12 insertions(+)
> >>
> >> v1 --> v2:
> >> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
> >> 2. Paul E. McKenney helped me update the commit message, thanks.
> >>
> > 
> > Hi, Zhen Lei
> > 
> > Maybe the following scenarios should be considered:
> > 
> >                 CPU 0
> > tasks context
> >    spin_lock(&vmap_area_lock)
> >           Interrupt 
> > 	 RCU softirq
> > 	      rcu_do_batch
> > 		mem_dump_obj
> >                                   vmalloc_dump_obj
> >                                        spin_lock(&vmap_area_lock)   <--  deadlock     
> 
> >Right, thanks. I just saw the robot's report. So this patch should be dropped.
> >I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
> 
> This is a workaround, or maybe try a modification like the following, 
> of course, need to ask Paul's opinion.

Another approach is to schedule a workqueue handler to do the
mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
environment.

This would allow vmalloc_dump_obj() to be called unconditionally.

Other thoughts?

							Thanx, Paul

> diff --git a/mm/util.c b/mm/util.c
> index 12984e76767e..86da0739fe5d 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
>  {
>         const char *type;
> 
> +       if (is_vmalloc_addr(object)) {
> +               if (in_task() && vmalloc_dump_obj(object))
> +                       return;
> +               type = "vmalloc memory";
> +               goto end;
> +       }
> +
>         if (kmem_valid_obj(object)) {
>                 kmem_dump_obj(object);
>                 return;
>         }
> 
> -       if (vmalloc_dump_obj(object))
> -               return;
> -
>         if (virt_addr_valid(object))
>                 type = "non-slab/vmalloc memory";
>         else if (object == NULL)
> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
>                 type = "zero-size pointer";
>         else
>                 type = "non-paged memory";
> -
> +end:
>         pr_cont(" %s\n", type);
>  }
>  EXPORT_SYMBOL_GPL(mem_dump_obj);
> 
> Thanks
> Zqiang
> 
> 
> > 
> > Thanks
> > Zqiang
> > 
> > 
> >> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
> >> 65704cbc9df7b3d..32ab45fabf8eebf 100644
> >> --- a/kernel/rcu/rcu.h
> >> +++ b/kernel/rcu/rcu.h
> >> @@ -10,6 +10,7 @@
> >> #ifndef __LINUX_RCU_H
> >> #define __LINUX_RCU_H
> >>
> >> +#include <linux/mm.h>
> >> #include <trace/events/rcu.h>
> >>
> >> /*
> >> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
> >> rcu_head *head) }
> >> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
> >>
> >> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
> >> +	if (unlikely(!rhp->func))
> >> +		mem_dump_obj(rhp);
> >> +}
> >> +
> >> extern int rcu_cpu_stall_suppress_at_boot;
> >>
> >> static inline bool rcu_stall_is_suppressed_at_boot(void)
> >> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
> >> 33adafdad261389..5e7f336baa06ae0 100644
> >> --- a/kernel/rcu/srcutiny.c
> >> +++ b/kernel/rcu/srcutiny.c
> >> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
> >> 	while (lh) {
> >> 		rhp = lh;
> >> 		lh = lh->next;
> >> +		debug_rcu_head_callback(rhp);
> >> 		local_bh_disable();
> >> 		rhp->func(rhp);
> >> 		local_bh_enable();
> >> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
> >> ca4b5dcec675bac..294972e66b31863 100644
> >> --- a/kernel/rcu/srcutree.c
> >> +++ b/kernel/rcu/srcutree.c
> >> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
> >> 	rhp = rcu_cblist_dequeue(&ready_cbs);
> >> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
> >> 		debug_rcu_head_unqueue(rhp);
> >> +		debug_rcu_head_callback(rhp);
> >> 		local_bh_disable();
> >> 		rhp->func(rhp);
> >> 		local_bh_enable();
> >> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
> >> b0b885e071fa8dc..b7f8c67c586cdc4 100644
> >> --- a/kernel/rcu/tasks.h
> >> +++ b/kernel/rcu/tasks.h
> >> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
> >> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
> >> 	len = rcl.len;
> >> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
> >> rcu_cblist_dequeue(&rcl)) {
> >> +		debug_rcu_head_callback(rhp);
> >> 		local_bh_disable();
> >> 		rhp->func(rhp);
> >> 		local_bh_enable();
> >> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
> >> bb8f7d270f01747..56e9a5d91d97ec5 100644
> >> --- a/kernel/rcu/tiny.c
> >> +++ b/kernel/rcu/tiny.c
> >> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
> >> *head)
> >>
> >> 	trace_rcu_invoke_callback("", head);
> >> 	f = head->func;
> >> +	debug_rcu_head_callback(head);
> >> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
> >> 	f(head);
> >> 	rcu_lock_release(&rcu_callback_map);
> >> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
> >> 15aaff3203bf2d0..ed93ddb8203d42c 100644
> >> --- a/kernel/rcu/tree.c
> >> +++ b/kernel/rcu/tree.c
> >> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
> >>
> >> 		f = rhp->func;
> >> +		debug_rcu_head_callback(rhp);
> >> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
> >> 		f(rhp);
> >>
> >> --
> >> 2.25.1
> > 
> > .
> > 
> 
> --
> Regards,
>   Zhen Lei
Leizhen (ThunderTown) Nov. 12, 2022, 2:32 a.m. UTC | #6
On 2022/11/12 2:42, Paul E. McKenney wrote:
> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
>>>> When a structure containing an RCU callback rhp is (incorrectly) 
>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
>>>> unusual for
>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
>>>> rhp->by
>>>> __call_rcu_common() in kernels built with 
>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
>>>> offending code using the identity of this function.
>>>>
>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
>>>> are even worse, as can be seen from this splat:
>>>>
>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
>>>> ... ...
>>>> PC is at 0x0
>>>> LR is at rcu_do_batch+0x1c0/0x3b8
>>>> ... ...
>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
>>>> (default_idle_call) from (do_idle+0xf8/0x150)
>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
>>>> (cpu_startup_entry) from (0xc01530)
>>>>
>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
>>>> information, for example:
>>>>
>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>>>>
>>>> This provides the rough size of the memory block and the offset of 
>>>> the rcu_head structure, which as least provides at least a few clues 
>>>> to help locate the problem. If the problem is reproducible, 
>>>> additional slab debugging can be enabled, for example, 
>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
>>>>
>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>>>> ---
>>>> kernel/rcu/rcu.h      | 7 +++++++
>>>> kernel/rcu/srcutiny.c | 1 +
>>>> kernel/rcu/srcutree.c | 1 +
>>>> kernel/rcu/tasks.h    | 1 +
>>>> kernel/rcu/tiny.c     | 1 +
>>>> kernel/rcu/tree.c     | 1 +
>>>> 6 files changed, 12 insertions(+)
>>>>
>>>> v1 --> v2:
>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it has problems on x86.
>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
>>>>
>>>
>>> Hi, Zhen Lei
>>>
>>> Maybe the following scenarios should be considered:
>>>
>>>                 CPU 0
>>> tasks context
>>>    spin_lock(&vmap_area_lock)
>>>           Interrupt 
>>> 	 RCU softirq
>>> 	      rcu_do_batch
>>> 		mem_dump_obj
>>>                                   vmalloc_dump_obj
>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
>>
>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
>>> I'll try to add a helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
>>
>> This is a workaround, or maybe try a modification like the following, 
>> of course, need to ask Paul's opinion.
> 
> Another approach is to schedule a workqueue handler to do the
> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
> environment.

It's about to panic, so no chance to schedule.

> 
> This would allow vmalloc_dump_obj() to be called unconditionally.
> 
> Other thoughts?

locked = spin_is_locked(&vmap_area_lock);
if (!locked)
    spin_lock(&vmap_area_lock)

Careful analysis is required, since this approach may cause other problems.

Or in new function:
if (locked)
    return;
spin_lock(&vmap_area_lock);

If there is a chance to dump the data, dump the data. If there is no
chance to dump the data, do not dump the data. This is the fate of
debugging information.

> 
> 							Thanx, Paul
> 
>> diff --git a/mm/util.c b/mm/util.c
>> index 12984e76767e..86da0739fe5d 100644
>> --- a/mm/util.c
>> +++ b/mm/util.c
>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
>>  {
>>         const char *type;
>>
>> +       if (is_vmalloc_addr(object)) {
>> +               if (in_task() && vmalloc_dump_obj(object))
>> +                       return;
>> +               type = "vmalloc memory";
>> +               goto end;
>> +       }
>> +
>>         if (kmem_valid_obj(object)) {
>>                 kmem_dump_obj(object);
>>                 return;
>>         }
>>
>> -       if (vmalloc_dump_obj(object))
>> -               return;
>> -
>>         if (virt_addr_valid(object))
>>                 type = "non-slab/vmalloc memory";
>>         else if (object == NULL)
>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
>>                 type = "zero-size pointer";
>>         else
>>                 type = "non-paged memory";
>> -
>> +end:
>>         pr_cont(" %s\n", type);
>>  }
>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
>>
>> Thanks
>> Zqiang
>>
>>
>>>
>>> Thanks
>>> Zqiang
>>>
>>>
>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
>>>> --- a/kernel/rcu/rcu.h
>>>> +++ b/kernel/rcu/rcu.h
>>>> @@ -10,6 +10,7 @@
>>>> #ifndef __LINUX_RCU_H
>>>> #define __LINUX_RCU_H
>>>>
>>>> +#include <linux/mm.h>
>>>> #include <trace/events/rcu.h>
>>>>
>>>> /*
>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
>>>> rcu_head *head) }
>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
>>>>
>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
>>>> +	if (unlikely(!rhp->func))
>>>> +		mem_dump_obj(rhp);
>>>> +}
>>>> +
>>>> extern int rcu_cpu_stall_suppress_at_boot;
>>>>
>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
>>>> 33adafdad261389..5e7f336baa06ae0 100644
>>>> --- a/kernel/rcu/srcutiny.c
>>>> +++ b/kernel/rcu/srcutiny.c
>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
>>>> 	while (lh) {
>>>> 		rhp = lh;
>>>> 		lh = lh->next;
>>>> +		debug_rcu_head_callback(rhp);
>>>> 		local_bh_disable();
>>>> 		rhp->func(rhp);
>>>> 		local_bh_enable();
>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
>>>> ca4b5dcec675bac..294972e66b31863 100644
>>>> --- a/kernel/rcu/srcutree.c
>>>> +++ b/kernel/rcu/srcutree.c
>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
>>>> 		debug_rcu_head_unqueue(rhp);
>>>> +		debug_rcu_head_callback(rhp);
>>>> 		local_bh_disable();
>>>> 		rhp->func(rhp);
>>>> 		local_bh_enable();
>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
>>>> --- a/kernel/rcu/tasks.h
>>>> +++ b/kernel/rcu/tasks.h
>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
>>>> 	len = rcl.len;
>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
>>>> rcu_cblist_dequeue(&rcl)) {
>>>> +		debug_rcu_head_callback(rhp);
>>>> 		local_bh_disable();
>>>> 		rhp->func(rhp);
>>>> 		local_bh_enable();
>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
>>>> --- a/kernel/rcu/tiny.c
>>>> +++ b/kernel/rcu/tiny.c
>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
>>>> *head)
>>>>
>>>> 	trace_rcu_invoke_callback("", head);
>>>> 	f = head->func;
>>>> +	debug_rcu_head_callback(head);
>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
>>>> 	f(head);
>>>> 	rcu_lock_release(&rcu_callback_map);
>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
>>>> --- a/kernel/rcu/tree.c
>>>> +++ b/kernel/rcu/tree.c
>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
>>>>
>>>> 		f = rhp->func;
>>>> +		debug_rcu_head_callback(rhp);
>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
>>>> 		f(rhp);
>>>>
>>>> --
>>>> 2.25.1
>>>
>>> .
>>>
>>
>> --
>> Regards,
>>   Zhen Lei
> .
>
Leizhen (ThunderTown) Nov. 12, 2022, 2:39 a.m. UTC | #7
On 2022/11/12 10:32, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/11/12 2:42, Paul E. McKenney wrote:
>> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
>>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
>>>>> When a structure containing an RCU callback rhp is (incorrectly) 
>>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
>>>>> unusual for
>>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
>>>>> rhp->by
>>>>> __call_rcu_common() in kernels built with 
>>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
>>>>> offending code using the identity of this function.
>>>>>
>>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
>>>>> are even worse, as can be seen from this splat:
>>>>>
>>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
>>>>> ... ...
>>>>> PC is at 0x0
>>>>> LR is at rcu_do_batch+0x1c0/0x3b8
>>>>> ... ...
>>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
>>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
>>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
>>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
>>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
>>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
>>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
>>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
>>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
>>>>> (default_idle_call) from (do_idle+0xf8/0x150)
>>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
>>>>> (cpu_startup_entry) from (0xc01530)
>>>>>
>>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
>>>>> information, for example:
>>>>>
>>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>>>>>
>>>>> This provides the rough size of the memory block and the offset of 
>>>>> the rcu_head structure, which as least provides at least a few clues 
>>>>> to help locate the problem. If the problem is reproducible, 
>>>>> additional slab debugging can be enabled, for example, 
>>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
>>>>>
>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>>>>> ---
>>>>> kernel/rcu/rcu.h      | 7 +++++++
>>>>> kernel/rcu/srcutiny.c | 1 +
>>>>> kernel/rcu/srcutree.c | 1 +
>>>>> kernel/rcu/tasks.h    | 1 +
>>>>> kernel/rcu/tiny.c     | 1 +
>>>>> kernel/rcu/tree.c     | 1 +
>>>>> 6 files changed, 12 insertions(+)
>>>>>
>>>>> v1 --> v2:
>>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
>>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
>>>>>
>>>>
>>>> Hi, Zhen Lei
>>>>
>>>> Maybe the following scenarios should be considered:
>>>>
>>>>                 CPU 0
>>>> tasks context
>>>>    spin_lock(&vmap_area_lock)
>>>>           Interrupt 
>>>> 	 RCU softirq
>>>> 	      rcu_do_batch
>>>> 		mem_dump_obj
>>>>                                   vmalloc_dump_obj
>>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
>>>
>>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
>>>> I'll try to add a helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
>>>
>>> This is a workaround, or maybe try a modification like the following, 
>>> of course, need to ask Paul's opinion.
>>
>> Another approach is to schedule a workqueue handler to do the
>> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
>> environment.
> 
> It's about to panic, so no chance to schedule.
> 
>>
>> This would allow vmalloc_dump_obj() to be called unconditionally.
>>
>> Other thoughts?
> 
> locked = spin_is_locked(&vmap_area_lock);
> if (!locked)
>     spin_lock(&vmap_area_lock)
> 
> Careful analysis is required, which may cause other problems.
> 
> Or in new function:

Oh, perhaps no new function is needed, mem_dump_obj() itself prints
debugging information. I will try a mm patch first.

> if (locked)
>     return;
> spin_lock(&vmap_area_lock);
> 
> If there is a chance to dump the data, dump the data. If there is no
> chance to dump the data, do not dump the data. This is the fate of
> debugging information.
> 
>>
>> 							Thanx, Paul
>>
>>> diff --git a/mm/util.c b/mm/util.c
>>> index 12984e76767e..86da0739fe5d 100644
>>> --- a/mm/util.c
>>> +++ b/mm/util.c
>>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
>>>  {
>>>         const char *type;
>>>
>>> +       if (is_vmalloc_addr(object)) {
>>> +               if (in_task() && vmalloc_dump_obj(object))
>>> +                       return;
>>> +               type = "vmalloc memory";
>>> +               goto end;
>>> +       }
>>> +
>>>         if (kmem_valid_obj(object)) {
>>>                 kmem_dump_obj(object);
>>>                 return;
>>>         }
>>>
>>> -       if (vmalloc_dump_obj(object))
>>> -               return;
>>> -
>>>         if (virt_addr_valid(object))
>>>                 type = "non-slab/vmalloc memory";
>>>         else if (object == NULL)
>>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
>>>                 type = "zero-size pointer";
>>>         else
>>>                 type = "non-paged memory";
>>> -
>>> +end:
>>>         pr_cont(" %s\n", type);
>>>  }
>>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
>>>
>>> Thanks
>>> Zqiang
>>>
>>>
>>>>
>>>> Thanks
>>>> Zqiang
>>>>
>>>>
>>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
>>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
>>>>> --- a/kernel/rcu/rcu.h
>>>>> +++ b/kernel/rcu/rcu.h
>>>>> @@ -10,6 +10,7 @@
>>>>> #ifndef __LINUX_RCU_H
>>>>> #define __LINUX_RCU_H
>>>>>
>>>>> +#include <linux/mm.h>
>>>>> #include <trace/events/rcu.h>
>>>>>
>>>>> /*
>>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
>>>>> rcu_head *head) }
>>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
>>>>>
>>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
>>>>> +	if (unlikely(!rhp->func))
>>>>> +		mem_dump_obj(rhp);
>>>>> +}
>>>>> +
>>>>> extern int rcu_cpu_stall_suppress_at_boot;
>>>>>
>>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
>>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
>>>>> 33adafdad261389..5e7f336baa06ae0 100644
>>>>> --- a/kernel/rcu/srcutiny.c
>>>>> +++ b/kernel/rcu/srcutiny.c
>>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
>>>>> 	while (lh) {
>>>>> 		rhp = lh;
>>>>> 		lh = lh->next;
>>>>> +		debug_rcu_head_callback(rhp);
>>>>> 		local_bh_disable();
>>>>> 		rhp->func(rhp);
>>>>> 		local_bh_enable();
>>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
>>>>> ca4b5dcec675bac..294972e66b31863 100644
>>>>> --- a/kernel/rcu/srcutree.c
>>>>> +++ b/kernel/rcu/srcutree.c
>>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
>>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
>>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
>>>>> 		debug_rcu_head_unqueue(rhp);
>>>>> +		debug_rcu_head_callback(rhp);
>>>>> 		local_bh_disable();
>>>>> 		rhp->func(rhp);
>>>>> 		local_bh_enable();
>>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
>>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
>>>>> --- a/kernel/rcu/tasks.h
>>>>> +++ b/kernel/rcu/tasks.h
>>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
>>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
>>>>> 	len = rcl.len;
>>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
>>>>> rcu_cblist_dequeue(&rcl)) {
>>>>> +		debug_rcu_head_callback(rhp);
>>>>> 		local_bh_disable();
>>>>> 		rhp->func(rhp);
>>>>> 		local_bh_enable();
>>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
>>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
>>>>> --- a/kernel/rcu/tiny.c
>>>>> +++ b/kernel/rcu/tiny.c
>>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
>>>>> *head)
>>>>>
>>>>> 	trace_rcu_invoke_callback("", head);
>>>>> 	f = head->func;
>>>>> +	debug_rcu_head_callback(head);
>>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
>>>>> 	f(head);
>>>>> 	rcu_lock_release(&rcu_callback_map);
>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
>>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
>>>>> --- a/kernel/rcu/tree.c
>>>>> +++ b/kernel/rcu/tree.c
>>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
>>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
>>>>>
>>>>> 		f = rhp->func;
>>>>> +		debug_rcu_head_callback(rhp);
>>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
>>>>> 		f(rhp);
>>>>>
>>>>> --
>>>>> 2.25.1
>>>>
>>>> .
>>>>
>>>
>>> --
>>> Regards,
>>>   Zhen Lei
>> .
>>
>
Paul E. McKenney Nov. 12, 2022, 6:08 a.m. UTC | #8
On Sat, Nov 12, 2022 at 10:32:32AM +0800, Leizhen (ThunderTown) wrote:
> On 2022/11/12 2:42, Paul E. McKenney wrote:
> > On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
> >> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
> >>>> When a structure containing an RCU callback rhp is (incorrectly) 
> >>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
> >>>> unusual for
> >>>> rhp->func to be set to NULL. This defeats the debugging prints used 
> >>>> rhp->by
> >>>> __call_rcu_common() in kernels built with 
> >>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
> >>>> offending code using the identity of this function.
> >>>>
> >>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
> >>>> are even worse, as can be seen from this splat:
> >>>>
> >>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
> >>>> ... ...
> >>>> PC is at 0x0
> >>>> LR is at rcu_do_batch+0x1c0/0x3b8
> >>>> ... ...
> >>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> >>>> (rcu_core) from (__do_softirq+0x24c/0x344)
> >>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> >>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> >>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> >>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> >>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> >>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> >>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> >>>> (default_idle_call) from (do_idle+0xf8/0x150)
> >>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
> >>>> (cpu_startup_entry) from (0xc01530)
> >>>>
> >>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
> >>>> information, for example:
> >>>>
> >>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
> >>>>
> >>>> This provides the rough size of the memory block and the offset of 
> >>>> the rcu_head structure, which as least provides at least a few clues 
> >>>> to help locate the problem. If the problem is reproducible, 
> >>>> additional slab debugging can be enabled, for example, 
> >>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
> >>>>
> >>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >>>> ---
> >>>> kernel/rcu/rcu.h      | 7 +++++++
> >>>> kernel/rcu/srcutiny.c | 1 +
> >>>> kernel/rcu/srcutree.c | 1 +
> >>>> kernel/rcu/tasks.h    | 1 +
> >>>> kernel/rcu/tiny.c     | 1 +
> >>>> kernel/rcu/tree.c     | 1 +
> >>>> 6 files changed, 12 insertions(+)
> >>>>
> >>>> v1 --> v2:
> >>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
> >>>> 2. Paul E. McKenney helped me update the commit message, thanks.
> >>>>
> >>>
> >>> Hi, Zhen Lei
> >>>
> >>> Maybe the following scenarios should be considered:
> >>>
> >>>                 CPU 0
> >>> tasks context
> >>>    spin_lock(&vmap_area_lock)
> >>>           Interrupt 
> >>> 	 RCU softirq
> >>> 	      rcu_do_batch
> >>> 		mem_dump_obj
> >>>                                   vmalloc_dump_obj
> >>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
> >>
> >>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
> >>> I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
> >>
> >> This is a workaround, or maybe try a modification like the following, 
> >> of course, need to ask Paul's opinion.
> > 
> > Another approach is to schedule a workqueue handler to do the
> > mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
> > environment.
> 
> It's about to panic, so no chance to schedule.

It won't panic if you drop the callback on the floor.

Though to your point, the ->next pointer is likely also trashed.  So you
could just drop the remainder of the callback list on the floor.  That
might provide a good (though not perfect) chance of getting decent output.

> > This would allow vmalloc_dump_obj() to be called unconditionally.
> > 
> > Other thoughts?
> 
> locked = spin_is_locked(&vmap_area_lock);
> if (!locked)
>     spin_lock(&vmap_area_lock)
> 
> Careful analysis is required, which may cause other problems.
> 
> Or in new function:
> if (locked)
>     return;
> spin_lock(&vmap_area_lock);
> 
> If there is a chance to dump the data, dump the data. If there is no
> chance to dump the data, do not dump the data. This is the fate of
> debugging information.

My concern is that there will be increasing numbers of special cases
over time.

							Thanx, Paul

> >> diff --git a/mm/util.c b/mm/util.c
> >> index 12984e76767e..86da0739fe5d 100644
> >> --- a/mm/util.c
> >> +++ b/mm/util.c
> >> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
> >>  {
> >>         const char *type;
> >>
> >> +       if (is_vmalloc_addr(object)) {
> >> +               if (in_task() && vmalloc_dump_obj(object))
> >> +                       return;
> >> +               type = "vmalloc memory";
> >> +               goto end;
> >> +       }
> >> +
> >>         if (kmem_valid_obj(object)) {
> >>                 kmem_dump_obj(object);
> >>                 return;
> >>         }
> >>
> >> -       if (vmalloc_dump_obj(object))
> >> -               return;
> >> -
> >>         if (virt_addr_valid(object))
> >>                 type = "non-slab/vmalloc memory";
> >>         else if (object == NULL)
> >> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
> >>                 type = "zero-size pointer";
> >>         else
> >>                 type = "non-paged memory";
> >> -
> >> +end:
> >>         pr_cont(" %s\n", type);
> >>  }
> >>  EXPORT_SYMBOL_GPL(mem_dump_obj);
> >>
> >> Thanks
> >> Zqiang
> >>
> >>
> >>>
> >>> Thanks
> >>> Zqiang
> >>>
> >>>
> >>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
> >>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
> >>>> --- a/kernel/rcu/rcu.h
> >>>> +++ b/kernel/rcu/rcu.h
> >>>> @@ -10,6 +10,7 @@
> >>>> #ifndef __LINUX_RCU_H
> >>>> #define __LINUX_RCU_H
> >>>>
> >>>> +#include <linux/mm.h>
> >>>> #include <trace/events/rcu.h>
> >>>>
> >>>> /*
> >>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
> >>>> rcu_head *head) }
> >>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
> >>>>
> >>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
> >>>> +	if (unlikely(!rhp->func))
> >>>> +		mem_dump_obj(rhp);
> >>>> +}
> >>>> +
> >>>> extern int rcu_cpu_stall_suppress_at_boot;
> >>>>
> >>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
> >>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
> >>>> 33adafdad261389..5e7f336baa06ae0 100644
> >>>> --- a/kernel/rcu/srcutiny.c
> >>>> +++ b/kernel/rcu/srcutiny.c
> >>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
> >>>> 	while (lh) {
> >>>> 		rhp = lh;
> >>>> 		lh = lh->next;
> >>>> +		debug_rcu_head_callback(rhp);
> >>>> 		local_bh_disable();
> >>>> 		rhp->func(rhp);
> >>>> 		local_bh_enable();
> >>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
> >>>> ca4b5dcec675bac..294972e66b31863 100644
> >>>> --- a/kernel/rcu/srcutree.c
> >>>> +++ b/kernel/rcu/srcutree.c
> >>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
> >>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
> >>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
> >>>> 		debug_rcu_head_unqueue(rhp);
> >>>> +		debug_rcu_head_callback(rhp);
> >>>> 		local_bh_disable();
> >>>> 		rhp->func(rhp);
> >>>> 		local_bh_enable();
> >>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
> >>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
> >>>> --- a/kernel/rcu/tasks.h
> >>>> +++ b/kernel/rcu/tasks.h
> >>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
> >>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
> >>>> 	len = rcl.len;
> >>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
> >>>> rcu_cblist_dequeue(&rcl)) {
> >>>> +		debug_rcu_head_callback(rhp);
> >>>> 		local_bh_disable();
> >>>> 		rhp->func(rhp);
> >>>> 		local_bh_enable();
> >>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
> >>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
> >>>> --- a/kernel/rcu/tiny.c
> >>>> +++ b/kernel/rcu/tiny.c
> >>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
> >>>> *head)
> >>>>
> >>>> 	trace_rcu_invoke_callback("", head);
> >>>> 	f = head->func;
> >>>> +	debug_rcu_head_callback(head);
> >>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
> >>>> 	f(head);
> >>>> 	rcu_lock_release(&rcu_callback_map);
> >>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
> >>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
> >>>> --- a/kernel/rcu/tree.c
> >>>> +++ b/kernel/rcu/tree.c
> >>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
> >>>>
> >>>> 		f = rhp->func;
> >>>> +		debug_rcu_head_callback(rhp);
> >>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
> >>>> 		f(rhp);
> >>>>
> >>>> --
> >>>> 2.25.1
> >>>
> >>> .
> >>>
> >>
> >> --
> >> Regards,
> >>   Zhen Lei
> > .
> > 
> 
> -- 
> Regards,
>   Zhen Lei
Paul E. McKenney Nov. 12, 2022, 6:09 a.m. UTC | #9
On Sat, Nov 12, 2022 at 10:39:55AM +0800, Leizhen (ThunderTown) wrote:
> On 2022/11/12 10:32, Leizhen (ThunderTown) wrote:
> > On 2022/11/12 2:42, Paul E. McKenney wrote:
> >> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
> >>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
> >>>>> When a structure containing an RCU callback rhp is (incorrectly) 
> >>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
> >>>>> unusual for
> >>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
> >>>>> rhp->by
> >>>>> __call_rcu_common() in kernels built with 
> >>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
> >>>>> offending code using the identity of this function.
> >>>>>
> >>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
> >>>>> are even worse, as can be seen from this splat:
> >>>>>
> >>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
> >>>>> ... ...
> >>>>> PC is at 0x0
> >>>>> LR is at rcu_do_batch+0x1c0/0x3b8
> >>>>> ... ...
> >>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> >>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
> >>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> >>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> >>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> >>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> >>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> >>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> >>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> >>>>> (default_idle_call) from (do_idle+0xf8/0x150)
> >>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
> >>>>> (cpu_startup_entry) from (0xc01530)
> >>>>>
> >>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
> >>>>> information, for example:
> >>>>>
> >>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
> >>>>>
> >>>>> This provides the rough size of the memory block and the offset of 
> >>>>> the rcu_head structure, which as least provides at least a few clues 
> >>>>> to help locate the problem. If the problem is reproducible, 
> >>>>> additional slab debugging can be enabled, for example, 
> >>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
> >>>>>
> >>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >>>>> ---
> >>>>> kernel/rcu/rcu.h      | 7 +++++++
> >>>>> kernel/rcu/srcutiny.c | 1 +
> >>>>> kernel/rcu/srcutree.c | 1 +
> >>>>> kernel/rcu/tasks.h    | 1 +
> >>>>> kernel/rcu/tiny.c     | 1 +
> >>>>> kernel/rcu/tree.c     | 1 +
> >>>>> 6 files changed, 12 insertions(+)
> >>>>>
> >>>>> v1 --> v2:
> >>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
> >>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
> >>>>>
> >>>>
> >>>> Hi, Zhen Lei
> >>>>
> >>>> Maybe the following scenarios should be considered:
> >>>>
> >>>>                 CPU 0
> >>>> tasks context
> >>>>    spin_lock(&vmap_area_lock)
> >>>>           Interrupt 
> >>>> 	 RCU softirq
> >>>> 	      rcu_do_batch
> >>>> 		mem_dump_obj
> >>>>                                   vmalloc_dump_obj
> >>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
> >>>
> >>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
> >>>> I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
> >>>
> >>> This is a workaround, or maybe try a modification like the following, 
> >>> of course, need to ask Paul's opinion.
> >>
> >> Another approach is to schedule a workqueue handler to do the
> >> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
> >> environment.
> > 
> > It's about to panic, so no chance to schedule.
> > 
> >>
> >> This would allow vmalloc_dump_obj() to be called unconditionally.
> >>
> >> Other thoughts?
> > 
> > locked = spin_is_locked(&vmap_area_lock);
> > if (!locked)
> >     spin_lock(&vmap_area_lock)
> > 
> > Careful analysis is required, which may cause other problems.
> > 
> > Or in new function:
> 
> Oh, perhaps no new function is needed, mem_dump_obj() itself prints
> debugging information. I will try a mm patch first.

That does sound very worth trying!

							Thanx, Paul

> > if (locked)
> >     return;
> > spin_lock(&vmap_area_lock);
> > 
> > If there is a chance to dump the data, dump the data. If there is no
> > chance to dump the data, do not dump the data. This is the fate of
> > debugging information.
> > 
> >>
> >> 							Thanx, Paul
> >>
> >>> diff --git a/mm/util.c b/mm/util.c
> >>> index 12984e76767e..86da0739fe5d 100644
> >>> --- a/mm/util.c
> >>> +++ b/mm/util.c
> >>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
> >>>  {
> >>>         const char *type;
> >>>
> >>> +       if (is_vmalloc_addr(object)) {
> >>> +               if (in_task() && vmalloc_dump_obj(object))
> >>> +                       return;
> >>> +               type = "vmalloc memory";
> >>> +               goto end;
> >>> +       }
> >>> +
> >>>         if (kmem_valid_obj(object)) {
> >>>                 kmem_dump_obj(object);
> >>>                 return;
> >>>         }
> >>>
> >>> -       if (vmalloc_dump_obj(object))
> >>> -               return;
> >>> -
> >>>         if (virt_addr_valid(object))
> >>>                 type = "non-slab/vmalloc memory";
> >>>         else if (object == NULL)
> >>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
> >>>                 type = "zero-size pointer";
> >>>         else
> >>>                 type = "non-paged memory";
> >>> -
> >>> +end:
> >>>         pr_cont(" %s\n", type);
> >>>  }
> >>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
> >>>
> >>> Thanks
> >>> Zqiang
> >>>
> >>>
> >>>>
> >>>> Thanks
> >>>> Zqiang
> >>>>
> >>>>
> >>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
> >>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
> >>>>> --- a/kernel/rcu/rcu.h
> >>>>> +++ b/kernel/rcu/rcu.h
> >>>>> @@ -10,6 +10,7 @@
> >>>>> #ifndef __LINUX_RCU_H
> >>>>> #define __LINUX_RCU_H
> >>>>>
> >>>>> +#include <linux/mm.h>
> >>>>> #include <trace/events/rcu.h>
> >>>>>
> >>>>> /*
> >>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
> >>>>> rcu_head *head) }
> >>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
> >>>>>
> >>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
> >>>>> +	if (unlikely(!rhp->func))
> >>>>> +		mem_dump_obj(rhp);
> >>>>> +}
> >>>>> +
> >>>>> extern int rcu_cpu_stall_suppress_at_boot;
> >>>>>
> >>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
> >>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
> >>>>> 33adafdad261389..5e7f336baa06ae0 100644
> >>>>> --- a/kernel/rcu/srcutiny.c
> >>>>> +++ b/kernel/rcu/srcutiny.c
> >>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
> >>>>> 	while (lh) {
> >>>>> 		rhp = lh;
> >>>>> 		lh = lh->next;
> >>>>> +		debug_rcu_head_callback(rhp);
> >>>>> 		local_bh_disable();
> >>>>> 		rhp->func(rhp);
> >>>>> 		local_bh_enable();
> >>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
> >>>>> ca4b5dcec675bac..294972e66b31863 100644
> >>>>> --- a/kernel/rcu/srcutree.c
> >>>>> +++ b/kernel/rcu/srcutree.c
> >>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
> >>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
> >>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
> >>>>> 		debug_rcu_head_unqueue(rhp);
> >>>>> +		debug_rcu_head_callback(rhp);
> >>>>> 		local_bh_disable();
> >>>>> 		rhp->func(rhp);
> >>>>> 		local_bh_enable();
> >>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
> >>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
> >>>>> --- a/kernel/rcu/tasks.h
> >>>>> +++ b/kernel/rcu/tasks.h
> >>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
> >>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
> >>>>> 	len = rcl.len;
> >>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
> >>>>> rcu_cblist_dequeue(&rcl)) {
> >>>>> +		debug_rcu_head_callback(rhp);
> >>>>> 		local_bh_disable();
> >>>>> 		rhp->func(rhp);
> >>>>> 		local_bh_enable();
> >>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
> >>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
> >>>>> --- a/kernel/rcu/tiny.c
> >>>>> +++ b/kernel/rcu/tiny.c
> >>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
> >>>>> *head)
> >>>>>
> >>>>> 	trace_rcu_invoke_callback("", head);
> >>>>> 	f = head->func;
> >>>>> +	debug_rcu_head_callback(head);
> >>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
> >>>>> 	f(head);
> >>>>> 	rcu_lock_release(&rcu_callback_map);
> >>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
> >>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
> >>>>> --- a/kernel/rcu/tree.c
> >>>>> +++ b/kernel/rcu/tree.c
> >>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
> >>>>>
> >>>>> 		f = rhp->func;
> >>>>> +		debug_rcu_head_callback(rhp);
> >>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
> >>>>> 		f(rhp);
> >>>>>
> >>>>> --
> >>>>> 2.25.1
> >>>>
> >>>> .
> >>>>
> >>>
> >>> --
> >>> Regards,
> >>>   Zhen Lei
> >> .
> >>
> > 
> 
> -- 
> Regards,
>   Zhen Lei
Leizhen (ThunderTown) Nov. 14, 2022, 7:05 a.m. UTC | #10
On 2022/11/11 23:50, Elliott, Robert (Servers) wrote:
> 
> 
>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp)
>> +{
>> +	if (unlikely(!rhp->func))
>> +		mem_dump_obj(rhp);
>> +}
>> +
> 
> The mm/util.c definition of mem_dump_object() says:
>  * This function uses pr_cont(), so that the caller is expected to have
>  * printed out whatever preamble is appropriate.
> 
> so this needs to call pr_alert() or pr_err() before that to explain what
> is being printed (with no \n), like these:
> 
> kernel/rcu/rcutorture.c: pr_alert("mem_dump_obj(%px):", &rhp);
> kernel/rcu/rcutorture.c: mem_dump_obj(&rhp);
> ...
> kernel/rcu/tree.c: pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
> kernel/rcu/tree.c: mem_dump_obj(head);

Yes, right, thanks.

> 
> 
> .
>
Leizhen (ThunderTown) Nov. 14, 2022, 7:18 a.m. UTC | #11
On 2022/11/12 14:08, Paul E. McKenney wrote:
> On Sat, Nov 12, 2022 at 10:32:32AM +0800, Leizhen (ThunderTown) wrote:
>> On 2022/11/12 2:42, Paul E. McKenney wrote:
>>> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
>>>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
>>>>>> When a structure containing an RCU callback rhp is (incorrectly) 
>>>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
>>>>>> unusual for
>>>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
>>>>>> rhp->by
>>>>>> __call_rcu_common() in kernels built with 
>>>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
>>>>>> offending code using the identity of this function.
>>>>>>
>>>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
>>>>>> are even worse, as can be seen from this splat:
>>>>>>
>>>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
>>>>>> ... ...
>>>>>> PC is at 0x0
>>>>>> LR is at rcu_do_batch+0x1c0/0x3b8
>>>>>> ... ...
>>>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
>>>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
>>>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
>>>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
>>>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
>>>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
>>>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
>>>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
>>>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
>>>>>> (default_idle_call) from (do_idle+0xf8/0x150)
>>>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
>>>>>> (cpu_startup_entry) from (0xc01530)
>>>>>>
>>>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
>>>>>> information, for example:
>>>>>>
>>>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>>>>>>
>>>>>> This provides the rough size of the memory block and the offset of 
>>>>>> the rcu_head structure, which as least provides at least a few clues 
>>>>>> to help locate the problem. If the problem is reproducible, 
>>>>>> additional slab debugging can be enabled, for example, 
>>>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
>>>>>>
>>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>>>>>> ---
>>>>>> kernel/rcu/rcu.h      | 7 +++++++
>>>>>> kernel/rcu/srcutiny.c | 1 +
>>>>>> kernel/rcu/srcutree.c | 1 +
>>>>>> kernel/rcu/tasks.h    | 1 +
>>>>>> kernel/rcu/tiny.c     | 1 +
>>>>>> kernel/rcu/tree.c     | 1 +
>>>>>> 6 files changed, 12 insertions(+)
>>>>>>
>>>>>> v1 --> v2:
>>>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
>>>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
>>>>>>
>>>>>
>>>>> Hi, Zhen Lei
>>>>>
>>>>> Maybe the following scenarios should be considered:
>>>>>
>>>>>                 CPU 0
>>>>> tasks context
>>>>>    spin_lock(&vmap_area_lock)
>>>>>           Interrupt 
>>>>> 	 RCU softirq
>>>>> 	      rcu_do_batch
>>>>> 		mem_dump_obj
>>>>>                                   vmalloc_dump_obj
>>>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
>>>>
>>>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
>>>>> I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
>>>>
>>>> This is a workaround, or maybe try a modification like the following, 
>>>> of course, need to ask Paul's opinion.
>>>
>>> Another approach is to schedule a workqueue handler to do the
>>> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
>>> environment.
>>
>> It's about to panic, so no chance to schedule.
> 
> It won't panic if you drop the callback on the floor.
> 
> Though to your point, the ->next pointer is likely also trashed.  So you
> could just drop the remainder of the callback list on the floor.  That
> might provide a good (though not perfect) chance of getting decent output.

OK, I think I understand what you mean.
if (!f)
	schedule_work(&work);
else
	f(rhp)

> 
>>> This would allow vmalloc_dump_obj() to be called unconditionally.
>>>
>>> Other thoughts?
>>
>> locked = spin_is_locked(&vmap_area_lock);
>> if (!locked)
>>     spin_lock(&vmap_area_lock)
>>
>> Careful analysis is required, which may cause other problems.
>>
>> Or in new function:
>> if (locked)
>>     return;
>> spin_lock(&vmap_area_lock);
>>
>> If there is a chance to dump the data, dump the data. If there is no
>> chance to dump the data, do not dump the data. This is the fate of
>> debugging information.
> 
> My concern is that there will be increasing numbers of special cases
> over time.

OK, I got it.

> 
> 							Thanx, Paul
> 
>>>> diff --git a/mm/util.c b/mm/util.c
>>>> index 12984e76767e..86da0739fe5d 100644
>>>> --- a/mm/util.c
>>>> +++ b/mm/util.c
>>>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
>>>>  {
>>>>         const char *type;
>>>>
>>>> +       if (is_vmalloc_addr(object)) {
>>>> +               if (in_task() && vmalloc_dump_obj(object))
>>>> +                       return;
>>>> +               type = "vmalloc memory";
>>>> +               goto end;
>>>> +       }
>>>> +
>>>>         if (kmem_valid_obj(object)) {
>>>>                 kmem_dump_obj(object);
>>>>                 return;
>>>>         }
>>>>
>>>> -       if (vmalloc_dump_obj(object))
>>>> -               return;
>>>> -
>>>>         if (virt_addr_valid(object))
>>>>                 type = "non-slab/vmalloc memory";
>>>>         else if (object == NULL)
>>>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
>>>>                 type = "zero-size pointer";
>>>>         else
>>>>                 type = "non-paged memory";
>>>> -
>>>> +end:
>>>>         pr_cont(" %s\n", type);
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
>>>>
>>>> Thanks
>>>> Zqiang
>>>>
>>>>
>>>>>
>>>>> Thanks
>>>>> Zqiang
>>>>>
>>>>>
>>>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
>>>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
>>>>>> --- a/kernel/rcu/rcu.h
>>>>>> +++ b/kernel/rcu/rcu.h
>>>>>> @@ -10,6 +10,7 @@
>>>>>> #ifndef __LINUX_RCU_H
>>>>>> #define __LINUX_RCU_H
>>>>>>
>>>>>> +#include <linux/mm.h>
>>>>>> #include <trace/events/rcu.h>
>>>>>>
>>>>>> /*
>>>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
>>>>>> rcu_head *head) }
>>>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
>>>>>>
>>>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
>>>>>> +	if (unlikely(!rhp->func))
>>>>>> +		mem_dump_obj(rhp);
>>>>>> +}
>>>>>> +
>>>>>> extern int rcu_cpu_stall_suppress_at_boot;
>>>>>>
>>>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
>>>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
>>>>>> 33adafdad261389..5e7f336baa06ae0 100644
>>>>>> --- a/kernel/rcu/srcutiny.c
>>>>>> +++ b/kernel/rcu/srcutiny.c
>>>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
>>>>>> 	while (lh) {
>>>>>> 		rhp = lh;
>>>>>> 		lh = lh->next;
>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>> 		local_bh_disable();
>>>>>> 		rhp->func(rhp);
>>>>>> 		local_bh_enable();
>>>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
>>>>>> ca4b5dcec675bac..294972e66b31863 100644
>>>>>> --- a/kernel/rcu/srcutree.c
>>>>>> +++ b/kernel/rcu/srcutree.c
>>>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
>>>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
>>>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
>>>>>> 		debug_rcu_head_unqueue(rhp);
>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>> 		local_bh_disable();
>>>>>> 		rhp->func(rhp);
>>>>>> 		local_bh_enable();
>>>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
>>>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
>>>>>> --- a/kernel/rcu/tasks.h
>>>>>> +++ b/kernel/rcu/tasks.h
>>>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
>>>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
>>>>>> 	len = rcl.len;
>>>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
>>>>>> rcu_cblist_dequeue(&rcl)) {
>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>> 		local_bh_disable();
>>>>>> 		rhp->func(rhp);
>>>>>> 		local_bh_enable();
>>>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
>>>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
>>>>>> --- a/kernel/rcu/tiny.c
>>>>>> +++ b/kernel/rcu/tiny.c
>>>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
>>>>>> *head)
>>>>>>
>>>>>> 	trace_rcu_invoke_callback("", head);
>>>>>> 	f = head->func;
>>>>>> +	debug_rcu_head_callback(head);
>>>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
>>>>>> 	f(head);
>>>>>> 	rcu_lock_release(&rcu_callback_map);
>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
>>>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
>>>>>> --- a/kernel/rcu/tree.c
>>>>>> +++ b/kernel/rcu/tree.c
>>>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
>>>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
>>>>>>
>>>>>> 		f = rhp->func;
>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
>>>>>> 		f(rhp);
>>>>>>
>>>>>> --
>>>>>> 2.25.1
>>>>>
>>>>> .
>>>>>
>>>>
>>>> --
>>>> Regards,
>>>>   Zhen Lei
>>> .
>>>
>>
>> -- 
>> Regards,
>>   Zhen Lei
> .
>
Paul E. McKenney Nov. 14, 2022, 4:06 p.m. UTC | #12
On Mon, Nov 14, 2022 at 03:18:10PM +0800, Leizhen (ThunderTown) wrote:
> On 2022/11/12 14:08, Paul E. McKenney wrote:
> > On Sat, Nov 12, 2022 at 10:32:32AM +0800, Leizhen (ThunderTown) wrote:
> >> On 2022/11/12 2:42, Paul E. McKenney wrote:
> >>> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
> >>>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
> >>>>>> When a structure containing an RCU callback rhp is (incorrectly) 
> >>>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
> >>>>>> unusual for
> >>>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
> >>>>>> rhp->by
> >>>>>> __call_rcu_common() in kernels built with 
> >>>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
> >>>>>> offending code using the identity of this function.
> >>>>>>
> >>>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
> >>>>>> are even worse, as can be seen from this splat:
> >>>>>>
> >>>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
> >>>>>> ... ...
> >>>>>> PC is at 0x0
> >>>>>> LR is at rcu_do_batch+0x1c0/0x3b8
> >>>>>> ... ...
> >>>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> >>>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
> >>>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> >>>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> >>>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> >>>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> >>>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> >>>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> >>>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> >>>>>> (default_idle_call) from (do_idle+0xf8/0x150)
> >>>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
> >>>>>> (cpu_startup_entry) from (0xc01530)
> >>>>>>
> >>>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
> >>>>>> information, for example:
> >>>>>>
> >>>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
> >>>>>>
> >>>>>> This provides the rough size of the memory block and the offset of 
> >>>>>> the rcu_head structure, which as least provides at least a few clues 
> >>>>>> to help locate the problem. If the problem is reproducible, 
> >>>>>> additional slab debugging can be enabled, for example, 
> >>>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
> >>>>>>
> >>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >>>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >>>>>> ---
> >>>>>> kernel/rcu/rcu.h      | 7 +++++++
> >>>>>> kernel/rcu/srcutiny.c | 1 +
> >>>>>> kernel/rcu/srcutree.c | 1 +
> >>>>>> kernel/rcu/tasks.h    | 1 +
> >>>>>> kernel/rcu/tiny.c     | 1 +
> >>>>>> kernel/rcu/tree.c     | 1 +
> >>>>>> 6 files changed, 12 insertions(+)
> >>>>>>
> >>>>>> v1 --> v2:
> >>>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
> >>>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
> >>>>>>
> >>>>>
> >>>>> Hi, Zhen Lei
> >>>>>
> >>>>> Maybe the following scenarios should be considered:
> >>>>>
> >>>>>                 CPU 0
> >>>>> tasks context
> >>>>>    spin_lock(&vmap_area_lock)
> >>>>>           Interrupt 
> >>>>> 	 RCU softirq
> >>>>> 	      rcu_do_batch
> >>>>> 		mem_dump_obj
> >>>>>                                   vmalloc_dump_obj
> >>>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
> >>>>
> >>>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
> >>>>> I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
> >>>>
> >>>> This is a workaround, or maybe try a modification like the following, 
> >>>> of course, need to ask Paul's opinion.
> >>>
> >>> Another approach is to schedule a workqueue handler to do the
> >>> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
> >>> environment.
> >>
> >> It's about to panic, so no chance to schedule.
> > 
> > It won't panic if you drop the callback on the floor.
> > 
> > Though to your point, the ->next pointer is likely also trashed.  So you
> > could just drop the remainder of the callback list on the floor.  That
> > might provide a good (though not perfect) chance of getting decent output.
> 
> OK, I think I understand what you mean.
> if (!f)
> 	schedule_work(&work);
> else
> 	f(rhp)

Yes, except that the "schedule_work()" also needs to be accompanied
by something that refuses to execute the rest of those callbacks.
This needs to break out of the loop (or return) and to adjust counts,
among other things.  This might be as easy as setting count to the
negative of the length of the "rcl" list, but does need some attention
to the code following the callback-invocation loop.

							Thanx, Paul

> >>> This would allow vmalloc_dump_obj() to be called unconditionally.
> >>>
> >>> Other thoughts?
> >>
> >> locked = spin_is_locked(&vmap_area_lock);
> >> if (!locked)
> >>     spin_lock(&vmap_area_lock)
> >>
> >> Careful analysis is required, which may cause other problems.
> >>
> >> Or in new function:
> >> if (locked)
> >>     return;
> >> spin_lock(&vmap_area_lock);
> >>
> >> If there is a chance to dump the data, dump the data. If there is no
> >> chance to dump the data, do not dump the data. This is the fate of
> >> debugging information.
> > 
> > My concern is that there will be increasing numbers of special cases
> > over time.
> 
> OK, I got it.
> 
> > 
> > 							Thanx, Paul
> > 
> >>>> diff --git a/mm/util.c b/mm/util.c
> >>>> index 12984e76767e..86da0739fe5d 100644
> >>>> --- a/mm/util.c
> >>>> +++ b/mm/util.c
> >>>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
> >>>>  {
> >>>>         const char *type;
> >>>>
> >>>> +       if (is_vmalloc_addr(object)) {
> >>>> +               if (in_task() && vmalloc_dump_obj(object))
> >>>> +                       return;
> >>>> +               type = "vmalloc memory";
> >>>> +               goto end;
> >>>> +       }
> >>>> +
> >>>>         if (kmem_valid_obj(object)) {
> >>>>                 kmem_dump_obj(object);
> >>>>                 return;
> >>>>         }
> >>>>
> >>>> -       if (vmalloc_dump_obj(object))
> >>>> -               return;
> >>>> -
> >>>>         if (virt_addr_valid(object))
> >>>>                 type = "non-slab/vmalloc memory";
> >>>>         else if (object == NULL)
> >>>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
> >>>>                 type = "zero-size pointer";
> >>>>         else
> >>>>                 type = "non-paged memory";
> >>>> -
> >>>> +end:
> >>>>         pr_cont(" %s\n", type);
> >>>>  }
> >>>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
> >>>>
> >>>> Thanks
> >>>> Zqiang
> >>>>
> >>>>
> >>>>>
> >>>>> Thanks
> >>>>> Zqiang
> >>>>>
> >>>>>
> >>>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
> >>>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
> >>>>>> --- a/kernel/rcu/rcu.h
> >>>>>> +++ b/kernel/rcu/rcu.h
> >>>>>> @@ -10,6 +10,7 @@
> >>>>>> #ifndef __LINUX_RCU_H
> >>>>>> #define __LINUX_RCU_H
> >>>>>>
> >>>>>> +#include <linux/mm.h>
> >>>>>> #include <trace/events/rcu.h>
> >>>>>>
> >>>>>> /*
> >>>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
> >>>>>> rcu_head *head) }
> >>>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
> >>>>>>
> >>>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
> >>>>>> +	if (unlikely(!rhp->func))
> >>>>>> +		mem_dump_obj(rhp);
> >>>>>> +}
> >>>>>> +
> >>>>>> extern int rcu_cpu_stall_suppress_at_boot;
> >>>>>>
> >>>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
> >>>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
> >>>>>> 33adafdad261389..5e7f336baa06ae0 100644
> >>>>>> --- a/kernel/rcu/srcutiny.c
> >>>>>> +++ b/kernel/rcu/srcutiny.c
> >>>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
> >>>>>> 	while (lh) {
> >>>>>> 		rhp = lh;
> >>>>>> 		lh = lh->next;
> >>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>> 		local_bh_disable();
> >>>>>> 		rhp->func(rhp);
> >>>>>> 		local_bh_enable();
> >>>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
> >>>>>> ca4b5dcec675bac..294972e66b31863 100644
> >>>>>> --- a/kernel/rcu/srcutree.c
> >>>>>> +++ b/kernel/rcu/srcutree.c
> >>>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
> >>>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
> >>>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
> >>>>>> 		debug_rcu_head_unqueue(rhp);
> >>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>> 		local_bh_disable();
> >>>>>> 		rhp->func(rhp);
> >>>>>> 		local_bh_enable();
> >>>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
> >>>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
> >>>>>> --- a/kernel/rcu/tasks.h
> >>>>>> +++ b/kernel/rcu/tasks.h
> >>>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
> >>>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
> >>>>>> 	len = rcl.len;
> >>>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
> >>>>>> rcu_cblist_dequeue(&rcl)) {
> >>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>> 		local_bh_disable();
> >>>>>> 		rhp->func(rhp);
> >>>>>> 		local_bh_enable();
> >>>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
> >>>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
> >>>>>> --- a/kernel/rcu/tiny.c
> >>>>>> +++ b/kernel/rcu/tiny.c
> >>>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
> >>>>>> *head)
> >>>>>>
> >>>>>> 	trace_rcu_invoke_callback("", head);
> >>>>>> 	f = head->func;
> >>>>>> +	debug_rcu_head_callback(head);
> >>>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
> >>>>>> 	f(head);
> >>>>>> 	rcu_lock_release(&rcu_callback_map);
> >>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
> >>>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
> >>>>>> --- a/kernel/rcu/tree.c
> >>>>>> +++ b/kernel/rcu/tree.c
> >>>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >>>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
> >>>>>>
> >>>>>> 		f = rhp->func;
> >>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
> >>>>>> 		f(rhp);
> >>>>>>
> >>>>>> --
> >>>>>> 2.25.1
> >>>>>
> >>>>> .
> >>>>>
> >>>>
> >>>> --
> >>>> Regards,
> >>>>   Zhen Lei
> >>> .
> >>>
> >>
> >> -- 
> >> Regards,
> >>   Zhen Lei
> > .
> > 
> 
> -- 
> Regards,
>   Zhen Lei
Uladzislau Rezki Nov. 14, 2022, 6:22 p.m. UTC | #13
> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
> > On 2022/11/11 19:54, Zhang, Qiang1 wrote:
> > >> When a structure containing an RCU callback rhp is (incorrectly) 
> > >> freed and reallocated after rhp is passed to call_rcu(), it is not 
> > >> unusual for
> > >> rhp->func to be set to NULL. This defeats the debugging prints used 
> > >> rhp->by
> > >> __call_rcu_common() in kernels built with 
> > >> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
> > >> offending code using the identity of this function.
> > >>
> > >> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
> > >> are even worse, as can be seen from this splat:
> > >>
> > >> Unable to handle kernel NULL pointer dereference at virtual address 0 
> > >> ... ...
> > >> PC is at 0x0
> > >> LR is at rcu_do_batch+0x1c0/0x3b8
> > >> ... ...
> > >> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> > >> (rcu_core) from (__do_softirq+0x24c/0x344)
> > >> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> > >> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> > >> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> > >> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> > >> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> > >> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> > >> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> > >> (default_idle_call) from (do_idle+0xf8/0x150)
> > >> (do_idle) from (cpu_startup_entry+0x18/0x20)
> > >> (cpu_startup_entry) from (0xc01530)
> > >>
> > >> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
> > >> information, for example:
> > >>
> > >>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
> > >>
> > >> This provides the rough size of the memory block and the offset of 
> > >> the rcu_head structure, which as least provides at least a few clues 
> > >> to help locate the problem. If the problem is reproducible, 
> > >> additional slab debugging can be enabled, for example, 
> > >> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
> > >>
> > >> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> > >> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > >> ---
> > >> kernel/rcu/rcu.h      | 7 +++++++
> > >> kernel/rcu/srcutiny.c | 1 +
> > >> kernel/rcu/srcutree.c | 1 +
> > >> kernel/rcu/tasks.h    | 1 +
> > >> kernel/rcu/tiny.c     | 1 +
> > >> kernel/rcu/tree.c     | 1 +
> > >> 6 files changed, 12 insertions(+)
> > >>
> > >> v1 --> v2:
> > >> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
> > >> 2. Paul E. McKenney helped me update the commit message, thanks.
> > >>
> > > 
> > > Hi, Zhen Lei
> > > 
> > > Maybe the following scenarios should be considered:
> > > 
> > >                 CPU 0
> > > tasks context
> > >    spin_lock(&vmap_area_lock)
> > >           Interrupt 
> > > 	 RCU softirq
> > > 	      rcu_do_batch
> > > 		mem_dump_obj
> > >                                   vmalloc_dump_obj
> > >                                        spin_lock(&vmap_area_lock)   <--  deadlock     
> > 
> > >Right, thanks. I just saw the robot's report. So this patch should be dropped.
> > >I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
> > 
> > This is a workaround, or maybe try a modification like the following, 
> > of course, need to ask Paul's opinion.
> 
> Another approach is to schedule a workqueue handler to do the
> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
> environment.
> 
> This would allow vmalloc_dump_obj() to be called unconditionally.
> 
> Other thoughts?
> 
<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca71de7c9d77..956eb28f9c77 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2591,6 +2591,19 @@ struct vm_struct *find_vm_area(const void *addr)
 	return va->vm;
 }
 
+static struct vm_struct *
+find_vm_area_trylock(const void *addr)
+{
+	struct vmap_area *va = NULL;
+
+	if (spin_trylock(&vmap_area_lock)) {
+		va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
+		spin_unlock(&vmap_area_lock);
+	}
+
+	return va ? va->vm : NULL;
+}
+
 /**
  * remove_vm_area - find and remove a continuous kernel virtual area
  * @addr:	    base address
@@ -4048,7 +4061,7 @@ bool vmalloc_dump_obj(void *object)
 	struct vm_struct *vm;
 	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
 
-	vm = find_vm_area(objp);
+	vm = find_vm_area_trylock(objp);
 	if (!vm)
 		return false;
 	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
<snip>

There is one issue though: it is not correct to reference vm->members
after the lock is dropped, because of use-after-free bugs. It is better
to embed the search, read out nr_pages, addr, and caller, and then drop
the lock.

--
Uladzislau Rezki
Leizhen (ThunderTown) Nov. 16, 2022, 2:43 p.m. UTC | #14
On 2022/11/15 0:06, Paul E. McKenney wrote:
> On Mon, Nov 14, 2022 at 03:18:10PM +0800, Leizhen (ThunderTown) wrote:
>> On 2022/11/12 14:08, Paul E. McKenney wrote:
>>> On Sat, Nov 12, 2022 at 10:32:32AM +0800, Leizhen (ThunderTown) wrote:
>>>> On 2022/11/12 2:42, Paul E. McKenney wrote:
>>>>> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
>>>>>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
>>>>>>>> When a structure containing an RCU callback rhp is (incorrectly) 
>>>>>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
>>>>>>>> unusual for
>>>>>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
>>>>>>>> rhp->by
>>>>>>>> __call_rcu_common() in kernels built with 
>>>>>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
>>>>>>>> offending code using the identity of this function.
>>>>>>>>
>>>>>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
>>>>>>>> are even worse, as can be seen from this splat:
>>>>>>>>
>>>>>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
>>>>>>>> ... ...
>>>>>>>> PC is at 0x0
>>>>>>>> LR is at rcu_do_batch+0x1c0/0x3b8
>>>>>>>> ... ...
>>>>>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
>>>>>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
>>>>>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
>>>>>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
>>>>>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
>>>>>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
>>>>>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
>>>>>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
>>>>>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
>>>>>>>> (default_idle_call) from (do_idle+0xf8/0x150)
>>>>>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
>>>>>>>> (cpu_startup_entry) from (0xc01530)
>>>>>>>>
>>>>>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
>>>>>>>> information, for example:
>>>>>>>>
>>>>>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>>>>>>>>
>>>>>>>> This provides the rough size of the memory block and the offset of 
>>>>>>>> the rcu_head structure, which as least provides at least a few clues 
>>>>>>>> to help locate the problem. If the problem is reproducible, 
>>>>>>>> additional slab debugging can be enabled, for example, 
>>>>>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
>>>>>>>>
>>>>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>>>>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>>>>>>>> ---
>>>>>>>> kernel/rcu/rcu.h      | 7 +++++++
>>>>>>>> kernel/rcu/srcutiny.c | 1 +
>>>>>>>> kernel/rcu/srcutree.c | 1 +
>>>>>>>> kernel/rcu/tasks.h    | 1 +
>>>>>>>> kernel/rcu/tiny.c     | 1 +
>>>>>>>> kernel/rcu/tree.c     | 1 +
>>>>>>>> 6 files changed, 12 insertions(+)
>>>>>>>>
>>>>>>>> v1 --> v2:
>>>>>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
>>>>>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
>>>>>>>>
>>>>>>>
>>>>>>> Hi, Zhen Lei
>>>>>>>
>>>>>>> Maybe the following scenarios should be considered:
>>>>>>>
>>>>>>>                 CPU 0
>>>>>>> tasks context
>>>>>>>    spin_lock(&vmap_area_lock)
>>>>>>>           Interrupt 
>>>>>>> 	 RCU softirq
>>>>>>> 	      rcu_do_batch
>>>>>>> 		mem_dump_obj
>>>>>>>                                   vmalloc_dump_obj
>>>>>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
>>>>>>
>>>>>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
>>>>>>> I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
>>>>>>
>>>>>> This is a workaround, or maybe try a modification like the following, 
>>>>>> of course, need to ask Paul's opinion.
>>>>>
>>>>> Another approach is to schedule a workqueue handler to do the
>>>>> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
>>>>> environment.
>>>>
>>>> It's about to panic, so no chance to schedule.
>>>
>>> It won't panic if you drop the callback on the floor.
>>>
>>> Though to your point, the ->next pointer is likely also trashed.  So you
>>> could just drop the remainder of the callback list on the floor.  That
>>> might provide a good (though not perfect) chance of getting decent output.
>>
>> OK, I think I understand what you mean.
>> if (!f)
>> 	schedule_work(&work);
>> else
>> 	f(rhp)
> 
> Yes, except that the "schedule_work()" also needs to be accompanied
> by something that refuses to execute the rest of those callbacks.
> This needs to break out of the loop (or return) and to adjust counts,
> among other things.  This might be as easy as setting count to the
> negative of the length of the "rcl" list, but does need some attention
> to the code following the callback-invocation loop.

Yes, doing so would cause other problems. As you mentioned, the ->next
pointer is likely also trashed, and some nodes may need to be executed
in sequence. For such a weak debugging feature, it's not worth the risk
or the added complexity.

> 
> 							Thanx, Paul
> 
>>>>> This would allow vmalloc_dump_obj() to be called unconditionally.
>>>>>
>>>>> Other thoughts?
>>>>
>>>> locked = spin_is_locked(&vmap_area_lock);
>>>> if (!locked)
>>>>     spin_lock(&vmap_area_lock)
>>>>
>>>> Careful analysis is required, which may cause other problems.
>>>>
>>>> Or in new function:
>>>> if (locked)
>>>>     return;
>>>> spin_lock(&vmap_area_lock);
>>>>
>>>> If there is a chance to dump the data, dump the data. If there is no
>>>> chance to dump the data, do not dump the data. This is the fate of
>>>> debugging information.
>>>
>>> My concern is that there will be increasing numbers of special cases
>>> over time.

The memory-management code is mature and stable, so your concern may not materialize in practice.

>>
>> OK, I got it.
>>
>>>
>>> 							Thanx, Paul
>>>
>>>>>> diff --git a/mm/util.c b/mm/util.c
>>>>>> index 12984e76767e..86da0739fe5d 100644
>>>>>> --- a/mm/util.c
>>>>>> +++ b/mm/util.c
>>>>>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
>>>>>>  {
>>>>>>         const char *type;
>>>>>>
>>>>>> +       if (is_vmalloc_addr(object)) {
>>>>>> +               if (in_task() && vmalloc_dump_obj(object))
>>>>>> +                       return;
>>>>>> +               type = "vmalloc memory";
>>>>>> +               goto end;
>>>>>> +       }
>>>>>> +
>>>>>>         if (kmem_valid_obj(object)) {
>>>>>>                 kmem_dump_obj(object);
>>>>>>                 return;
>>>>>>         }
>>>>>>
>>>>>> -       if (vmalloc_dump_obj(object))
>>>>>> -               return;
>>>>>> -
>>>>>>         if (virt_addr_valid(object))
>>>>>>                 type = "non-slab/vmalloc memory";
>>>>>>         else if (object == NULL)
>>>>>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
>>>>>>                 type = "zero-size pointer";
>>>>>>         else
>>>>>>                 type = "non-paged memory";
>>>>>> -
>>>>>> +end:
>>>>>>         pr_cont(" %s\n", type);
>>>>>>  }
>>>>>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
>>>>>>
>>>>>> Thanks
>>>>>> Zqiang
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Thanks
>>>>>>> Zqiang
>>>>>>>
>>>>>>>
>>>>>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
>>>>>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
>>>>>>>> --- a/kernel/rcu/rcu.h
>>>>>>>> +++ b/kernel/rcu/rcu.h
>>>>>>>> @@ -10,6 +10,7 @@
>>>>>>>> #ifndef __LINUX_RCU_H
>>>>>>>> #define __LINUX_RCU_H
>>>>>>>>
>>>>>>>> +#include <linux/mm.h>
>>>>>>>> #include <trace/events/rcu.h>
>>>>>>>>
>>>>>>>> /*
>>>>>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
>>>>>>>> rcu_head *head) }
>>>>>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
>>>>>>>>
>>>>>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
>>>>>>>> +	if (unlikely(!rhp->func))
>>>>>>>> +		mem_dump_obj(rhp);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> extern int rcu_cpu_stall_suppress_at_boot;
>>>>>>>>
>>>>>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
>>>>>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
>>>>>>>> 33adafdad261389..5e7f336baa06ae0 100644
>>>>>>>> --- a/kernel/rcu/srcutiny.c
>>>>>>>> +++ b/kernel/rcu/srcutiny.c
>>>>>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
>>>>>>>> 	while (lh) {
>>>>>>>> 		rhp = lh;
>>>>>>>> 		lh = lh->next;
>>>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>>>> 		local_bh_disable();
>>>>>>>> 		rhp->func(rhp);
>>>>>>>> 		local_bh_enable();
>>>>>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
>>>>>>>> ca4b5dcec675bac..294972e66b31863 100644
>>>>>>>> --- a/kernel/rcu/srcutree.c
>>>>>>>> +++ b/kernel/rcu/srcutree.c
>>>>>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
>>>>>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
>>>>>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
>>>>>>>> 		debug_rcu_head_unqueue(rhp);
>>>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>>>> 		local_bh_disable();
>>>>>>>> 		rhp->func(rhp);
>>>>>>>> 		local_bh_enable();
>>>>>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
>>>>>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
>>>>>>>> --- a/kernel/rcu/tasks.h
>>>>>>>> +++ b/kernel/rcu/tasks.h
>>>>>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
>>>>>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
>>>>>>>> 	len = rcl.len;
>>>>>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
>>>>>>>> rcu_cblist_dequeue(&rcl)) {
>>>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>>>> 		local_bh_disable();
>>>>>>>> 		rhp->func(rhp);
>>>>>>>> 		local_bh_enable();
>>>>>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
>>>>>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
>>>>>>>> --- a/kernel/rcu/tiny.c
>>>>>>>> +++ b/kernel/rcu/tiny.c
>>>>>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
>>>>>>>> *head)
>>>>>>>>
>>>>>>>> 	trace_rcu_invoke_callback("", head);
>>>>>>>> 	f = head->func;
>>>>>>>> +	debug_rcu_head_callback(head);
>>>>>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
>>>>>>>> 	f(head);
>>>>>>>> 	rcu_lock_release(&rcu_callback_map);
>>>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
>>>>>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
>>>>>>>> --- a/kernel/rcu/tree.c
>>>>>>>> +++ b/kernel/rcu/tree.c
>>>>>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
>>>>>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
>>>>>>>>
>>>>>>>> 		f = rhp->func;
>>>>>>>> +		debug_rcu_head_callback(rhp);
>>>>>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
>>>>>>>> 		f(rhp);
>>>>>>>>
>>>>>>>> --
>>>>>>>> 2.25.1
>>>>>>>
>>>>>>> .
>>>>>>>
>>>>>>
>>>>>> --
>>>>>> Regards,
>>>>>>   Zhen Lei
>>>>> .
>>>>>
>>>>
>>>> -- 
>>>> Regards,
>>>>   Zhen Lei
>>> .
>>>
>>
>> -- 
>> Regards,
>>   Zhen Lei
> .
>
Paul E. McKenney Nov. 16, 2022, 7:57 p.m. UTC | #15
On Wed, Nov 16, 2022 at 10:43:57PM +0800, Leizhen (ThunderTown) wrote:
> On 2022/11/15 0:06, Paul E. McKenney wrote:
> > On Mon, Nov 14, 2022 at 03:18:10PM +0800, Leizhen (ThunderTown) wrote:
> >> On 2022/11/12 14:08, Paul E. McKenney wrote:
> >>> On Sat, Nov 12, 2022 at 10:32:32AM +0800, Leizhen (ThunderTown) wrote:
> >>>> On 2022/11/12 2:42, Paul E. McKenney wrote:
> >>>>> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
> >>>>>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
> >>>>>>>> When a structure containing an RCU callback rhp is (incorrectly) 
> >>>>>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
> >>>>>>>> unusual for
> >>>>>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
> >>>>>>>> rhp->by
> >>>>>>>> __call_rcu_common() in kernels built with 
> >>>>>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
> >>>>>>>> offending code using the identity of this function.
> >>>>>>>>
> >>>>>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
> >>>>>>>> are even worse, as can be seen from this splat:
> >>>>>>>>
> >>>>>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
> >>>>>>>> ... ...
> >>>>>>>> PC is at 0x0
> >>>>>>>> LR is at rcu_do_batch+0x1c0/0x3b8
> >>>>>>>> ... ...
> >>>>>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> >>>>>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
> >>>>>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> >>>>>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> >>>>>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> >>>>>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> >>>>>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> >>>>>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> >>>>>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> >>>>>>>> (default_idle_call) from (do_idle+0xf8/0x150)
> >>>>>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
> >>>>>>>> (cpu_startup_entry) from (0xc01530)
> >>>>>>>>
> >>>>>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
> >>>>>>>> information, for example:
> >>>>>>>>
> >>>>>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
> >>>>>>>>
> >>>>>>>> This provides the rough size of the memory block and the offset of 
> >>>>>>>> the rcu_head structure, which as least provides at least a few clues 
> >>>>>>>> to help locate the problem. If the problem is reproducible, 
> >>>>>>>> additional slab debugging can be enabled, for example, 
> >>>>>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >>>>>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >>>>>>>> ---
> >>>>>>>> kernel/rcu/rcu.h      | 7 +++++++
> >>>>>>>> kernel/rcu/srcutiny.c | 1 +
> >>>>>>>> kernel/rcu/srcutree.c | 1 +
> >>>>>>>> kernel/rcu/tasks.h    | 1 +
> >>>>>>>> kernel/rcu/tiny.c     | 1 +
> >>>>>>>> kernel/rcu/tree.c     | 1 +
> >>>>>>>> 6 files changed, 12 insertions(+)
> >>>>>>>>
> >>>>>>>> v1 --> v2:
> >>>>>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
> >>>>>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
> >>>>>>>>
> >>>>>>>
> >>>>>>> Hi, Zhen Lei
> >>>>>>>
> >>>>>>> Maybe the following scenarios should be considered:
> >>>>>>>
> >>>>>>>                 CPU 0
> >>>>>>> tasks context
> >>>>>>>    spin_lock(&vmap_area_lock)
> >>>>>>>           Interrupt 
> >>>>>>> 	 RCU softirq
> >>>>>>> 	      rcu_do_batch
> >>>>>>> 		mem_dump_obj
> >>>>>>>                                   vmalloc_dump_obj
> >>>>>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
> >>>>>>
> >>>>>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
> >>>>>>> I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
> >>>>>>
> >>>>>> This is a workaround, or maybe try a modification like the following, 
> >>>>>> of course, need to ask Paul's opinion.
> >>>>>
> >>>>> Another approach is to schedule a workqueue handler to do the
> >>>>> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
> >>>>> environment.
> >>>>
> >>>> It's about to panic, so no chance to schedule.
> >>>
> >>> It won't panic if you drop the callback on the floor.
> >>>
> >>> Though to your point, the ->next pointer is likely also trashed.  So you
> >>> could just drop the remainder of the callback list on the floor.  That
> >>> might provide a good (though not perfect) chance of getting decent output.
> >>
> >> OK, I think I understand what you mean.
> >> if (!f)
> >> 	schedule_work(&work);
> >> else
> >> 	f(rhp)
> > 
> > Yes, except that the "schedule_work()" also needs to be accompanied
> > by something that refuses to execute the rest of those callbacks.
> > This needs to break out of the loop (or return) and to adjust counts,
> > among other things.  This might be as easy as setting count to the
> > negative of the length of the "rcl" list, but does need some attention
> > to the code following the callback-invocation loop.
> 
> Yes, doing so would cause other problems. As you mentioned, the ->next
> pointer is likely also trashed. Some nodes may need to be executed in
> sequence. For such a weak debug function, it's not worth the risk, or
> overly complicated thinking.

Do we have similar deadlock issues with the calls to mem_dump_obj() in
the call_rcu() code path?  These are somewhat less concerning because
they are invoked under a Kconfig option that is (as far as I know)
rarely set, but still...

							Thanx, Paul

> >>>>> This would allow vmalloc_dump_obj() to be called unconditionally.
> >>>>>
> >>>>> Other thoughts?
> >>>>
> >>>> locked = spin_is_locked(&vmap_area_lock);
> >>>> if (!locked)
> >>>>     spin_lock(&vmap_area_lock)
> >>>>
> >>>> Careful analysis is required, which may cause other problems.
> >>>>
> >>>> Or in new function:
> >>>> if (locked)
> >>>>     return;
> >>>> spin_lock(&vmap_area_lock);
> >>>>
> >>>> If there is a chance to dump the data, dump the data. If there is no
> >>>> chance to dump the data, do not dump the data. This is the fate of
> >>>> debugging information.
> >>>
> >>> My concern is that there will be increasing numbers of special cases
> >>> over time.
> 
> The memory modules are mature and stable, so your concerns may not be true.
> 
> >>
> >> OK, I got it.
> >>
> >>>
> >>> 							Thanx, Paul
> >>>
> >>>>>> diff --git a/mm/util.c b/mm/util.c
> >>>>>> index 12984e76767e..86da0739fe5d 100644
> >>>>>> --- a/mm/util.c
> >>>>>> +++ b/mm/util.c
> >>>>>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
> >>>>>>  {
> >>>>>>         const char *type;
> >>>>>>
> >>>>>> +       if (is_vmalloc_addr(object)) {
> >>>>>> +               if (in_task() && vmalloc_dump_obj(object))
> >>>>>> +                       return;
> >>>>>> +               type = "vmalloc memory";
> >>>>>> +               goto end;
> >>>>>> +       }
> >>>>>> +
> >>>>>>         if (kmem_valid_obj(object)) {
> >>>>>>                 kmem_dump_obj(object);
> >>>>>>                 return;
> >>>>>>         }
> >>>>>>
> >>>>>> -       if (vmalloc_dump_obj(object))
> >>>>>> -               return;
> >>>>>> -
> >>>>>>         if (virt_addr_valid(object))
> >>>>>>                 type = "non-slab/vmalloc memory";
> >>>>>>         else if (object == NULL)
> >>>>>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
> >>>>>>                 type = "zero-size pointer";
> >>>>>>         else
> >>>>>>                 type = "non-paged memory";
> >>>>>> -
> >>>>>> +end:
> >>>>>>         pr_cont(" %s\n", type);
> >>>>>>  }
> >>>>>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
> >>>>>>
> >>>>>> Thanks
> >>>>>> Zqiang
> >>>>>>
> >>>>>>
> >>>>>>>
> >>>>>>> Thanks
> >>>>>>> Zqiang
> >>>>>>>
> >>>>>>>
> >>>>>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
> >>>>>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
> >>>>>>>> --- a/kernel/rcu/rcu.h
> >>>>>>>> +++ b/kernel/rcu/rcu.h
> >>>>>>>> @@ -10,6 +10,7 @@
> >>>>>>>> #ifndef __LINUX_RCU_H
> >>>>>>>> #define __LINUX_RCU_H
> >>>>>>>>
> >>>>>>>> +#include <linux/mm.h>
> >>>>>>>> #include <trace/events/rcu.h>
> >>>>>>>>
> >>>>>>>> /*
> >>>>>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
> >>>>>>>> rcu_head *head) }
> >>>>>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
> >>>>>>>>
> >>>>>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
> >>>>>>>> +	if (unlikely(!rhp->func))
> >>>>>>>> +		mem_dump_obj(rhp);
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> extern int rcu_cpu_stall_suppress_at_boot;
> >>>>>>>>
> >>>>>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
> >>>>>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
> >>>>>>>> 33adafdad261389..5e7f336baa06ae0 100644
> >>>>>>>> --- a/kernel/rcu/srcutiny.c
> >>>>>>>> +++ b/kernel/rcu/srcutiny.c
> >>>>>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
> >>>>>>>> 	while (lh) {
> >>>>>>>> 		rhp = lh;
> >>>>>>>> 		lh = lh->next;
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		local_bh_disable();
> >>>>>>>> 		rhp->func(rhp);
> >>>>>>>> 		local_bh_enable();
> >>>>>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
> >>>>>>>> ca4b5dcec675bac..294972e66b31863 100644
> >>>>>>>> --- a/kernel/rcu/srcutree.c
> >>>>>>>> +++ b/kernel/rcu/srcutree.c
> >>>>>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
> >>>>>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
> >>>>>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
> >>>>>>>> 		debug_rcu_head_unqueue(rhp);
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		local_bh_disable();
> >>>>>>>> 		rhp->func(rhp);
> >>>>>>>> 		local_bh_enable();
> >>>>>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
> >>>>>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
> >>>>>>>> --- a/kernel/rcu/tasks.h
> >>>>>>>> +++ b/kernel/rcu/tasks.h
> >>>>>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
> >>>>>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
> >>>>>>>> 	len = rcl.len;
> >>>>>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
> >>>>>>>> rcu_cblist_dequeue(&rcl)) {
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		local_bh_disable();
> >>>>>>>> 		rhp->func(rhp);
> >>>>>>>> 		local_bh_enable();
> >>>>>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
> >>>>>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
> >>>>>>>> --- a/kernel/rcu/tiny.c
> >>>>>>>> +++ b/kernel/rcu/tiny.c
> >>>>>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
> >>>>>>>> *head)
> >>>>>>>>
> >>>>>>>> 	trace_rcu_invoke_callback("", head);
> >>>>>>>> 	f = head->func;
> >>>>>>>> +	debug_rcu_head_callback(head);
> >>>>>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
> >>>>>>>> 	f(head);
> >>>>>>>> 	rcu_lock_release(&rcu_callback_map);
> >>>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
> >>>>>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
> >>>>>>>> --- a/kernel/rcu/tree.c
> >>>>>>>> +++ b/kernel/rcu/tree.c
> >>>>>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >>>>>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
> >>>>>>>>
> >>>>>>>> 		f = rhp->func;
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
> >>>>>>>> 		f(rhp);
> >>>>>>>>
> >>>>>>>> --
> >>>>>>>> 2.25.1
> >>>>>>>
> >>>>>>> .
> >>>>>>>
> >>>>>>
> >>>>>> --
> >>>>>> Regards,
> >>>>>>   Zhen Lei
> >>>>> .
> >>>>>
> >>>>
> >>>> -- 
> >>>> Regards,
> >>>>   Zhen Lei
> >>> .
> >>>
> >>
> >> -- 
> >> Regards,
> >>   Zhen Lei
> > .
> > 
> 
> -- 
> Regards,
>   Zhen Lei
Zqiang Nov. 17, 2022, 12:09 a.m. UTC | #16
On Wed, Nov 16, 2022 at 10:43:57PM +0800, Leizhen (ThunderTown) wrote:
> On 2022/11/15 0:06, Paul E. McKenney wrote:
> > On Mon, Nov 14, 2022 at 03:18:10PM +0800, Leizhen (ThunderTown) wrote:
> >> On 2022/11/12 14:08, Paul E. McKenney wrote:
> >>> On Sat, Nov 12, 2022 at 10:32:32AM +0800, Leizhen (ThunderTown) wrote:
> >>>> On 2022/11/12 2:42, Paul E. McKenney wrote:
> >>>>> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
> >>>>>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
> >>>>>>>> When a structure containing an RCU callback rhp is (incorrectly) 
> >>>>>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
> >>>>>>>> unusual for
> >>>>>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
> >>>>>>>> rhp->by
> >>>>>>>> __call_rcu_common() in kernels built with 
> >>>>>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
> >>>>>>>> offending code using the identity of this function.
> >>>>>>>>
> >>>>>>>> And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
> >>>>>>>> are even worse, as can be seen from this splat:
> >>>>>>>>
> >>>>>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
> >>>>>>>> ... ...
> >>>>>>>> PC is at 0x0
> >>>>>>>> LR is at rcu_do_batch+0x1c0/0x3b8
> >>>>>>>> ... ...
> >>>>>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
> >>>>>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
> >>>>>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
> >>>>>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
> >>>>>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
> >>>>>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
> >>>>>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
> >>>>>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
> >>>>>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
> >>>>>>>> (default_idle_call) from (do_idle+0xf8/0x150)
> >>>>>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
> >>>>>>>> (cpu_startup_entry) from (0xc01530)
> >>>>>>>>
> >>>>>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
> >>>>>>>> information, for example:
> >>>>>>>>
> >>>>>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
> >>>>>>>>
> >>>>>>>> This provides the rough size of the memory block and the offset of 
> >>>>>>>> the rcu_head structure, which as least provides at least a few clues 
> >>>>>>>> to help locate the problem. If the problem is reproducible, 
> >>>>>>>> additional slab debugging can be enabled, for example, 
> >>>>>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >>>>>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> >>>>>>>> ---
> >>>>>>>> kernel/rcu/rcu.h      | 7 +++++++
> >>>>>>>> kernel/rcu/srcutiny.c | 1 +
> >>>>>>>> kernel/rcu/srcutree.c | 1 +
> >>>>>>>> kernel/rcu/tasks.h    | 1 +
> >>>>>>>> kernel/rcu/tiny.c     | 1 +
> >>>>>>>> kernel/rcu/tree.c     | 1 +
> >>>>>>>> 6 files changed, 12 insertions(+)
> >>>>>>>>
> >>>>>>>> v1 --> v2:
> >>>>>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
> >>>>>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
> >>>>>>>>
> >>>>>>>
> >>>>>>> Hi, Zhen Lei
> >>>>>>>
> >>>>>>> Maybe the following scenarios should be considered:
> >>>>>>>
> >>>>>>>                 CPU 0
> >>>>>>> tasks context
> >>>>>>>    spin_lock(&vmap_area_lock)
> >>>>>>>           Interrupt 
> >>>>>>> 	 RCU softirq
> >>>>>>> 	      rcu_do_batch
> >>>>>>> 		mem_dump_obj
> >>>>>>>                                   vmalloc_dump_obj
> >>>>>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
> >>>>>>
> >>>>>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
> >>>>>>> I'll try to add an helper in mm, where I can check whether the lock has been held, and dump the content of memory object.
> >>>>>>
> >>>>>> This is a workaround, or maybe try a modification like the following, 
> >>>>>> of course, need to ask Paul's opinion.
> >>>>>
> >>>>> Another approach is to schedule a workqueue handler to do the
> >>>>> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
> >>>>> environment.
> >>>>
> >>>> It's about to panic, so no chance to schedule.
> >>>
> >>> It won't panic if you drop the callback on the floor.
> >>>
> >>> Though to your point, the ->next pointer is likely also trashed.  So you
> >>> could just drop the remainder of the callback list on the floor.  That
> >>> might provide a good (though not perfect) chance of getting decent output.
> >>
> >> OK, I think I understand what you mean.
> >> if (!f)
> >> 	schedule_work(&work);
> >> else
> >> 	f(rhp)
> > 
> > Yes, except that the "schedule_work()" also needs to be accompanied
> > by something that refuses to execute the rest of those callbacks.
> > This needs to break out of the loop (or return) and to adjust counts,
> > among other things.  This might be as easy as setting count to the
> > negative of the length of the "rcl" list, but does need some attention
> > to the code following the callback-invocation loop.
> 
> Yes, doing so would cause other problems. As you mentioned, the ->next
> pointer is likely also trashed. Some nodes may need to be executed in
> sequence. For such a weak debug function, it's not worth the risk, or
> overly complicated thinking.

>Do we have similar deadlock issues with the calls to mem_dump_obj() in
>the call_rcu() code path? 

I think it exists, and the PREEMPT_RT kernel should also be considered, because there the
vmap_area_lock spinlock is converted to a sleepable lock. I have sent a version of the patch.

Thanks
Zqiang

> These are somewhat less concerning because
>they are invoked under a Kconfig option that is (as far as I know)
>rarely set, but still...
>
>							Thanx, Paul

> >>>>> This would allow vmalloc_dump_obj() to be called unconditionally.
> >>>>>
> >>>>> Other thoughts?
> >>>>
> >>>> locked = spin_is_locked(&vmap_area_lock);
> >>>> if (!locked)
> >>>>     spin_lock(&vmap_area_lock)
> >>>>
> >>>> Careful analysis is required, which may cause other problems.
> >>>>
> >>>> Or in new function:
> >>>> if (locked)
> >>>>     return;
> >>>> spin_lock(&vmap_area_lock);
> >>>>
> >>>> If there is a chance to dump the data, dump the data. If there is no
> >>>> chance to dump the data, do not dump the data. This is the fate of
> >>>> debugging information.
> >>>
> >>> My concern is that there will be increasing numbers of special cases
> >>> over time.
> 
> The memory modules are mature and stable, so your concerns may not be true.
> 
> >>
> >> OK, I got it.
> >>
> >>>
> >>> 							Thanx, Paul
> >>>
> >>>>>> diff --git a/mm/util.c b/mm/util.c
> >>>>>> index 12984e76767e..86da0739fe5d 100644
> >>>>>> --- a/mm/util.c
> >>>>>> +++ b/mm/util.c
> >>>>>> @@ -1119,14 +1119,18 @@ void mem_dump_obj(void *object)
> >>>>>>  {
> >>>>>>         const char *type;
> >>>>>>
> >>>>>> +       if (is_vmalloc_addr(object)) {
> >>>>>> +               if (in_task() && vmalloc_dump_obj(object))
> >>>>>> +                       return;
> >>>>>> +               type = "vmalloc memory";
> >>>>>> +               goto end;
> >>>>>> +       }
> >>>>>> +
> >>>>>>         if (kmem_valid_obj(object)) {
> >>>>>>                 kmem_dump_obj(object);
> >>>>>>                 return;
> >>>>>>         }
> >>>>>>
> >>>>>> -       if (vmalloc_dump_obj(object))
> >>>>>> -               return;
> >>>>>> -
> >>>>>>         if (virt_addr_valid(object))
> >>>>>>                 type = "non-slab/vmalloc memory";
> >>>>>>         else if (object == NULL)
> >>>>>> @@ -1135,7 +1139,7 @@ void mem_dump_obj(void *object)
> >>>>>>                 type = "zero-size pointer";
> >>>>>>         else
> >>>>>>                 type = "non-paged memory";
> >>>>>> -
> >>>>>> +end:
> >>>>>>         pr_cont(" %s\n", type);
> >>>>>>  }
> >>>>>>  EXPORT_SYMBOL_GPL(mem_dump_obj);
> >>>>>>
> >>>>>> Thanks
> >>>>>> Zqiang
> >>>>>>
> >>>>>>
> >>>>>>>
> >>>>>>> Thanks
> >>>>>>> Zqiang
> >>>>>>>
> >>>>>>>
> >>>>>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 
> >>>>>>>> 65704cbc9df7b3d..32ab45fabf8eebf 100644
> >>>>>>>> --- a/kernel/rcu/rcu.h
> >>>>>>>> +++ b/kernel/rcu/rcu.h
> >>>>>>>> @@ -10,6 +10,7 @@
> >>>>>>>> #ifndef __LINUX_RCU_H
> >>>>>>>> #define __LINUX_RCU_H
> >>>>>>>>
> >>>>>>>> +#include <linux/mm.h>
> >>>>>>>> #include <trace/events/rcu.h>
> >>>>>>>>
> >>>>>>>> /*
> >>>>>>>> @@ -211,6 +212,12 @@ static inline void debug_rcu_head_unqueue(struct 
> >>>>>>>> rcu_head *head) }
> >>>>>>>> #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
> >>>>>>>>
> >>>>>>>> +static inline void debug_rcu_head_callback(struct rcu_head *rhp) {
> >>>>>>>> +	if (unlikely(!rhp->func))
> >>>>>>>> +		mem_dump_obj(rhp);
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> extern int rcu_cpu_stall_suppress_at_boot;
> >>>>>>>>
> >>>>>>>> static inline bool rcu_stall_is_suppressed_at_boot(void)
> >>>>>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 
> >>>>>>>> 33adafdad261389..5e7f336baa06ae0 100644
> >>>>>>>> --- a/kernel/rcu/srcutiny.c
> >>>>>>>> +++ b/kernel/rcu/srcutiny.c
> >>>>>>>> @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
> >>>>>>>> 	while (lh) {
> >>>>>>>> 		rhp = lh;
> >>>>>>>> 		lh = lh->next;
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		local_bh_disable();
> >>>>>>>> 		rhp->func(rhp);
> >>>>>>>> 		local_bh_enable();
> >>>>>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 
> >>>>>>>> ca4b5dcec675bac..294972e66b31863 100644
> >>>>>>>> --- a/kernel/rcu/srcutree.c
> >>>>>>>> +++ b/kernel/rcu/srcutree.c
> >>>>>>>> @@ -1631,6 +1631,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
> >>>>>>>> 	rhp = rcu_cblist_dequeue(&ready_cbs);
> >>>>>>>> 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
> >>>>>>>> 		debug_rcu_head_unqueue(rhp);
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		local_bh_disable();
> >>>>>>>> 		rhp->func(rhp);
> >>>>>>>> 		local_bh_enable();
> >>>>>>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 
> >>>>>>>> b0b885e071fa8dc..b7f8c67c586cdc4 100644
> >>>>>>>> --- a/kernel/rcu/tasks.h
> >>>>>>>> +++ b/kernel/rcu/tasks.h
> >>>>>>>> @@ -478,6 +478,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
> >>>>>>>> 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
> >>>>>>>> 	len = rcl.len;
> >>>>>>>> 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = 
> >>>>>>>> rcu_cblist_dequeue(&rcl)) {
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		local_bh_disable();
> >>>>>>>> 		rhp->func(rhp);
> >>>>>>>> 		local_bh_enable();
> >>>>>>>> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 
> >>>>>>>> bb8f7d270f01747..56e9a5d91d97ec5 100644
> >>>>>>>> --- a/kernel/rcu/tiny.c
> >>>>>>>> +++ b/kernel/rcu/tiny.c
> >>>>>>>> @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head 
> >>>>>>>> *head)
> >>>>>>>>
> >>>>>>>> 	trace_rcu_invoke_callback("", head);
> >>>>>>>> 	f = head->func;
> >>>>>>>> +	debug_rcu_head_callback(head);
> >>>>>>>> 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
> >>>>>>>> 	f(head);
> >>>>>>>> 	rcu_lock_release(&rcu_callback_map);
> >>>>>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 
> >>>>>>>> 15aaff3203bf2d0..ed93ddb8203d42c 100644
> >>>>>>>> --- a/kernel/rcu/tree.c
> >>>>>>>> +++ b/kernel/rcu/tree.c
> >>>>>>>> @@ -2088,6 +2088,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >>>>>>>> 		trace_rcu_invoke_callback(rcu_state.name, rhp);
> >>>>>>>>
> >>>>>>>> 		f = rhp->func;
> >>>>>>>> +		debug_rcu_head_callback(rhp);
> >>>>>>>> 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
> >>>>>>>> 		f(rhp);
> >>>>>>>>
> >>>>>>>> --
> >>>>>>>> 2.25.1
> >>>>>>>
> >>>>>>> .
> >>>>>>>
> >>>>>>
> >>>>>> --
> >>>>>> Regards,
> >>>>>>   Zhen Lei
> >>>>> .
> >>>>>
> >>>>
> >>>> -- 
> >>>> Regards,
> >>>>   Zhen Lei
> >>> .
> >>>
> >>
> >> -- 
> >> Regards,
> >>   Zhen Lei
> > .
> > 
> 
> -- 
> Regards,
>   Zhen Lei
Leizhen (ThunderTown) Nov. 17, 2022, 1:29 a.m. UTC | #17
On 2022/11/15 2:22, Uladzislau Rezki wrote:
>> On Fri, Nov 11, 2022 at 01:05:56PM +0000, Zhang, Qiang1 wrote:
>>> On 2022/11/11 19:54, Zhang, Qiang1 wrote:
>>>>> When a structure containing an RCU callback rhp is (incorrectly) 
>>>>> freed and reallocated after rhp is passed to call_rcu(), it is not 
>>>>> unusual for
>>>>> rhp->func to be set to NULL. This defeats the debugging prints used 
>>>>> rhp->by
>>>>> __call_rcu_common() in kernels built with 
>>>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the 
>>>>> offending code using the identity of this function.
>>>>>
> >>>>> And in kernels built without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things 
>>>>> are even worse, as can be seen from this splat:
>>>>>
>>>>> Unable to handle kernel NULL pointer dereference at virtual address 0 
>>>>> ... ...
>>>>> PC is at 0x0
>>>>> LR is at rcu_do_batch+0x1c0/0x3b8
>>>>> ... ...
>>>>> (rcu_do_batch) from (rcu_core+0x1d4/0x284)
>>>>> (rcu_core) from (__do_softirq+0x24c/0x344)
>>>>> (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
>>>>> (__irq_exit_rcu) from (irq_exit+0x8/0x10)
>>>>> (irq_exit) from (__handle_domain_irq+0x74/0x9c)
>>>>> (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
>>>>> (gic_handle_irq) from (__irq_svc+0x5c/0x94)
>>>>> (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
>>>>> (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
>>>>> (default_idle_call) from (do_idle+0xf8/0x150)
>>>>> (do_idle) from (cpu_startup_entry+0x18/0x20)
>>>>> (cpu_startup_entry) from (0xc01530)
>>>>>
>>>>> This commit therefore adds calls to mem_dump_obj(rhp) to output some 
>>>>> information, for example:
>>>>>
>>>>>  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256
>>>>>
>>>>> This provides the rough size of the memory block and the offset of 
> >>>>> the rcu_head structure, which at least provides a few clues 
>>>>> to help locate the problem. If the problem is reproducible, 
>>>>> additional slab debugging can be enabled, for example, 
>>>>> CONFIG_DEBUG_SLAB=y, which can provide significantly more information.
>>>>>
>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>>>> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>>>>> ---
>>>>> kernel/rcu/rcu.h      | 7 +++++++
>>>>> kernel/rcu/srcutiny.c | 1 +
>>>>> kernel/rcu/srcutree.c | 1 +
>>>>> kernel/rcu/tasks.h    | 1 +
>>>>> kernel/rcu/tiny.c     | 1 +
>>>>> kernel/rcu/tree.c     | 1 +
>>>>> 6 files changed, 12 insertions(+)
>>>>>
>>>>> v1 --> v2:
>>>>> 1. Remove condition "(unsigned long)rhp->func & 0x3", it have problems on x86.
>>>>> 2. Paul E. McKenney helped me update the commit message, thanks.
>>>>>
>>>>
>>>> Hi, Zhen Lei
>>>>
>>>> Maybe the following scenarios should be considered:
>>>>
>>>>                 CPU 0
>>>> tasks context
>>>>    spin_lock(&vmap_area_lock)
>>>>           Interrupt 
>>>> 	 RCU softirq
>>>> 	      rcu_do_batch
>>>> 		mem_dump_obj
>>>>                                   vmalloc_dump_obj
>>>>                                        spin_lock(&vmap_area_lock)   <--  deadlock     
>>>
>>>> Right, thanks. I just saw the robot's report. So this patch should be dropped.
> >>>> I'll try to add a helper in mm, where I can check whether the lock has been held, and dump the contents of the memory object.
>>>
>>> This is a workaround, or maybe try a modification like the following, 
>>> of course, need to ask Paul's opinion.
>>
>> Another approach is to schedule a workqueue handler to do the
>> mem_dump_obj().  This would allow mem_dump_obj() to run in a clean
>> environment.
>>
>> This would allow vmalloc_dump_obj() to be called unconditionally.
>>
>> Other thoughts?
>>
> <snip>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ca71de7c9d77..956eb28f9c77 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2591,6 +2591,19 @@ struct vm_struct *find_vm_area(const void *addr)
>  	return va->vm;
>  }
>  
> +static struct vm_struct *
> +find_vm_area_trylock(const void *addr)
> +{
> +	struct vmap_area *va = NULL;
> +
> +	if (spin_trylock(&vmap_area_lock)) {
> +		va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
> +		spin_unlock(&vmap_area_lock);
> +	}
> +
> +	return va ? va->vm : NULL;
> +}

This method also has the problems mentioned by Andrew Morton.
https://lkml.org/lkml/2022/11/14/1238

> +
>  /**
>   * remove_vm_area - find and remove a continuous kernel virtual area
>   * @addr:	    base address
> @@ -4048,7 +4061,7 @@ bool vmalloc_dump_obj(void *object)
>  	struct vm_struct *vm;
>  	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
>  
> -	vm = find_vm_area(objp);
> +	vm = find_vm_area_trylock(objp);
>  	if (!vm)
>  		return false;
>  	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
> <snip>
> 
> There is one issue though: it is not correct to reference vm->members
> after the lock is dropped, because of use-after-free bugs. It is better
> to embed the search, read out nr_pages, addr, and caller, and then
> drop the lock.
> 
> --
> Uladzislau Rezki
> .
>
diff mbox series

Patch

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 65704cbc9df7b3d..32ab45fabf8eebf 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -10,6 +10,7 @@ 
 #ifndef __LINUX_RCU_H
 #define __LINUX_RCU_H
 
+#include <linux/mm.h>
 #include <trace/events/rcu.h>
 
 /*
@@ -211,6 +212,12 @@  static inline void debug_rcu_head_unqueue(struct rcu_head *head)
 }
 #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
+{
+	if (unlikely(!rhp->func))
+		mem_dump_obj(rhp);
+}
+
 extern int rcu_cpu_stall_suppress_at_boot;
 
 static inline bool rcu_stall_is_suppressed_at_boot(void)
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 33adafdad261389..5e7f336baa06ae0 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -138,6 +138,7 @@  void srcu_drive_gp(struct work_struct *wp)
 	while (lh) {
 		rhp = lh;
 		lh = lh->next;
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ca4b5dcec675bac..294972e66b31863 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1631,6 +1631,7 @@  static void srcu_invoke_callbacks(struct work_struct *work)
 	rhp = rcu_cblist_dequeue(&ready_cbs);
 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
 		debug_rcu_head_unqueue(rhp);
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index b0b885e071fa8dc..b7f8c67c586cdc4 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -478,6 +478,7 @@  static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 	len = rcl.len;
 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index bb8f7d270f01747..56e9a5d91d97ec5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -97,6 +97,7 @@  static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 
 	trace_rcu_invoke_callback("", head);
 	f = head->func;
+	debug_rcu_head_callback(head);
 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
 	f(head);
 	rcu_lock_release(&rcu_callback_map);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 15aaff3203bf2d0..ed93ddb8203d42c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2088,6 +2088,7 @@  static void rcu_do_batch(struct rcu_data *rdp)
 		trace_rcu_invoke_callback(rcu_state.name, rhp);
 
 		f = rhp->func;
+		debug_rcu_head_callback(rhp);
 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
 		f(rhp);