
[1/1] mm/vmalloc: Add preempt point in purge_vmap_node() when enabling kasan

Message ID 20240705130808.1581-1-ahuang12@lenovo.com (mailing list archive)
State New

Commit Message

Adrian Huang July 5, 2024, 1:08 p.m. UTC
From: Adrian Huang <ahuang12@lenovo.com>

When compiling the kernel source with 'make -j $(nproc)' on a running
KASAN-enabled kernel on a 256-core machine, the following soft lockup
is shown:

watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
Workqueue: events drain_vmap_area_work
RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75
RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202
RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949
RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50
RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800
R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39
R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003
FS:  0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0
Call Trace:
 <IRQ>
 ? watchdog_timer_fn+0x2cd/0x390
 ? __pfx_watchdog_timer_fn+0x10/0x10
 ? __hrtimer_run_queues+0x300/0x6d0
 ? sched_clock_cpu+0x69/0x4e0
 ? __pfx___hrtimer_run_queues+0x10/0x10
 ? srso_return_thunk+0x5/0x5f
 ? ktime_get_update_offsets_now+0x7f/0x2a0
 ? srso_return_thunk+0x5/0x5f
 ? srso_return_thunk+0x5/0x5f
 ? hrtimer_interrupt+0x2ca/0x760
 ? __sysvec_apic_timer_interrupt+0x8c/0x2b0
 ? sysvec_apic_timer_interrupt+0x6a/0x90
 </IRQ>
 <TASK>
 ? asm_sysvec_apic_timer_interrupt+0x16/0x20
 ? smp_call_function_many_cond+0x1d8/0xbb0
 ? __pfx_do_kernel_range_flush+0x10/0x10
 on_each_cpu_cond_mask+0x20/0x40
 flush_tlb_kernel_range+0x19b/0x250
 ? srso_return_thunk+0x5/0x5f
 ? kasan_release_vmalloc+0xa7/0xc0
 purge_vmap_node+0x357/0x820
 ? __pfx_purge_vmap_node+0x10/0x10
 __purge_vmap_area_lazy+0x5b8/0xa10
 drain_vmap_area_work+0x21/0x30
 process_one_work+0x661/0x10b0
 worker_thread+0x844/0x10e0
 ? srso_return_thunk+0x5/0x5f
 ? __kthread_parkme+0x82/0x140
 ? __pfx_worker_thread+0x10/0x10
 kthread+0x2a5/0x370
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x30/0x70
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1a/0x30
 </TASK>

Debugging Analysis:
 1. [Crash] The call trace indicates CPU #28 is waiting for other CPUs'
    responses by sending an IPI message in order to flush tlb.
    However, crash indicates the target CPU has responded.

     A. CPU #28 is waiting for CPU #2's response.

	crash> cfd_data | grep -w 28
	  [28]: ffff8883bc4469c0
	crash> struct call_function_data ffff8883bc4469c0
	struct call_function_data {
	  csd = 0x4ca40,
	  cpumask = 0xffff888103585e40,
	  cpumask_ipi = 0xffff8881035858c0
	}

        crash> struct __call_single_data 0x4ca40:a | grep ffff8883bb74ca40
        [2]: ffff8883bb74ca40

     B. CPU #2 has responded because the bit 'CSD_FLAG_LOCK' of u_flags
        is cleared.

        crash> struct __call_single_data 0xffff8883bb74ca40
        struct __call_single_data {
          node = {
            ...
            {
              u_flags = 0,
              a_flags = {
                counter = 0
              }
            },
	    ...
          },
          func = 0xffffffff8117b080 <do_kernel_range_flush>,
          info = 0xffff8883bc444940
        }

     C. CPU #2 is running userspace application 'nm'.

        crash> bt -c 2
	PID: 52035  TASK: ffff888194c21ac0  CPU: 2   COMMAND: "nm"
 	#0 [ffffc90043157ea8] crash_nmi_callback at ffffffff81122f42
 	#1 [ffffc90043157eb0] nmi_handle at ffffffff8108c988
	#2 [ffffc90043157f10] default_do_nmi at ffffffff835b3460
 	#3 [ffffc90043157f30] exc_nmi at ffffffff835b3630
 	#4 [ffffc90043157f50] asm_exc_nmi at ffffffff83601573
        RIP: 00007f6261b90d38 RSP: 00007ffe4d1cb180 RFLAGS: 00000202
        RAX: 0000000000000001 RBX: 6e6f2d7865646e69 RCX: 00007f626281f634
        RDX: 00007f6262ba7f67 RSI: 00007f626265f65e RDI: 00007f62648f8a30
        RBP: 2d6f746c6e696874  R8: 00007f62618a4cc0  R9: 0000000000000001
        R10: 00007f627233e488 R11: 00007f627233d768 R12: 000055bee0ff4b90
        R13: 000055bee0fac650 R14: 00007ffe4d1cb280 R15: 0000000000000000
        ORIG_RAX: ffffffffffffffff  CS: 0033  SS: 002b

     D. The soft lockup CPU iteratively traverses 128 vmap_nodes (128 bits
	are set) and flushes tlb. This might be the time-consuming work.

	crash> p /x purge_nodes
	$1 = {
	  bits = {0xffffffffffffffff, 0xffffffffffffffff, 0x0, ...}

 2. [Ftrace] To prove that the soft-lockup CPU spends too much time
    iterating vmap_nodes and flushing the TLB when purging vm_area
    structures, the following ftrace log confirms the speculation
    (some info is trimmed).

     kworker: funcgraph_entry:		    |  drain_vmap_area_work() {
     kworker: funcgraph_entry:              |   mutex_lock() {
     kworker: funcgraph_entry:  1.092 us    |     __cond_resched();
     kworker: funcgraph_exit:   3.306 us    |   }
     ...				        ...
     kworker: funcgraph_entry: 		    |    flush_tlb_kernel_range() {
     ...				 	  ...
     kworker: funcgraph_exit: # 7533.649 us |    }
     ...                                         ...
     kworker: funcgraph_entry:  2.344 us    |   mutex_unlock();
     kworker: funcgraph_exit: $ 23871554 us | }

     The drain_vmap_area_work() spends over 23 seconds.

     There are 2805 flush_tlb_kernel_range() calls in this ftrace log.
       * One is called in __purge_vmap_area_lazy().
       * Others are called in kasan_release_vmalloc().

 3. Extending the soft-lockup detection threshold works around the issue
    (for example, # echo 60 > /proc/sys/kernel/watchdog_thresh). This
    confirms the above-mentioned speculation: drain_vmap_area_work()
    spends too much time.

Commit 72210662c5a2 ("mm: vmalloc: offload free_vmap_area_lock lock")
employs an effective vmap node logic. However, the current design iterates
over all 128 vmap_nodes and flushes the TLB from a single CPU if
vmap_lazy_nr < 2 * lazy_max_pages(). With KASAN enabled, this might
trigger the soft lockup because the additional TLB flushes for the KASAN
vmalloc shadow take much more time when all 128 vmap nodes have a
non-empty purge list.
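
For context, each iteration of the purge loop in purge_vmap_node() releases
the KASAN shadow mapping of one vmap_area, and kasan_release_vmalloc()
finishes with a flush_tlb_kernel_range() on that shadow range. A simplified
sketch of the loop, based on the hunks quoted later in this thread:

	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/* Depopulates the shadow pages and flushes the shadow TLB range. */
		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);
		...
	}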

Fix the issue by adding a preemption point in purge_vmap_node() when
KASAN is enabled.

Fixes: 72210662c5a2 ("mm: vmalloc: offload free_vmap_area_lock lock")
Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
Reviewed-and-tested-by: Jiwei Sun <sunjw10@lenovo.com>
---
 mm/vmalloc.c | 9 +++++++++
 1 file changed, 9 insertions(+)
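
For reference, the patch body (quoted in full in the first reply below) adds,
in essence, the following at the top of purge_vmap_node() in mm/vmalloc.c:

	/*
	 * Add a preemption point when KASAN is enabled. Each vmap_area of
	 * the vmap nodes has to flush the TLB when releasing vmalloc, which
	 * might be time-consuming work if lots of vmap nodes have a
	 * non-empty purge list.
	 */
	if (kasan_enabled())
		cond_resched();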

Comments

Uladzislau Rezki July 5, 2024, 3:36 p.m. UTC | #1
On Fri, Jul 05, 2024 at 09:08:08PM +0800, Adrian Huang wrote:
> From: Adrian Huang <ahuang12@lenovo.com>
> 
> When compiling kernel source 'make -j $(nproc)' with the up-and-running
> KASAN-enabled kernel on a 256-core machine, the following soft lockup
> is shown:
> 
> watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
> CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
> Workqueue: events drain_vmap_area_work
> RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
> Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75
> RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202
> RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949
> RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50
> RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800
> R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39
> R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003
> FS:  0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0
> Call Trace:
>  <IRQ>
>  ? watchdog_timer_fn+0x2cd/0x390
>  ? __pfx_watchdog_timer_fn+0x10/0x10
>  ? __hrtimer_run_queues+0x300/0x6d0
>  ? sched_clock_cpu+0x69/0x4e0
>  ? __pfx___hrtimer_run_queues+0x10/0x10
>  ? srso_return_thunk+0x5/0x5f
>  ? ktime_get_update_offsets_now+0x7f/0x2a0
>  ? srso_return_thunk+0x5/0x5f
>  ? srso_return_thunk+0x5/0x5f
>  ? hrtimer_interrupt+0x2ca/0x760
>  ? __sysvec_apic_timer_interrupt+0x8c/0x2b0
>  ? sysvec_apic_timer_interrupt+0x6a/0x90
>  </IRQ>
>  <TASK>
>  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
>  ? smp_call_function_many_cond+0x1d8/0xbb0
>  ? __pfx_do_kernel_range_flush+0x10/0x10
>  on_each_cpu_cond_mask+0x20/0x40
>  flush_tlb_kernel_range+0x19b/0x250
>  ? srso_return_thunk+0x5/0x5f
>  ? kasan_release_vmalloc+0xa7/0xc0
>  purge_vmap_node+0x357/0x820
>  ? __pfx_purge_vmap_node+0x10/0x10
>  __purge_vmap_area_lazy+0x5b8/0xa10
>  drain_vmap_area_work+0x21/0x30
>  process_one_work+0x661/0x10b0
>  worker_thread+0x844/0x10e0
>  ? srso_return_thunk+0x5/0x5f
>  ? __kthread_parkme+0x82/0x140
>  ? __pfx_worker_thread+0x10/0x10
>  kthread+0x2a5/0x370
>  ? __pfx_kthread+0x10/0x10
>  ret_from_fork+0x30/0x70
>  ? __pfx_kthread+0x10/0x10
>  ret_from_fork_asm+0x1a/0x30
>  </TASK>
> 
> Debugging Analysis:
>  1. [Crash] The call trace indicates CPU #28 is waiting for other CPUs'
>     responses by sending an IPI message in order to flush tlb.
>     However, crash indicates the target CPU has responded.
> 
>      A. CPU #28 is waiting for CPU #2' response.
> 
> 	crash> cfd_data | grep -w 28
> 	  [28]: ffff8883bc4469c0
> 	crash> struct call_function_data ffff8883bc4469c0
> 	struct call_function_data {
> 	  csd = 0x4ca40,
> 	  cpumask = 0xffff888103585e40,
> 	  cpumask_ipi = 0xffff8881035858c0
> 	}
> 
>         crash> struct __call_single_data 0x4ca40:a | grep ffff8883bb74ca40
>         [2]: ffff8883bb74ca40
> 
>      B. CPU #2 has responded because the bit 'CSD_FLAG_LOCK' of u_flags
>         is cleared.
> 
>         crash> struct __call_single_data 0xffff8883bb74ca40
>         struct __call_single_data {
>           node = {
>             ...
>             {
>               u_flags = 0,
>               a_flags = {
>                 counter = 0
>               }
>             },
> 	    ...
>           },
>           func = 0xffffffff8117b080 <do_kernel_range_flush>,
>           info = 0xffff8883bc444940
>         }
> 
>      C. CPU #2 is running userspace application 'nm'.
> 
>         crash> bt -c 2
> 	PID: 52035  TASK: ffff888194c21ac0  CPU: 2   COMMAND: "nm"
>  	#0 [ffffc90043157ea8] crash_nmi_callback at ffffffff81122f42
>  	#1 [ffffc90043157eb0] nmi_handle at ffffffff8108c988
> 	#2 [ffffc90043157f10] default_do_nmi at ffffffff835b3460
>  	#3 [ffffc90043157f30] exc_nmi at ffffffff835b3630
>  	#4 [ffffc90043157f50] asm_exc_nmi at ffffffff83601573
>         RIP: 00007f6261b90d38 RSP: 00007ffe4d1cb180 RFLAGS: 00000202
>         RAX: 0000000000000001 RBX: 6e6f2d7865646e69 RCX: 00007f626281f634
>         RDX: 00007f6262ba7f67 RSI: 00007f626265f65e RDI: 00007f62648f8a30
>         RBP: 2d6f746c6e696874  R8: 00007f62618a4cc0  R9: 0000000000000001
>         R10: 00007f627233e488 R11: 00007f627233d768 R12: 000055bee0ff4b90
>         R13: 000055bee0fac650 R14: 00007ffe4d1cb280 R15: 0000000000000000
>         ORIG_RAX: ffffffffffffffff  CS: 0033  SS: 002b
> 
>      D. The soft lockup CPU iteratively traverses 128 vmap_nodes (128 bits
> 	are set) and flushes tlb. This might be the time-consuming work.
> 
> 	crash> p /x purge_nodes
> 	$1 = {
> 	  bits = {0xffffffffffffffff, 0xffffffffffffffff, 0x0, ...}
> 
>  2. [Ftrace] In order to prove that the soft lockup CPU spends too much
>     time iterating vmap_nodes and flushing tlb when purging vm_area
>     structures, the ftrace confirms the speculation (Some info is trimmed).
> 
>      kworker: funcgraph_entry:		    |  drain_vmap_area_work() {
>      kworker: funcgraph_entry:              |   mutex_lock() {
>      kworker: funcgraph_entry:  1.092 us    |     __cond_resched();
>      kworker: funcgraph_exit:   3.306 us    |   }
>      ...				        ...
>      kworker: funcgraph_entry: 		    |    flush_tlb_kernel_range() {
>      ...				 	  ...
>      kworker: funcgraph_exit: # 7533.649 us |    }
>      ...                                         ...
>      kworker: funcgraph_entry:  2.344 us    |   mutex_unlock();
>      kworker: funcgraph_exit: $ 23871554 us | }
> 
>      The drain_vmap_area_work() spends over 23 seconds.
> 
>      There are 2805 flush_tlb_kernel_range() calls in this ftrace log.
>        * One is called in __purge_vmap_area_lazy().
>        * Others are called in kasan_release_vmalloc().
> 
>  3. Extending the soft lockup time can work around the issue (For example,
>     # echo 60 > /proc/sys/kernel/watchdog_thresh). This confirms the
>     above-mentioned speculation: drain_vmap_area_work() spends too much
>     time.
> 
> Commit 72210662c5a2 ("mm: vmalloc: offload free_vmap_area_lock lock")
> employs an effective vmap node logic. However, current design iterates
> 128 vmap_nodes and flushes tlb by a single CPU if
> vmap_lazy_nr < 2 * lazy_max_pages(). When enabling kasan, this might
> trigger the soft lockup because additional tlb flushes of kasan vmalloc
> spend much more time if 128 vmap nodes have the available purge list.
> 
> Fix the issue by adding preempt point in purge_vmap_node() when
> enabling kasan.
> 
> Fixes: 72210662c5a2 ("mm: vmalloc: offload free_vmap_area_lock lock")
> Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
> Reviewed-and-tested-by: Jiwei Sun <sunjw10@lenovo.com>
> ---
>  mm/vmalloc.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d0cbdd7c1e5b..380bdc317c8d 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2193,6 +2193,15 @@ static void purge_vmap_node(struct work_struct *work)
>  	struct vmap_area *va, *n_va;
>  	LIST_HEAD(local_list);
>  
> +	/*
> +	 * Add the preemption point when enabling KASAN. Each vmap_area of
> +	 * vmap nodes has to flush tlb when releasing vmalloc, which might
> +	 * be the time-consuming work if lots of vamp nodes have the
> +	 * available purge list.
> +	 */
> +	if (kasan_enabled())
> +		cond_resched();
> +
>  	vn->nr_purged = 0;
>  
>  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
> -- 
> 2.34.1
> 
Thank you for highlighting this. We had a preemption point in the purging
process, but it was removed by the following commit:

<snip>
commit 282631cb2447318e2a55b41a665dbe8571c46d70
Author: Uladzislau Rezki (Sony) <urezki@gmail.com>
Date:   Tue Jan 2 19:46:28 2024 +0100

    mm: vmalloc: remove global purge_vmap_area_root rb-tree
<snip>

and it looks like that decision was wrong. We should restore it.
Could you please test this:

<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 03b82fb8ecd3..6dc204b8495a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2190,10 +2190,12 @@ static void purge_vmap_node(struct work_struct *work)
 {
 	struct vmap_node *vn = container_of(work,
 		struct vmap_node, purge_work);
+	unsigned long resched_threshold;
 	struct vmap_area *va, *n_va;
 	LIST_HEAD(local_list);
 
 	vn->nr_purged = 0;
+	resched_threshold = lazy_max_pages() << 1;
 
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
@@ -2210,6 +2212,9 @@ static void purge_vmap_node(struct work_struct *work)
 		atomic_long_sub(nr, &vmap_lazy_nr);
 		vn->nr_purged++;
 
+		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
+			cond_resched();
+
 		if (is_vn_id_valid(vn_id) && !vn->skip_populate)
 			if (node_pool_add_va(vn, va))
 				continue;
<snip>

Thank you in advance!

--
Uladzislau Rezki
Adrian Huang12 July 8, 2024, 1:39 p.m. UTC | #2
Hi,

> Could you please test it:
> 
> <snip>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 03b82fb8ecd3..6dc204b8495a
> 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2190,10 +2190,12 @@ static void purge_vmap_node(struct work_struct
> *work)  {
>  	struct vmap_node *vn = container_of(work,
>  		struct vmap_node, purge_work);
> +	unsigned long resched_threshold;
>  	struct vmap_area *va, *n_va;
>  	LIST_HEAD(local_list);
> 
>  	vn->nr_purged = 0;
> +	resched_threshold = lazy_max_pages() << 1;
> 
>  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
>  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; @@
> -2210,6 +2212,9 @@ static void purge_vmap_node(struct work_struct *work)
>  		atomic_long_sub(nr, &vmap_lazy_nr);
>  		vn->nr_purged++;
> 
> +		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
> +			cond_resched();
> +
>  		if (is_vn_id_valid(vn_id) && !vn->skip_populate)
>  			if (node_pool_add_va(vn, va))
>  				continue;
> <snip>

This patch can fix the issue. Feel free to add my tested-by.
Tested-by: Adrian Huang <ahuang12@lenovo.com>

-- Adrian
Uladzislau Rezki July 8, 2024, 4:06 p.m. UTC | #3
On Mon, Jul 08, 2024 at 01:39:57PM +0000, Adrian Huang12 wrote:
> Hi,
> 
> > Could you please test it:
> > 
> > <snip>
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 03b82fb8ecd3..6dc204b8495a
> > 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2190,10 +2190,12 @@ static void purge_vmap_node(struct work_struct
> > *work)  {
> >  	struct vmap_node *vn = container_of(work,
> >  		struct vmap_node, purge_work);
> > +	unsigned long resched_threshold;
> >  	struct vmap_area *va, *n_va;
> >  	LIST_HEAD(local_list);
> > 
> >  	vn->nr_purged = 0;
> > +	resched_threshold = lazy_max_pages() << 1;
> > 
> >  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
> >  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; @@
> > -2210,6 +2212,9 @@ static void purge_vmap_node(struct work_struct *work)
> >  		atomic_long_sub(nr, &vmap_lazy_nr);
> >  		vn->nr_purged++;
> > 
> > +		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
> > +			cond_resched();
> > +
> >  		if (is_vn_id_valid(vn_id) && !vn->skip_populate)
> >  			if (node_pool_add_va(vn, va))
> >  				continue;
> > <snip>
> 
> This patch can fix the issue. Feel free to add my tested-by.
> Tested-by: Adrian Huang <ahuang12@lenovo.com>
> 
Thank you. I will add your Tested-by!

--
Uladzislau Rezki
Uladzislau Rezki July 19, 2024, 11:40 a.m. UTC | #4
Hello, Peter!

Could you please help here? See below:

> On Mon, Jul 08, 2024 at 01:39:57PM +0000, Adrian Huang12 wrote:
> > Hi,
> > 
> > > Could you please test it:
> > > 
> > > <snip>
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 03b82fb8ecd3..6dc204b8495a
> > > 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -2190,10 +2190,12 @@ static void purge_vmap_node(struct work_struct
> > > *work)  {
> > >  	struct vmap_node *vn = container_of(work,
> > >  		struct vmap_node, purge_work);
> > > +	unsigned long resched_threshold;
> > >  	struct vmap_area *va, *n_va;
> > >  	LIST_HEAD(local_list);
> > > 
> > >  	vn->nr_purged = 0;
> > > +	resched_threshold = lazy_max_pages() << 1;
> > > 
> > >  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
> > >  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; @@
> > > -2210,6 +2212,9 @@ static void purge_vmap_node(struct work_struct *work)
> > >  		atomic_long_sub(nr, &vmap_lazy_nr);
> > >  		vn->nr_purged++;
> > > 
> > > +		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
> > > +			cond_resched();
> > > +
> > >  		if (is_vn_id_valid(vn_id) && !vn->skip_populate)
> > >  			if (node_pool_add_va(vn, va))
> > >  				continue;
> > > <snip>
> > 
> > This patch can fix the issue. Feel free to add my tested-by.
> > Tested-by: Adrian Huang <ahuang12@lenovo.com>
> > 
> Thank you. I will add you tested-by!
> 
I tried to simulate the reported splat and I can reproduce it with KASAN
enabled. I use qemu on my 64-core system, which allows me to specify 255
cores when running a VM. The kernel is 6.10.0-rc5.

The kernel should be built with CONFIG_KASAN=y and CONFIG_KASAN_VMALLOC=y

The "soft lockup" can be triggered when the kernel is compiled within a
VM using 256 jobs and preemption is disabled:

echo none > /sys/kernel/debug/sched/preempt
make -C coding/linux.git/ -j256 bzImage

<snip>
watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
Workqueue: events drain_vmap_area_work
RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
...
<snip>

(See the full splat at the beginning of this email thread.)

After some debugging, I figured out that a CSD lock is taken and not released
for too long. To debug this, the kernel should be compiled with
CONFIG_CSD_LOCK_WAIT_DEBUG. I set /sys/module/smp/parameters/csd_lock_timeout
to 1000, i.e. 1 second.
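
For reference, this was set at run time as shown below (the value is in
milliseconds; the parameter is provided by CONFIG_CSD_LOCK_WAIT_DEBUG):

echo 1000 > /sys/module/smp/parameters/csd_lock_timeout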

See below the CSD-lock debug info captured while running the compile test:

<snip>
[  163.697057] smp: csd: Detected non-responsive CSD lock (#1) on CPU#206, waiting 1000000292 ns for CPU#17 do_kernel_range_flush+0x0/0xb0(0xffff888dfd943d80).
[  163.697165] smp:     csd: CSD lock (#1) unresponsive.
[  163.697198] Sending NMI from CPU 206 to CPUs 17:
[  163.697214] NMI backtrace for cpu 17
[  163.697223] CPU: 17 PID: 36681 Comm: as Kdump: loaded Tainted: G             L     6.10.0-rc5-00019-g4236f0255ea8-dirty #3439
[  163.697228] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[  163.697231] RIP: 0010:native_queued_spin_lock_slowpath+0x2b9/0xae0
[  163.697241] Code: 40 ba 01 00 00 00 ..............<cut>...............
[  163.697244] RSP: 0018:ffffc900396afb98 EFLAGS: 00000002
[  163.697248] RAX: 0000000000000001 RBX: ffffffffb6e0a084 RCX: ffffffffb5eff77c
[  163.697250] RDX: fffffbfff6dc1411 RSI: 0000000000000004 RDI: ffffffffb6e0a084
[  163.697252] RBP: 0000000000000001 R08: 0000000000000000 R09: fffffbfff6dc1410
[  163.697254] R10: ffffffffb6e0a087 R11: ffffffffb80f8112 R12: 1ffff920072d5f75
[  163.697255] R13: 0000000000000007 R14: fffffbfff6dc1410 R15: ffffc900396afbd8
[  163.697260] FS:  0000000000000000(0000) GS:ffff888df7a80000(0000) knlGS:0000000000000000
[  163.697263] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  163.697266] CR2: 00007f354595c3e8 CR3: 0000000088a44000 CR4: 00000000000006f0
[  163.697268] Call Trace:
[  163.697270]  <NMI>
[  163.697272]  ? nmi_cpu_backtrace+0xd1/0x190
[  163.697277]  ? nmi_cpu_backtrace_handler+0x11/0x20
[  163.697282]  ? nmi_handle+0xb7/0x2a0
[  163.697288]  ? default_do_nmi+0x45/0x110
[  163.697292]  ? exc_nmi+0x104/0x190
[  163.697295]  ? end_repeat_nmi+0xf/0x53
[  163.697299]  ? native_queued_spin_lock_slowpath+0xbc/0xae0
[  163.697303]  ? native_queued_spin_lock_slowpath+0x2b9/0xae0
[  163.697306]  ? native_queued_spin_lock_slowpath+0x2b9/0xae0
[  163.697308]  ? native_queued_spin_lock_slowpath+0x2b9/0xae0
[  163.697311]  </NMI>
[  163.697312]  <TASK>
[  163.697313]  ? __pfx_native_queued_spin_lock_slowpath+0x10/0x10
[  163.697317]  queued_write_lock_slowpath+0x3c6/0x440
[  163.697321]  ? __pfx_queued_write_lock_slowpath+0x10/0x10
[  163.697324]  ? task_rq_lock+0xd0/0x390
[  163.697327]  ? perf_lock_task_context+0x106/0x310
[  163.697333]  _raw_write_lock_irq+0xcf/0xe0
[  163.697335]  ? __pfx__raw_write_lock_irq+0x10/0x10
[  163.697339]  exit_notify+0x86/0x780
[  163.697342]  ? __pfx_exit_notify+0x10/0x10
[  163.697345]  ? exit_tasks_rcu_start+0x173/0x230
[  163.697350]  do_exit+0x707/0xcb0
[  163.697352]  ? __count_memcg_events+0xe1/0x340
[  163.697357]  ? __pfx_do_exit+0x10/0x10
[  163.697359]  ? _raw_spin_lock_irq+0x86/0xe0
[  163.697361]  ? __pfx__raw_spin_lock_irq+0x10/0x10
[  163.697364]  ? handle_mm_fault+0x150/0x740
[  163.697368]  do_group_exit+0xac/0x230
[  163.697371]  __x64_sys_exit_group+0x3e/0x50
[  163.697374]  do_syscall_64+0x5f/0x170
[  163.697377]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  163.697381] RIP: 0033:0x7f35457aa349
[  163.697384] Code: Unable to access opcode bytes at 0x7f35457aa31f.
[  163.697385] RSP: 002b:00007ffce767e198 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
[  163.697388] RAX: ffffffffffffffda RBX: 00007f35458a49e0 RCX: 00007f35457aa349
[  163.697390] RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000
[  163.697392] RBP: 0000000000000000 R08: ffffffffffffff80 R09: 00007ffce767e0af
[  163.697394] R10: 00007ffce767e010 R11: 0000000000000246 R12: 00007f35458a49e0
[  163.697395] R13: 00007f35458aa2e0 R14: 0000000000000001 R15: 00007f35458aa2c8
[  163.697399]  </TASK>
<snip>

I see that CPU #17 does not respond because it is waiting in do_exit(),
trying to grab a write lock. See below:

<snip>
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	write_lock_irq(&tasklist_lock);
...
<snip>

As can be seen, it disables interrupts and tries to acquire the rw-lock. In
order to debug this further, I built the kernel with CONFIG_LOCK_STAT
to see the contention and waiting time on it:

echo 0 > /proc/lock_stat; echo 1 > /proc/sys/kernel/lock_stat; make -C coding/linux.git/ -j256 bzImage > /dev/null; echo 0 > /proc/sys/kernel/lock_stat;

<snip>
...
class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
...
tasklist_lock-W:         56291          56830           0.23     3352084.59  3191025005.50       56150.36          86555         110489           0.60       65916.78    50226089.66         454.58
tasklist_lock-R:         40925          40987           0.23     3450788.93  2149685495.54       52447.98          73248          88581           0.09       32061.59      237549.19           2.68
     ---------------
     tasklist_lock          21879          [<000000004337def3>] exit_notify+0x86/0x790
     tasklist_lock          40987          [<000000008c6daba0>] __do_wait+0xdc/0x700
     tasklist_lock          17632          [<00000000c86b4505>] release_task+0x106/0x470
     tasklist_lock          17319          [<000000004c9a1afc>] copy_process+0x2a1b/0x4b00
     ---------------
     tasklist_lock          73231          [<000000004337def3>] exit_notify+0x86/0x790
     tasklist_lock          15082          [<00000000c86b4505>] release_task+0x106/0x470
     tasklist_lock           4233          [<000000004c9a1afc>] copy_process+0x2a1b/0x4b00
     tasklist_lock           5271          [<000000008c6daba0>] __do_wait+0xdc/0x700
...
<snip>

The waiting time can be quite long on a kernel built with KASAN. If I
interpret it correctly, the time is in microseconds, so we have 3.3
seconds as the maximum waiting time on the tasklist_lock.

Since it disables interrupts on the current CPU, do_kernel_range_flush()
on that CPU is delayed, which in turn delays the kworker that does the TLB
flushing.

Could you please comment on it? Maybe you can give some good input about
the long wait on the tasklist_lock (it also disables IRQs) when KASAN is
enabled.

Is that something that is expected for a debug kernel?

Thank you in advance!

--
Uladzislau Rezki
Adrian Huang July 22, 2024, 11:50 a.m. UTC | #5
> I tried to simulate the reported splat and i can reproduce it with KASAN
> enabled. I use qemu on my 64-core system, it allows me to specify 255
> cores while running VM. The kernel is 6.10.0-rc5.
>
> The kernel should be built with CONFIG_KASAN=y and CONFIG_KASAN_VMALLOC=y
>
> The "soft lockup" can be triggered when the kernel is compiled within a
> VM using 256 jobs and preemption is disabled:
>
> echo none > /sys/kernel/debug/sched/preempt
> make -C coding/linux.git/ -j256 bzImage
>
> <snip>
> watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
> CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
> Workqueue: events drain_vmap_area_work
> RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
> ...
> <snip>

Great to hear you're able to reproduce the issue.

I kept debugging, and the original patch
(https://lore.kernel.org/all/ZogS_04dP5LlRlXN@pc636/T/) shows that
purge_vmap_node() iteratively releases KASAN vmalloc allocations and
flushes the TLB for each vmap_area. There are 2805
flush_tlb_kernel_range() calls in the ftrace log.
  * One is called in __purge_vmap_area_lazy().
  * The others are called in kasan_release_vmalloc(), which is called by
    purge_vmap_node().
    - [Rough calculation] Each flush_tlb_kernel_range() takes about 7.5ms.
      -- 2804 * 7.5ms = 21.03 seconds (that's why a soft lockup is triggered)

If we combine all TLB flush operations into one operation in the call path
'purge_vmap_node()->kasan_release_vmalloc()', the running time of
drain_vmap_area_work() can be reduced greatly. The idea comes from the
flush_tlb_kernel_range() call in __purge_vmap_area_lazy().
With this, the soft lockup is no longer triggered. Please refer to the
following patch. Here are the test results based on 6.10:

[6.10 wo/ the patch below]
  1. ftrace latency profiling (record a trace if the latency > 20s): Commands
     echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh
     echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function
     echo function_graph > /sys/kernel/debug/tracing/current_tracer
     echo 1 > /sys/kernel/debug/tracing/tracing_on

  2. Run `make -j $(nproc)` to compile the kernel source

  3. Once the soft lockup is reproduced, check the ftrace log:
     cat /sys/kernel/debug/tracing/trace
        # tracer: function_graph
        #
        # CPU  DURATION                  FUNCTION CALLS
        # |     |   |                     |   |   |   |
          76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
          76) $ 50412997 us |  } /* drain_vmap_area_work */
          76) $ 29165911 us |    } /* __purge_vmap_area_lazy */
          76) $ 29165926 us |  } /* drain_vmap_area_work */
          91) $ 53629423 us |    } /* __purge_vmap_area_lazy */
          91) $ 53629434 us |  } /* drain_vmap_area_work */
          91) $ 28121014 us |    } /* __purge_vmap_area_lazy */
          91) $ 28121026 us |  } /* drain_vmap_area_work */


[6.10 w/ the patch below]
  1. Repeat step 1-2 in "[6.10 wo/ the patch below]"

  2. The soft lockup is not triggered and the ftrace log is empty.
     cat /sys/kernel/debug/tracing/trace
     # tracer: function_graph
     #
     # CPU  DURATION                  FUNCTION CALLS
     # |     |   |                     |   |   |   |


  3. Setting 'tracing_thresh' to 10/5 seconds does not produce any ftrace log.

  4. Setting 'tracing_thresh' to 1 second produces an ftrace log.
      cat /sys/kernel/tracing/trace
      # tracer: function_graph
      #
      # CPU  DURATION                  FUNCTION CALLS
      # |     |   |                     |   |   |   |
        51) $ 1019695 us  |    } /* __purge_vmap_area_lazy */
        51) $ 1019703 us  |  } /* drain_vmap_area_work */
       198) $ 1018707 us  |    } /* __purge_vmap_area_lazy */
       198) $ 1018718 us  |  } /* drain_vmap_area_work */

  5. The following test_vmalloc command runs without any issues:
     modprobe test_vmalloc nr_threads=$(nproc) run_test_mask=0x1 nr_pages=4

Could you please test this patch in your VM environment? 

---
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 70d6a8f6e25d..ddbf42a1a7b7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
+#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
+#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
+
 #ifndef kasan_mem_to_shadow
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
@@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
 int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end);
+			   unsigned long free_region_end,
+			   unsigned long flags);
 
 #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 static inline void *kasan_unpoison_vmalloc(const void *start,
 					   unsigned long size,
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d6210ca48dda..88d1c9dcb507 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  */
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end)
+			   unsigned long free_region_end,
+			   unsigned long flags)
 {
 	void *shadow_start, *shadow_end;
 	unsigned long region_start, region_end;
@@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
 			return;
 		}
-		apply_to_existing_page_range(&init_mm,
+
+
+		if (flags & KASAN_VMALLOC_PAGE_RANGE)
+			apply_to_existing_page_range(&init_mm,
 					     (unsigned long)shadow_start,
 					     size, kasan_depopulate_vmalloc_pte,
 					     NULL);
-		flush_tlb_kernel_range((unsigned long)shadow_start,
-				       (unsigned long)shadow_end);
+
+		if (flags & KASAN_VMALLOC_TLB_FLUSH)
+			flush_tlb_kernel_range((unsigned long)shadow_start,
+					       (unsigned long)shadow_end);
 	}
 }
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e34ea860153f..d66e09135876 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2193,8 +2193,15 @@ static void purge_vmap_node(struct work_struct *work)
 	struct vmap_area *va, *n_va;
 	LIST_HEAD(local_list);
 
+	unsigned long start;
+	unsigned long end;
+
 	vn->nr_purged = 0;
 
+	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+
+	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 		unsigned long orig_start = va->va_start;
@@ -2205,7 +2212,8 @@ static void purge_vmap_node(struct work_struct *work)
 
 		if (is_vmalloc_or_module_addr((void *)orig_start))
 			kasan_release_vmalloc(orig_start, orig_end,
-					      va->va_start, va->va_end);
+					      va->va_start, va->va_end,
+					      KASAN_VMALLOC_PAGE_RANGE);
 
 		atomic_long_sub(nr, &vmap_lazy_nr);
 		vn->nr_purged++;
@@ -2218,6 +2226,8 @@ static void purge_vmap_node(struct work_struct *work)
 		list_add(&va->list, &local_list);
 	}
 
+	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+
 	reclaim_list_global(&local_list);
 }
 
@@ -4726,7 +4736,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 	}
 
@@ -4776,7 +4787,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 		kfree(vms[area]);
 	}
Uladzislau Rezki July 22, 2024, 12:40 p.m. UTC | #6
On Mon, Jul 22, 2024 at 07:50:54PM +0800, Adrian Huang wrote:
> > I tried to simulate the reported splat and i can reproduce it with KASAN
> > enabled. I use qemu on my 64-core system, it allows me to specify 255
> > cores while running VM. The kernel is 6.10.0-rc5.
> >
> > The kernel should be built with CONFIG_KASAN=y and CONFIG_KASAN_VMALLOC=y
> >
> > The "soft lockup" can be triggered when the kernel is compiled within a
> > VM using 256 jobs and preemption is disabled:
> >
> > echo none > /sys/kernel/debug/sched/preempt
> > make -C coding/linux.git/ -j256 bzImage
> >
> > <snip>
> > watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
> > CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
> > Workqueue: events drain_vmap_area_work
> > RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
> > ...
> > <snip>
> 
> Great to hear you're able to reproduce the issue.
> 
> I keep debugging, and the original patch (https://lore.kernel.org/all/ZogS_04dP5LlRlXN@pc636/T/) shows purge_vmap_node() iteratively releases kasan vmalloc
> allocations and flushes tlb for each vmap_area. There are 2805
> flush_tlb_kernel_range() calls in ftrace log.
>   * One is called in __purge_vmap_area_lazy().
>   * Others are called in kasan_release_vmalloc(): Called by purge_vmap_node().
>     - [Rough calculation] Each flush_tlb_kernel_range() runs about 7.5ms.
>       -- 2804 * 7.5ms = 21.03 seconds (That's why a soft lock is trigger)
> 
> If we combine all tlb flush operations into one operation in the call path
> 'purge_vmap_node()->kasan_release_vmalloc()', the running time of
> drain_vmap_area_work() can be saved greately. The idea is from the
> flush_tlb_kernel_range() call in __purge_vmap_area_lazy().
> And, the soft lockup won't not be triggered. Please refer to the following patch.
> Here is the test result based on 6.10:
> 
> [6.10 wo/ the patch below]
>   1. ftrace latency profiling (record a trace if the latency > 20s): Commands
>      echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh
>      echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function
>      echo function_graph > /sys/kernel/debug/tracing/current_tracer
>      echo 1 > /sys/kernel/debug/tracing/tracing_on
> 
>   2. Run `make -j $(nproc)` to compile the kernel source
> 
>   3. Once the soft lockup is reproduced, check the ftace:
>      cat /sys/kernel/debug/tracing/trace
>         # tracer: function_graph
>         #
>         # CPU  DURATION                  FUNCTION CALLS
>         # |     |   |                     |   |   |   |
>           76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
>           76) $ 50412997 us |  } /* drain_vmap_area_work */
>           76) $ 29165911 us |    } /* __purge_vmap_area_lazy */
>           76) $ 29165926 us |  } /* drain_vmap_area_work */
>           91) $ 53629423 us |    } /* __purge_vmap_area_lazy */
>           91) $ 53629434 us |  } /* drain_vmap_area_work */
>           91) $ 28121014 us |    } /* __purge_vmap_area_lazy */
>           91) $ 28121026 us |  } /* drain_vmap_area_work */
> 
> 
> [6.10 w/ the patch below]
>   1. Repeat step 1-2 in "[6.10 wo/ the patch below]"
> 
>   2. The soft lockup is not triggered and the ftrace log is empty.
>      cat /sys/kernel/debug/tracing/trace
>      # tracer: function_graph
>      #
>      # CPU  DURATION                  FUNCTION CALLS
>      # |     |   |                     |   |   |   |
> 
> 
>   3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace log.
> 
>   4. Setting 'tracing_thresh' to 1 second gets ftrace log.
>       cat /sys/kernel/tracing/trace
>       # tracer: function_graph
>       #
>       # CPU  DURATION                  FUNCTION CALLS
>       # |     |   |                     |   |   |   |
>         51) $ 1019695 us  |    } /* __purge_vmap_area_lazy */
>         51) $ 1019703 us  |  } /* drain_vmap_area_work */
>        198) $ 1018707 us  |    } /* __purge_vmap_area_lazy */
>        198) $ 1018718 us  |  } /* drain_vmap_area_work */
> 
>   5. Run the following test_vmalloc command without any issues
>      modprobe test_vmalloc nr_threads=$(nproc) run_test_mask=0x1 nr_pages=4
> 
> Could you please test this patch in your VM environment? 
> 
> ---
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 70d6a8f6e25d..ddbf42a1a7b7 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
>  int kasan_populate_early_shadow(const void *shadow_start,
>  				const void *shadow_end);
>  
> +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
> +#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
> +
>  #ifndef kasan_mem_to_shadow
>  static inline void *kasan_mem_to_shadow(const void *addr)
>  {
> @@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
>  int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
>  void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			   unsigned long free_region_start,
> -			   unsigned long free_region_end);
> +			   unsigned long free_region_end,
> +			   unsigned long flags);
>  
>  #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
>  
> @@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
>  static inline void kasan_release_vmalloc(unsigned long start,
>  					 unsigned long end,
>  					 unsigned long free_region_start,
> -					 unsigned long free_region_end) { }
> +					 unsigned long free_region_end,
> +					 unsigned long flags) { }
>  
>  #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
>  
> @@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
>  static inline void kasan_release_vmalloc(unsigned long start,
>  					 unsigned long end,
>  					 unsigned long free_region_start,
> -					 unsigned long free_region_end) { }
> +					 unsigned long free_region_end,
> +					 unsigned long flags) { }
>  
>  static inline void *kasan_unpoison_vmalloc(const void *start,
>  					   unsigned long size,
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index d6210ca48dda..88d1c9dcb507 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
>   */
>  void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			   unsigned long free_region_start,
> -			   unsigned long free_region_end)
> +			   unsigned long free_region_end,
> +			   unsigned long flags)
>  {
>  	void *shadow_start, *shadow_end;
>  	unsigned long region_start, region_end;
> @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
>  			return;
>  		}
> -		apply_to_existing_page_range(&init_mm,
> +
> +
> +		if (flags & KASAN_VMALLOC_PAGE_RANGE)
> +			apply_to_existing_page_range(&init_mm,
>  					     (unsigned long)shadow_start,
>  					     size, kasan_depopulate_vmalloc_pte,
>  					     NULL);
> -		flush_tlb_kernel_range((unsigned long)shadow_start,
> -				       (unsigned long)shadow_end);
> +
> +		if (flags & KASAN_VMALLOC_TLB_FLUSH)
> +			flush_tlb_kernel_range((unsigned long)shadow_start,
> +					       (unsigned long)shadow_end);
>  	}
>  }
>  
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index e34ea860153f..d66e09135876 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2193,8 +2193,15 @@ static void purge_vmap_node(struct work_struct *work)
>  	struct vmap_area *va, *n_va;
>  	LIST_HEAD(local_list);
>  
> +	unsigned long start;
> +	unsigned long end;
> +
>  	vn->nr_purged = 0;
>  
> +	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
> +
> +	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
> +
>  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
>  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
>  		unsigned long orig_start = va->va_start;
> @@ -2205,7 +2212,8 @@ static void purge_vmap_node(struct work_struct *work)
>  
>  		if (is_vmalloc_or_module_addr((void *)orig_start))
>  			kasan_release_vmalloc(orig_start, orig_end,
> -					      va->va_start, va->va_end);
> +					      va->va_start, va->va_end,
> +					      KASAN_VMALLOC_PAGE_RANGE);
>  
>  		atomic_long_sub(nr, &vmap_lazy_nr);
>  		vn->nr_purged++;
> @@ -2218,6 +2226,8 @@ static void purge_vmap_node(struct work_struct *work)
>  		list_add(&va->list, &local_list);
>  	}
>  
> +	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> +
>  	reclaim_list_global(&local_list);
>  }
>  
> @@ -4726,7 +4736,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  				&free_vmap_area_list);
>  		if (va)
>  			kasan_release_vmalloc(orig_start, orig_end,
> -				va->va_start, va->va_end);
> +				va->va_start, va->va_end,
> +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
>  		vas[area] = NULL;
>  	}
>  
> @@ -4776,7 +4787,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  				&free_vmap_area_list);
>  		if (va)
>  			kasan_release_vmalloc(orig_start, orig_end,
> -				va->va_start, va->va_end);
> +				va->va_start, va->va_end,
> +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
>  		vas[area] = NULL;
>  		kfree(vms[area]);
>  	}
>
Sure! I will have a look at the patch and check my environment.

--
Uladzislau Rezki
Uladzislau Rezki July 23, 2024, 10:50 a.m. UTC | #7
> If we combine all tlb flush operations into one operation in the call path
> 'purge_vmap_node()->kasan_release_vmalloc()', the running time of
> drain_vmap_area_work() can be saved greately. The idea is from the
> flush_tlb_kernel_range() call in __purge_vmap_area_lazy().
> And, the soft lockup won't not be triggered. Please refer to the following patch.
> Here is the test result based on 6.10:
> 
> [6.10 wo/ the patch below]
>   1. ftrace latency profiling (record a trace if the latency > 20s): Commands
>      echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh
>      echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function
>      echo function_graph > /sys/kernel/debug/tracing/current_tracer
>      echo 1 > /sys/kernel/debug/tracing/tracing_on
> 
>   2. Run `make -j $(nproc)` to compile the kernel source
> 
>   3. Once the soft lockup is reproduced, check the ftace:
>      cat /sys/kernel/debug/tracing/trace
>         # tracer: function_graph
>         #
>         # CPU  DURATION                  FUNCTION CALLS
>         # |     |   |                     |   |   |   |
>           76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
>           76) $ 50412997 us |  } /* drain_vmap_area_work */
>           76) $ 29165911 us |    } /* __purge_vmap_area_lazy */
>           76) $ 29165926 us |  } /* drain_vmap_area_work */
>           91) $ 53629423 us |    } /* __purge_vmap_area_lazy */
>           91) $ 53629434 us |  } /* drain_vmap_area_work */
>           91) $ 28121014 us |    } /* __purge_vmap_area_lazy */
>           91) $ 28121026 us |  } /* drain_vmap_area_work */
> 
> 
> [6.10 w/ the patch below]
>   1. Repeat step 1-2 in "[6.10 wo/ the patch below]"
> 
>   2. The soft lockup is not triggered and the ftrace log is empty.
>      cat /sys/kernel/debug/tracing/trace
>      # tracer: function_graph
>      #
>      # CPU  DURATION                  FUNCTION CALLS
>      # |     |   |                     |   |   |   |
> 
> 
>   3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace log.
> 
>   4. Setting 'tracing_thresh' to 1 second gets ftrace log.
>       cat /sys/kernel/tracing/trace
>       # tracer: function_graph
>       #
>       # CPU  DURATION                  FUNCTION CALLS
>       # |     |   |                     |   |   |   |
>         51) $ 1019695 us  |    } /* __purge_vmap_area_lazy */
>         51) $ 1019703 us  |  } /* drain_vmap_area_work */
>        198) $ 1018707 us  |    } /* __purge_vmap_area_lazy */
>        198) $ 1018718 us  |  } /* drain_vmap_area_work */
> 
>   5. Run the following test_vmalloc command without any issues
>      modprobe test_vmalloc nr_threads=$(nproc) run_test_mask=0x1 nr_pages=4
> 
> Could you please test this patch in your VM environment? 
> 
It works great and does not generate the soft-lockup splat :)
See some comments below:

> ---
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 70d6a8f6e25d..ddbf42a1a7b7 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
>  int kasan_populate_early_shadow(const void *shadow_start,
>  				const void *shadow_end);
>  
> +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
> +#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
> +
>  #ifndef kasan_mem_to_shadow
>  static inline void *kasan_mem_to_shadow(const void *addr)
>  {
> @@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
>  int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
>  void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			   unsigned long free_region_start,
> -			   unsigned long free_region_end);
> +			   unsigned long free_region_end,
> +			   unsigned long flags);
>  
>  #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
>  
> @@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
>  static inline void kasan_release_vmalloc(unsigned long start,
>  					 unsigned long end,
>  					 unsigned long free_region_start,
> -					 unsigned long free_region_end) { }
> +					 unsigned long free_region_end,
> +					 unsigned long flags) { }
>  
>  #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
>  
> @@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
>  static inline void kasan_release_vmalloc(unsigned long start,
>  					 unsigned long end,
>  					 unsigned long free_region_start,
> -					 unsigned long free_region_end) { }
> +					 unsigned long free_region_end,
> +					 unsigned long flags) { }
>  
>  static inline void *kasan_unpoison_vmalloc(const void *start,
>  					   unsigned long size,
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index d6210ca48dda..88d1c9dcb507 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
>   */
>  void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			   unsigned long free_region_start,
> -			   unsigned long free_region_end)
> +			   unsigned long free_region_end,
> +			   unsigned long flags)
>  {
>  	void *shadow_start, *shadow_end;
>  	unsigned long region_start, region_end;
> @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
>  			return;
>  		}
> -		apply_to_existing_page_range(&init_mm,
> +
> +
> +		if (flags & KASAN_VMALLOC_PAGE_RANGE)
> +			apply_to_existing_page_range(&init_mm,
>  					     (unsigned long)shadow_start,
>  					     size, kasan_depopulate_vmalloc_pte,
>  					     NULL);
> -		flush_tlb_kernel_range((unsigned long)shadow_start,
> -				       (unsigned long)shadow_end);
> +
> +		if (flags & KASAN_VMALLOC_TLB_FLUSH)
> +			flush_tlb_kernel_range((unsigned long)shadow_start,
> +					       (unsigned long)shadow_end);
>  	}
>  }
>  
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index e34ea860153f..d66e09135876 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2193,8 +2193,15 @@ static void purge_vmap_node(struct work_struct *work)
>  	struct vmap_area *va, *n_va;
>  	LIST_HEAD(local_list);
>  
> +	unsigned long start;
> +	unsigned long end;
> +
>  	vn->nr_purged = 0;
>  
> +	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
> +
> +	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
> +
>  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
>  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
>  		unsigned long orig_start = va->va_start;
> @@ -2205,7 +2212,8 @@ static void purge_vmap_node(struct work_struct *work)
>  
>  		if (is_vmalloc_or_module_addr((void *)orig_start))
>  			kasan_release_vmalloc(orig_start, orig_end,
> -					      va->va_start, va->va_end);
> +					      va->va_start, va->va_end,
> +					      KASAN_VMALLOC_PAGE_RANGE);
>  
>  		atomic_long_sub(nr, &vmap_lazy_nr);
>  		vn->nr_purged++;
> @@ -2218,6 +2226,8 @@ static void purge_vmap_node(struct work_struct *work)
>  		list_add(&va->list, &local_list);
>  	}
>  
> +	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> +
>
Do we need it here? We just did the TLB flush for the entire range in
__purge_vmap_area_lazy(). So it is invoked two times, which looks odd to me.

Am I missing something?

Thanks!

--
Uladzislau Rezki
Adrian Huang July 24, 2024, 12:46 p.m. UTC | #8
> It works great and does not generate the soft-lock-up splat :)
> See below some comments:

Great. Thanks for the confirmation.

<snip>

>> +     kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
>> +
>>
> Do we need it here? We just did the TLB flush for en entire range in the
> __purge_vmap_area_lazy(). So, it is two times invoked and looks odd to me.
>
> Am i missing something?

1. The TLB flush for the entire range in __purge_vmap_area_lazy() is for
the vmalloc virtual address range (VMALLOC_START->VMALLOC_END).

2. The TLB flush in purge_vmap_node() is for the KASAN shadow virtual
address range (the shadow offset 'CONFIG_KASAN_SHADOW_OFFSET' is defined
in .config); see the sketch below.
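
As a minimal sketch, the shadow address is derived from a vmalloc address
roughly as in the generic KASAN helper in include/linux/kasan.h
(KASAN_SHADOW_SCALE_SHIFT is 3 for generic KASAN, so every 8 bytes of
vmalloc space map to one shadow byte):

	static inline void *kasan_mem_to_shadow(const void *addr)
	{
		return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET;
	}

This is why the shadow range flushed via kasan_release_vmalloc() is distinct
from the VMALLOC_START->VMALLOC_END range flushed in __purge_vmap_area_lazy().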

BTW, I found that my first patch has a potential risk: we need to flush the
TLB of the KASAN shadow virtual address range first. Please see the following
patch for details (I put an explanatory comment in the patch). The following
patch also works well on my 256-core machine.

If you're OK with the patch, I'll submit it for upstream review. And may I
have your tag(s), Tested-by/Reviewed-by? (If possible, could you please test
the following patch?)

Thanks.

---
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 70d6a8f6e25d..ddbf42a1a7b7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
+#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
+#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
+
 #ifndef kasan_mem_to_shadow
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
@@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
 int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end);
+			   unsigned long free_region_end,
+			   unsigned long flags);
 
 #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 static inline void *kasan_unpoison_vmalloc(const void *start,
 					   unsigned long size,
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d6210ca48dda..88d1c9dcb507 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  */
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end)
+			   unsigned long free_region_end,
+			   unsigned long flags)
 {
 	void *shadow_start, *shadow_end;
 	unsigned long region_start, region_end;
@@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
 			return;
 		}
-		apply_to_existing_page_range(&init_mm,
+
+
+		if (flags & KASAN_VMALLOC_PAGE_RANGE)
+			apply_to_existing_page_range(&init_mm,
 					     (unsigned long)shadow_start,
 					     size, kasan_depopulate_vmalloc_pte,
 					     NULL);
-		flush_tlb_kernel_range((unsigned long)shadow_start,
-				       (unsigned long)shadow_end);
+
+		if (flags & KASAN_VMALLOC_TLB_FLUSH)
+			flush_tlb_kernel_range((unsigned long)shadow_start,
+					       (unsigned long)shadow_end);
 	}
 }
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e34ea860153f..12cdc92cdb83 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2193,8 +2193,22 @@ static void purge_vmap_node(struct work_struct *work)
 	struct vmap_area *va, *n_va;
 	LIST_HEAD(local_list);
 
+	unsigned long start;
+	unsigned long end;
+
 	vn->nr_purged = 0;
 
+	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+
+	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
+	/*
+	 * Since node_pool_add_va() returns vmap_area(s) to its pool, the
+	 * returned vmap_area(s) might be grabbed immediately via node_alloc()
+	 * by another core. We need to flush the TLB first.
+	 */
+	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 		unsigned long orig_start = va->va_start;
@@ -2205,7 +2219,8 @@ static void purge_vmap_node(struct work_struct *work)
 
 		if (is_vmalloc_or_module_addr((void *)orig_start))
 			kasan_release_vmalloc(orig_start, orig_end,
-					      va->va_start, va->va_end);
+					      va->va_start, va->va_end,
+					      KASAN_VMALLOC_PAGE_RANGE);
 
 		atomic_long_sub(nr, &vmap_lazy_nr);
 		vn->nr_purged++;
@@ -4726,7 +4741,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 	}
 
@@ -4776,7 +4792,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 		kfree(vms[area]);
 	}
Uladzislau Rezki July 24, 2024, 2:32 p.m. UTC | #9
On Wed, Jul 24, 2024 at 08:46:24PM +0800, Adrian Huang wrote:
> > It works great and does not generate the soft-lock-up splat :)
> > See below some comments:
> 
> Great. Thanks for the confirmation.
> 
> <snip>
> 
> >> +     kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> >> +
> >>
> > Do we need it here? We just did the TLB flush for en entire range in the
> > __purge_vmap_area_lazy(). So, it is two times invoked and looks odd to me.
> >
> > Am i missing something?
> 
> 1. The TLB flush for the entire range in __purge_vmap_area_lazy() is for
> the vmalloc virtual address (VMALLOC_START->VMALLOC_END).
> 
> 2. The TLB flush in purge_vmap_node() is for the KASAN shadow virtual address 
> (the shadow offset 'CONFIG_KASAN_SHADOW_OFFSET' is defined in .config).
> 
Correct. It deals with a shadow region!

>
> BTW, I found my first patch has the potential risk. We need to flush TLB of
> the KASAN shadow virtual address firstly. Please see the following patch for
> detail. (I put the comment in the following patch). The following patch
> also works well on my 256-core machine.
> 
I noticed that and it would be my second question :)

>
> If you're ok with the patch, I'll submit it for upstream review. And, may I
> have your tag(s): tested-by/reviewed-by? (If possible, could you please have
> a test for the following patch).
> 
I am OK. I will test and get back soon.

> Thanks.
> 
> ---
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 70d6a8f6e25d..ddbf42a1a7b7 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
>  int kasan_populate_early_shadow(const void *shadow_start,
>  				const void *shadow_end);
>  
> +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
> +#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
> +
>  #ifndef kasan_mem_to_shadow
>  static inline void *kasan_mem_to_shadow(const void *addr)
>  {
> @@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
>  int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
>  void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			   unsigned long free_region_start,
> -			   unsigned long free_region_end);
> +			   unsigned long free_region_end,
> +			   unsigned long flags);
>  
>  #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
>  
> @@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
>  static inline void kasan_release_vmalloc(unsigned long start,
>  					 unsigned long end,
>  					 unsigned long free_region_start,
> -					 unsigned long free_region_end) { }
> +					 unsigned long free_region_end,
> +					 unsigned long flags) { }
>  
>  #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
>  
> @@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
>  static inline void kasan_release_vmalloc(unsigned long start,
>  					 unsigned long end,
>  					 unsigned long free_region_start,
> -					 unsigned long free_region_end) { }
> +					 unsigned long free_region_end,
> +					 unsigned long flags) { }
>  
>  static inline void *kasan_unpoison_vmalloc(const void *start,
>  					   unsigned long size,
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index d6210ca48dda..88d1c9dcb507 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
>   */
>  void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			   unsigned long free_region_start,
> -			   unsigned long free_region_end)
> +			   unsigned long free_region_end,
> +			   unsigned long flags)
>  {
>  	void *shadow_start, *shadow_end;
>  	unsigned long region_start, region_end;
> @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
>  			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
>  			return;
>  		}
> -		apply_to_existing_page_range(&init_mm,
> +
> +
> +		if (flags & KASAN_VMALLOC_PAGE_RANGE)
> +			apply_to_existing_page_range(&init_mm,
>  					     (unsigned long)shadow_start,
>  					     size, kasan_depopulate_vmalloc_pte,
>  					     NULL);
> -		flush_tlb_kernel_range((unsigned long)shadow_start,
> -				       (unsigned long)shadow_end);
> +
> +		if (flags & KASAN_VMALLOC_TLB_FLUSH)
> +			flush_tlb_kernel_range((unsigned long)shadow_start,
> +					       (unsigned long)shadow_end);
>  	}
>  }
>  
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index e34ea860153f..12cdc92cdb83 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2193,8 +2193,22 @@ static void purge_vmap_node(struct work_struct *work)
>  	struct vmap_area *va, *n_va;
>  	LIST_HEAD(local_list);
>  
> +	unsigned long start;
> +	unsigned long end;
> +
>  	vn->nr_purged = 0;
>  
> +	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
> +
> +	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
> +
> +	/*
> +	 * Since node_pool_add_va() returns vmap_area(s) to its pool, the
> +	 * returned vmap_area(s) might be grabbed immediately via node_alloc()
> +	 * by other core. We need to flush TLB firstly.
> +	 */
> +	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> +
>  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
>  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
>  		unsigned long orig_start = va->va_start;
> @@ -2205,7 +2219,8 @@ static void purge_vmap_node(struct work_struct *work)
>  
>  		if (is_vmalloc_or_module_addr((void *)orig_start))
>  			kasan_release_vmalloc(orig_start, orig_end,
> -					      va->va_start, va->va_end);
> +					      va->va_start, va->va_end,
> +					      KASAN_VMALLOC_PAGE_RANGE);
>  
>  		atomic_long_sub(nr, &vmap_lazy_nr);
>  		vn->nr_purged++;
> @@ -4726,7 +4741,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  				&free_vmap_area_list);
>  		if (va)
>  			kasan_release_vmalloc(orig_start, orig_end,
> -				va->va_start, va->va_end);
> +				va->va_start, va->va_end,
> +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
>  		vas[area] = NULL;
>  	}
>  
> @@ -4776,7 +4792,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  				&free_vmap_area_list);
>  		if (va)
>  			kasan_release_vmalloc(orig_start, orig_end,
> -				va->va_start, va->va_end);
> +				va->va_start, va->va_end,
> +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
>  		vas[area] = NULL;
>  		kfree(vms[area]);
>  	}
Uladzislau Rezki July 24, 2024, 3:09 p.m. UTC | #10
On Wed, Jul 24, 2024 at 04:32:37PM +0200, Uladzislau Rezki wrote:
> On Wed, Jul 24, 2024 at 08:46:24PM +0800, Adrian Huang wrote:
> > > It works great and does not generate the soft-lock-up splat :)
> > > See below some comments:
> > 
> > Great. Thanks for the confirmation.
> > 
> > <snip>
> > 
> > >> +     kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> > >> +
> > >>
> > > Do we need it here? We just did the TLB flush for en entire range in the
> > > __purge_vmap_area_lazy(). So, it is two times invoked and looks odd to me.
> > >
> > > Am i missing something?
> > 
> > 1. The TLB flush for the entire range in __purge_vmap_area_lazy() is for
> > the vmalloc virtual address (VMALLOC_START->VMALLOC_END).
> > 
> > 2. The TLB flush in purge_vmap_node() is for the KASAN shadow virtual address 
> > (the shadow offset 'CONFIG_KASAN_SHADOW_OFFSET' is defined in .config).
> > 
> Correct. It deals with a shadow region!
> 
> >
> > BTW, I found my first patch has the potential risk. We need to flush TLB of
> > the KASAN shadow virtual address firstly. Please see the following patch for
> > detail. (I put the comment in the following patch). The following patch
> > also works well on my 256-core machine.
> > 
> I noticed that and it would be my second question :)
> 
> >
> > If you're ok with the patch, I'll submit it for upstream review. And, may I
> > have your tag(s): tested-by/reviewed-by? (If possible, could you please have
> > a test for the following patch).
> > 
> I am OK. I will test and get back soon.
> 
> > Thanks.
> > 
> > ---
> > diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> > index 70d6a8f6e25d..ddbf42a1a7b7 100644
> > --- a/include/linux/kasan.h
> > +++ b/include/linux/kasan.h
> > @@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
> >  int kasan_populate_early_shadow(const void *shadow_start,
> >  				const void *shadow_end);
> >  
> > +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
> > +#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
> > +
> >  #ifndef kasan_mem_to_shadow
> >  static inline void *kasan_mem_to_shadow(const void *addr)
> >  {
> > @@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
> >  int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
> >  void kasan_release_vmalloc(unsigned long start, unsigned long end,
> >  			   unsigned long free_region_start,
> > -			   unsigned long free_region_end);
> > +			   unsigned long free_region_end,
> > +			   unsigned long flags);
> >  
> >  #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
> >  
> > @@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
> >  static inline void kasan_release_vmalloc(unsigned long start,
> >  					 unsigned long end,
> >  					 unsigned long free_region_start,
> > -					 unsigned long free_region_end) { }
> > +					 unsigned long free_region_end,
> > +					 unsigned long flags) { }
> >  
> >  #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
> >  
> > @@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
> >  static inline void kasan_release_vmalloc(unsigned long start,
> >  					 unsigned long end,
> >  					 unsigned long free_region_start,
> > -					 unsigned long free_region_end) { }
> > +					 unsigned long free_region_end,
> > +					 unsigned long flags) { }
> >  
> >  static inline void *kasan_unpoison_vmalloc(const void *start,
> >  					   unsigned long size,
> > diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> > index d6210ca48dda..88d1c9dcb507 100644
> > --- a/mm/kasan/shadow.c
> > +++ b/mm/kasan/shadow.c
> > @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
> >   */
> >  void kasan_release_vmalloc(unsigned long start, unsigned long end,
> >  			   unsigned long free_region_start,
> > -			   unsigned long free_region_end)
> > +			   unsigned long free_region_end,
> > +			   unsigned long flags)
> >  {
> >  	void *shadow_start, *shadow_end;
> >  	unsigned long region_start, region_end;
> > @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
> >  			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
> >  			return;
> >  		}
> > -		apply_to_existing_page_range(&init_mm,
> > +
> > +
> > +		if (flags & KASAN_VMALLOC_PAGE_RANGE)
> > +			apply_to_existing_page_range(&init_mm,
> >  					     (unsigned long)shadow_start,
> >  					     size, kasan_depopulate_vmalloc_pte,
> >  					     NULL);
> > -		flush_tlb_kernel_range((unsigned long)shadow_start,
> > -				       (unsigned long)shadow_end);
> > +
> > +		if (flags & KASAN_VMALLOC_TLB_FLUSH)
> > +			flush_tlb_kernel_range((unsigned long)shadow_start,
> > +					       (unsigned long)shadow_end);
> >  	}
> >  }
> >  
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index e34ea860153f..12cdc92cdb83 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2193,8 +2193,22 @@ static void purge_vmap_node(struct work_struct *work)
> >  	struct vmap_area *va, *n_va;
> >  	LIST_HEAD(local_list);
remove the space.
> >  
> > +	unsigned long start;
> > +	unsigned long end;
> > +
> >  	vn->nr_purged = 0;
> >  
> > +	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
no need to have an extra space.
> > +
> > +	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
> > +
> > +	/*
> > +	 * Since node_pool_add_va() returns vmap_area(s) to its pool, the
> > +	 * returned vmap_area(s) might be grabbed immediately via node_alloc()
> > +	 * by other core. We need to flush TLB firstly.
> > +	 */
> > +	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> > +
> >  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
> >  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
> >  		unsigned long orig_start = va->va_start;
> > @@ -2205,7 +2219,8 @@ static void purge_vmap_node(struct work_struct *work)
> >  
> >  		if (is_vmalloc_or_module_addr((void *)orig_start))
> >  			kasan_release_vmalloc(orig_start, orig_end,
> > -					      va->va_start, va->va_end);
> > +					      va->va_start, va->va_end,
> > +					      KASAN_VMALLOC_PAGE_RANGE);
>
orig_start and orig_end are unnecessary now, but they can be removed in a
separate follow-up patch!

> >  
> >  		atomic_long_sub(nr, &vmap_lazy_nr);
> >  		vn->nr_purged++;
> > @@ -4726,7 +4741,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> >  				&free_vmap_area_list);
> >  		if (va)
> >  			kasan_release_vmalloc(orig_start, orig_end,
> > -				va->va_start, va->va_end);
> > +				va->va_start, va->va_end,
> > +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
> >  		vas[area] = NULL;
> >  	}
> >  
> > @@ -4776,7 +4792,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> >  				&free_vmap_area_list);
> >  		if (va)
> >  			kasan_release_vmalloc(orig_start, orig_end,
> > -				va->va_start, va->va_end);
> > +				va->va_start, va->va_end,
> > +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
> >  		vas[area] = NULL;
> >  		kfree(vms[area]);
> >  	}
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Tested-by: Uladzislau Rezki (Sony) <urezki@gmail.com>

Thank you.

--
Uladzislau Rezki
Uladzislau Rezki July 24, 2024, 7:27 p.m. UTC | #11
On Wed, Jul 24, 2024 at 05:09:59PM +0200, Uladzislau Rezki wrote:
> On Wed, Jul 24, 2024 at 04:32:37PM +0200, Uladzislau Rezki wrote:
> > On Wed, Jul 24, 2024 at 08:46:24PM +0800, Adrian Huang wrote:
> > > > It works great and does not generate the soft-lock-up splat :)
> > > > See below some comments:
> > > 
> > > Great. Thanks for the confirmation.
> > > 
> > > <snip>
> > > 
> > > >> +     kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> > > >> +
> > > >>
> > > > Do we need it here? We just did the TLB flush for en entire range in the
> > > > __purge_vmap_area_lazy(). So, it is two times invoked and looks odd to me.
> > > >
> > > > Am i missing something?
> > > 
> > > 1. The TLB flush for the entire range in __purge_vmap_area_lazy() is for
> > > the vmalloc virtual address (VMALLOC_START->VMALLOC_END).
> > > 
> > > 2. The TLB flush in purge_vmap_node() is for the KASAN shadow virtual address 
> > > (the shadow offset 'CONFIG_KASAN_SHADOW_OFFSET' is defined in .config).
> > > 
> > Correct. It deals with a shadow region!
> > 
> > >
> > > BTW, I found my first patch has the potential risk. We need to flush TLB of
> > > the KASAN shadow virtual address firstly. Please see the following patch for
> > > detail. (I put the comment in the following patch). The following patch
> > > also works well on my 256-core machine.
> > > 
> > I noticed that and it would be my second question :)
> > 
> > >
> > > If you're ok with the patch, I'll submit it for upstream review. And, may I
> > > have your tag(s): tested-by/reviewed-by? (If possible, could you please have
> > > a test for the following patch).
> > > 
> > I am OK. I will test and get back soon.
> > 
> > > Thanks.
> > > 
> > > ---
> > > diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> > > index 70d6a8f6e25d..ddbf42a1a7b7 100644
> > > --- a/include/linux/kasan.h
> > > +++ b/include/linux/kasan.h
> > > @@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
> > >  int kasan_populate_early_shadow(const void *shadow_start,
> > >  				const void *shadow_end);
> > >  
> > > +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
> > > +#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
> > > +
> > >  #ifndef kasan_mem_to_shadow
> > >  static inline void *kasan_mem_to_shadow(const void *addr)
> > >  {
> > > @@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
> > >  int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
> > >  void kasan_release_vmalloc(unsigned long start, unsigned long end,
> > >  			   unsigned long free_region_start,
> > > -			   unsigned long free_region_end);
> > > +			   unsigned long free_region_end,
> > > +			   unsigned long flags);
> > >  
> > >  #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
> > >  
> > > @@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
> > >  static inline void kasan_release_vmalloc(unsigned long start,
> > >  					 unsigned long end,
> > >  					 unsigned long free_region_start,
> > > -					 unsigned long free_region_end) { }
> > > +					 unsigned long free_region_end,
> > > +					 unsigned long flags) { }
> > >  
> > >  #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
> > >  
> > > @@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
> > >  static inline void kasan_release_vmalloc(unsigned long start,
> > >  					 unsigned long end,
> > >  					 unsigned long free_region_start,
> > > -					 unsigned long free_region_end) { }
> > > +					 unsigned long free_region_end,
> > > +					 unsigned long flags) { }
> > >  
> > >  static inline void *kasan_unpoison_vmalloc(const void *start,
> > >  					   unsigned long size,
> > > diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> > > index d6210ca48dda..88d1c9dcb507 100644
> > > --- a/mm/kasan/shadow.c
> > > +++ b/mm/kasan/shadow.c
> > > @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
> > >   */
> > >  void kasan_release_vmalloc(unsigned long start, unsigned long end,
> > >  			   unsigned long free_region_start,
> > > -			   unsigned long free_region_end)
> > > +			   unsigned long free_region_end,
> > > +			   unsigned long flags)
> > >  {
> > >  	void *shadow_start, *shadow_end;
> > >  	unsigned long region_start, region_end;
> > > @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
> > >  			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
> > >  			return;
> > >  		}
> > > -		apply_to_existing_page_range(&init_mm,
> > > +
> > > +
> > > +		if (flags & KASAN_VMALLOC_PAGE_RANGE)
> > > +			apply_to_existing_page_range(&init_mm,
> > >  					     (unsigned long)shadow_start,
> > >  					     size, kasan_depopulate_vmalloc_pte,
> > >  					     NULL);
> > > -		flush_tlb_kernel_range((unsigned long)shadow_start,
> > > -				       (unsigned long)shadow_end);
> > > +
> > > +		if (flags & KASAN_VMALLOC_TLB_FLUSH)
> > > +			flush_tlb_kernel_range((unsigned long)shadow_start,
> > > +					       (unsigned long)shadow_end);
> > >  	}
> > >  }
> > >  
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index e34ea860153f..12cdc92cdb83 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -2193,8 +2193,22 @@ static void purge_vmap_node(struct work_struct *work)
> > >  	struct vmap_area *va, *n_va;
> > >  	LIST_HEAD(local_list);
> remove the space.
> > >  
> > > +	unsigned long start;
> > > +	unsigned long end;
> > > +
> > >  	vn->nr_purged = 0;
> > >  
> > > +	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
> no need to have an extra space.
> > > +
> > > +	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
> > > +
> > > +	/*
> > > +	 * Since node_pool_add_va() returns vmap_area(s) to its pool, the
> > > +	 * returned vmap_area(s) might be grabbed immediately via node_alloc()
> > > +	 * by other core. We need to flush TLB firstly.
> > > +	 */
> > > +	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> > > +
> > >  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
> > >  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
> > >  		unsigned long orig_start = va->va_start;
> > > @@ -2205,7 +2219,8 @@ static void purge_vmap_node(struct work_struct *work)
> > >  
> > >  		if (is_vmalloc_or_module_addr((void *)orig_start))
> > >  			kasan_release_vmalloc(orig_start, orig_end,
> > > -					      va->va_start, va->va_end);
> > > +					      va->va_start, va->va_end,
> > > +					      KASAN_VMALLOC_PAGE_RANGE);
> >
> orig_start and orig_end are unnecessary now. But it can be removed by
> an extra patch!
> 
> > >  
> > >  		atomic_long_sub(nr, &vmap_lazy_nr);
> > >  		vn->nr_purged++;
> > > @@ -4726,7 +4741,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> > >  				&free_vmap_area_list);
> > >  		if (va)
> > >  			kasan_release_vmalloc(orig_start, orig_end,
> > > -				va->va_start, va->va_end);
> > > +				va->va_start, va->va_end,
> > > +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
> > >  		vas[area] = NULL;
> > >  	}
> > >  
> > > @@ -4776,7 +4792,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> > >  				&free_vmap_area_list);
> > >  		if (va)
> > >  			kasan_release_vmalloc(orig_start, orig_end,
> > > -				va->va_start, va->va_end);
> > > +				va->va_start, va->va_end,
> > > +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
> > >  		vas[area] = NULL;
> > >  		kfree(vms[area]);
> > >  	}
> Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> Tested-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> 
I get: BUG: KASAN: vmalloc-out-of-bounds in long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]

[15579.900340] ==================================================================
[15579.900412] BUG: KASAN: vmalloc-out-of-bounds in long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
[15579.900459] Write of size 1 at addr ffffc901c0578000 by task vmalloc_test/2/49374

[15579.900506] CPU: 199 PID: 49374 Comm: vmalloc_test/2 Kdump: loaded Not tainted 6.10.0-rc5-00019-g4236f0255ea8-dirty #3450
[15579.900554] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[15579.900595] Call Trace:
[15579.900611]  <TASK>
[15579.900635]  dump_stack_lvl+0x53/0x70
[15579.900670]  print_address_description.constprop.0+0x2c/0x3a0
[15579.900701]  ? long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
[15579.900732]  print_report+0xb9/0x2b0
[15579.900752]  ? kasan_addr_to_slab+0xd/0xb0
[15579.900776]  ? long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
[15579.900806]  kasan_report+0xd3/0x110
[15579.900828]  ? long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
[15579.900860]  long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
[15579.900890]  ? ktime_get+0xa1/0x170
[15579.900910]  ? __pfx_long_busy_list_alloc_test+0x10/0x10 [test_vmalloc]
[15579.900943]  test_func+0x232/0x510 [test_vmalloc]
[15579.900970]  ? __pfx_test_func+0x10/0x10 [test_vmalloc]
[15579.900998]  ? __kthread_parkme+0x82/0x140
[15579.901022]  ? __pfx_test_func+0x10/0x10 [test_vmalloc]
[15579.901049]  kthread+0x2a5/0x370
[15579.901069]  ? __pfx_kthread+0x10/0x10
[15579.901091]  ret_from_fork+0x34/0x70
[15579.901113]  ? __pfx_kthread+0x10/0x10
[15579.901135]  ret_from_fork_asm+0x1a/0x30
[15579.901161]  </TASK>

[15579.901189] The buggy address belongs to the virtual mapping at
                [ffffc901c0578000, ffffc901c05dd000) created by:
                long_busy_list_alloc_test+0x8e/0x1c0 [test_vmalloc]

[15579.901281] The buggy address belongs to the physical page:
[15579.901309] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x50611a
[15579.901312] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
[15579.901317] raw: 0017ffffc0000000 0000000000000000 dead000000000122 0000000000000000
[15579.901320] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
[15579.901321] page dumped because: kasan: bad access detected

[15579.901335] Memory state around the buggy address:
[15579.901359]  ffffc901c0577f00: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[15579.901391]  ffffc901c0577f80: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[15579.901423] >ffffc901c0578000: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[15579.901455]                    ^
[15579.901474]  ffffc901c0578080: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[15579.901506]  ffffc901c0578100: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
[15579.901538] ==================================================================
[15579.902332] Disabling lock debugging due to kernel taint

after applying this patch. Let me check tomorrow whether it is a real BUG or
a side-effect of the patch.

--
Uladzislau Rezki
Uladzislau Rezki July 24, 2024, 8:34 p.m. UTC | #12
On Wed, Jul 24, 2024 at 09:27:46PM +0200, Uladzislau Rezki wrote:
> On Wed, Jul 24, 2024 at 05:09:59PM +0200, Uladzislau Rezki wrote:
> > On Wed, Jul 24, 2024 at 04:32:37PM +0200, Uladzislau Rezki wrote:
> > > On Wed, Jul 24, 2024 at 08:46:24PM +0800, Adrian Huang wrote:
> > > > > It works great and does not generate the soft-lock-up splat :)
> > > > > See below some comments:
> > > > 
> > > > Great. Thanks for the confirmation.
> > > > 
> > > > <snip>
> > > > 
> > > > >> +     kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> > > > >> +
> > > > >>
> > > > > Do we need it here? We just did the TLB flush for en entire range in the
> > > > > __purge_vmap_area_lazy(). So, it is two times invoked and looks odd to me.
> > > > >
> > > > > Am i missing something?
> > > > 
> > > > 1. The TLB flush for the entire range in __purge_vmap_area_lazy() is for
> > > > the vmalloc virtual address (VMALLOC_START->VMALLOC_END).
> > > > 
> > > > 2. The TLB flush in purge_vmap_node() is for the KASAN shadow virtual address 
> > > > (the shadow offset 'CONFIG_KASAN_SHADOW_OFFSET' is defined in .config).
> > > > 
> > > Correct. It deals with a shadow region!
> > > 
> > > >
> > > > BTW, I found my first patch has the potential risk. We need to flush TLB of
> > > > the KASAN shadow virtual address firstly. Please see the following patch for
> > > > detail. (I put the comment in the following patch). The following patch
> > > > also works well on my 256-core machine.
> > > > 
> > > I noticed that and it would be my second question :)
> > > 
> > > >
> > > > If you're ok with the patch, I'll submit it for upstream review. And, may I
> > > > have your tag(s): tested-by/reviewed-by? (If possible, could you please have
> > > > a test for the following patch).
> > > > 
> > > I am OK. I will test and get back soon.
> > > 
> > > > Thanks.
> > > > 
> > > > ---
> > > > diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> > > > index 70d6a8f6e25d..ddbf42a1a7b7 100644
> > > > --- a/include/linux/kasan.h
> > > > +++ b/include/linux/kasan.h
> > > > @@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
> > > >  int kasan_populate_early_shadow(const void *shadow_start,
> > > >  				const void *shadow_end);
> > > >  
> > > > +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
> > > > +#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
> > > > +
> > > >  #ifndef kasan_mem_to_shadow
> > > >  static inline void *kasan_mem_to_shadow(const void *addr)
> > > >  {
> > > > @@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
> > > >  int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
> > > >  void kasan_release_vmalloc(unsigned long start, unsigned long end,
> > > >  			   unsigned long free_region_start,
> > > > -			   unsigned long free_region_end);
> > > > +			   unsigned long free_region_end,
> > > > +			   unsigned long flags);
> > > >  
> > > >  #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
> > > >  
> > > > @@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
> > > >  static inline void kasan_release_vmalloc(unsigned long start,
> > > >  					 unsigned long end,
> > > >  					 unsigned long free_region_start,
> > > > -					 unsigned long free_region_end) { }
> > > > +					 unsigned long free_region_end,
> > > > +					 unsigned long flags) { }
> > > >  
> > > >  #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
> > > >  
> > > > @@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
> > > >  static inline void kasan_release_vmalloc(unsigned long start,
> > > >  					 unsigned long end,
> > > >  					 unsigned long free_region_start,
> > > > -					 unsigned long free_region_end) { }
> > > > +					 unsigned long free_region_end,
> > > > +					 unsigned long flags) { }
> > > >  
> > > >  static inline void *kasan_unpoison_vmalloc(const void *start,
> > > >  					   unsigned long size,
> > > > diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> > > > index d6210ca48dda..88d1c9dcb507 100644
> > > > --- a/mm/kasan/shadow.c
> > > > +++ b/mm/kasan/shadow.c
> > > > @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
> > > >   */
> > > >  void kasan_release_vmalloc(unsigned long start, unsigned long end,
> > > >  			   unsigned long free_region_start,
> > > > -			   unsigned long free_region_end)
> > > > +			   unsigned long free_region_end,
> > > > +			   unsigned long flags)
> > > >  {
> > > >  	void *shadow_start, *shadow_end;
> > > >  	unsigned long region_start, region_end;
> > > > @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
> > > >  			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
> > > >  			return;
> > > >  		}
> > > > -		apply_to_existing_page_range(&init_mm,
> > > > +
> > > > +
> > > > +		if (flags & KASAN_VMALLOC_PAGE_RANGE)
> > > > +			apply_to_existing_page_range(&init_mm,
> > > >  					     (unsigned long)shadow_start,
> > > >  					     size, kasan_depopulate_vmalloc_pte,
> > > >  					     NULL);
> > > > -		flush_tlb_kernel_range((unsigned long)shadow_start,
> > > > -				       (unsigned long)shadow_end);
> > > > +
> > > > +		if (flags & KASAN_VMALLOC_TLB_FLUSH)
> > > > +			flush_tlb_kernel_range((unsigned long)shadow_start,
> > > > +					       (unsigned long)shadow_end);
> > > >  	}
> > > >  }
> > > >  
> > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > index e34ea860153f..12cdc92cdb83 100644
> > > > --- a/mm/vmalloc.c
> > > > +++ b/mm/vmalloc.c
> > > > @@ -2193,8 +2193,22 @@ static void purge_vmap_node(struct work_struct *work)
> > > >  	struct vmap_area *va, *n_va;
> > > >  	LIST_HEAD(local_list);
> > remove the space.
> > > >  
> > > > +	unsigned long start;
> > > > +	unsigned long end;
> > > > +
> > > >  	vn->nr_purged = 0;
> > > >  
> > > > +	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
> > no need to have an extra space.
> > > > +
> > > > +	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
> > > > +
> > > > +	/*
> > > > +	 * Since node_pool_add_va() returns vmap_area(s) to its pool, the
> > > > +	 * returned vmap_area(s) might be grabbed immediately via node_alloc()
> > > > +	 * by other core. We need to flush TLB firstly.
> > > > +	 */
> > > > +	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
> > > > +
> > > >  	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
> > > >  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
> > > >  		unsigned long orig_start = va->va_start;
> > > > @@ -2205,7 +2219,8 @@ static void purge_vmap_node(struct work_struct *work)
> > > >  
> > > >  		if (is_vmalloc_or_module_addr((void *)orig_start))
> > > >  			kasan_release_vmalloc(orig_start, orig_end,
> > > > -					      va->va_start, va->va_end);
> > > > +					      va->va_start, va->va_end,
> > > > +					      KASAN_VMALLOC_PAGE_RANGE);
> > >
> > orig_start and orig_end are unnecessary now. But it can be removed by
> > an extra patch!
> > 
> > > >  
> > > >  		atomic_long_sub(nr, &vmap_lazy_nr);
> > > >  		vn->nr_purged++;
> > > > @@ -4726,7 +4741,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> > > >  				&free_vmap_area_list);
> > > >  		if (va)
> > > >  			kasan_release_vmalloc(orig_start, orig_end,
> > > > -				va->va_start, va->va_end);
> > > > +				va->va_start, va->va_end,
> > > > +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
> > > >  		vas[area] = NULL;
> > > >  	}
> > > >  
> > > > @@ -4776,7 +4792,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> > > >  				&free_vmap_area_list);
> > > >  		if (va)
> > > >  			kasan_release_vmalloc(orig_start, orig_end,
> > > > -				va->va_start, va->va_end);
> > > > +				va->va_start, va->va_end,
> > > > +				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
> > > >  		vas[area] = NULL;
> > > >  		kfree(vms[area]);
> > > >  	}
> > Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > Tested-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > 
> I get: BUG: KASAN: vmalloc-out-of-bounds in long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
> 
> [15579.900340] ==================================================================
> [15579.900412] BUG: KASAN: vmalloc-out-of-bounds in long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
> [15579.900459] Write of size 1 at addr ffffc901c0578000 by task vmalloc_test/2/49374
> 
> [15579.900506] CPU: 199 PID: 49374 Comm: vmalloc_test/2 Kdump: loaded Not tainted 6.10.0-rc5-00019-g4236f0255ea8-dirty #3450
> [15579.900554] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> [15579.900595] Call Trace:
> [15579.900611]  <TASK>
> [15579.900635]  dump_stack_lvl+0x53/0x70
> [15579.900670]  print_address_description.constprop.0+0x2c/0x3a0
> [15579.900701]  ? long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
> [15579.900732]  print_report+0xb9/0x2b0
> [15579.900752]  ? kasan_addr_to_slab+0xd/0xb0
> [15579.900776]  ? long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
> [15579.900806]  kasan_report+0xd3/0x110
> [15579.900828]  ? long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
> [15579.900860]  long_busy_list_alloc_test+0x195/0x1c0 [test_vmalloc]
> [15579.900890]  ? ktime_get+0xa1/0x170
> [15579.900910]  ? __pfx_long_busy_list_alloc_test+0x10/0x10 [test_vmalloc]
> [15579.900943]  test_func+0x232/0x510 [test_vmalloc]
> [15579.900970]  ? __pfx_test_func+0x10/0x10 [test_vmalloc]
> [15579.900998]  ? __kthread_parkme+0x82/0x140
> [15579.901022]  ? __pfx_test_func+0x10/0x10 [test_vmalloc]
> [15579.901049]  kthread+0x2a5/0x370
> [15579.901069]  ? __pfx_kthread+0x10/0x10
> [15579.901091]  ret_from_fork+0x34/0x70
> [15579.901113]  ? __pfx_kthread+0x10/0x10
> [15579.901135]  ret_from_fork_asm+0x1a/0x30
> [15579.901161]  </TASK>
> 
> [15579.901189] The buggy address belongs to the virtual mapping at
>                 [ffffc901c0578000, ffffc901c05dd000) created by:
>                 long_busy_list_alloc_test+0x8e/0x1c0 [test_vmalloc]
> 
> [15579.901281] The buggy address belongs to the physical page:
> [15579.901309] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x50611a
> [15579.901312] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
> [15579.901317] raw: 0017ffffc0000000 0000000000000000 dead000000000122 0000000000000000
> [15579.901320] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
> [15579.901321] page dumped because: kasan: bad access detected
> 
> [15579.901335] Memory state around the buggy address:
> [15579.901359]  ffffc901c0577f00: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
> [15579.901391]  ffffc901c0577f80: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
> [15579.901423] >ffffc901c0578000: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
> [15579.901455]                    ^
> [15579.901474]  ffffc901c0578080: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
> [15579.901506]  ffffc901c0578100: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8
> [15579.901538] ==================================================================
> [15579.902332] Disabling lock debugging due to kernel taint
> 
> after applying this patch. Let me check tomorrow if it is a real BUG or
> it is a side-effect of the patch.
> 
You can trigger that BUG by running the vmalloc test suite:

sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=15

I reworked your patch a bit to resolve the false-positive splat above:

<snip>
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 70d6a8f6e25d..ddbf42a1a7b7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -55,6 +55,9 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
+#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
+#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
+
 #ifndef kasan_mem_to_shadow
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
@@ -511,7 +514,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
 int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end);
+			   unsigned long free_region_end,
+			   unsigned long flags);
 
 #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -526,7 +530,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -561,7 +566,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 static inline void *kasan_unpoison_vmalloc(const void *start,
 					   unsigned long size,
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d6210ca48dda..88d1c9dcb507 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  */
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end)
+			   unsigned long free_region_end,
+			   unsigned long flags)
 {
 	void *shadow_start, *shadow_end;
 	unsigned long region_start, region_end;
@@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
 			return;
 		}
-		apply_to_existing_page_range(&init_mm,
+
+
+		if (flags & KASAN_VMALLOC_PAGE_RANGE)
+			apply_to_existing_page_range(&init_mm,
 					     (unsigned long)shadow_start,
 					     size, kasan_depopulate_vmalloc_pte,
 					     NULL);
-		flush_tlb_kernel_range((unsigned long)shadow_start,
-				       (unsigned long)shadow_end);
+
+		if (flags & KASAN_VMALLOC_TLB_FLUSH)
+			flush_tlb_kernel_range((unsigned long)shadow_start,
+					       (unsigned long)shadow_end);
 	}
 }
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 03b82fb8ecd3..ed5eb02de545 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2186,6 +2186,25 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 	reclaim_list_global(&decay_list);
 }
 
+static void
+kasan_release_vmalloc_node(struct vmap_node *vn)
+{
+	struct vmap_area *va;
+	unsigned long start, end;
+
+	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
+	list_for_each_entry(va, &vn->purge_list, list) {
+		if (is_vmalloc_or_module_addr((void *) va->va_start))
+			kasan_release_vmalloc(va->va_start, va->va_end,
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE);
+	}
+
+	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+}
+
 static void purge_vmap_node(struct work_struct *work)
 {
 	struct vmap_node *vn = container_of(work,
@@ -2193,20 +2212,17 @@ static void purge_vmap_node(struct work_struct *work)
 	struct vmap_area *va, *n_va;
 	LIST_HEAD(local_list);
 
+	if (kasan_enabled())
+		kasan_release_vmalloc_node(vn);
+
 	vn->nr_purged = 0;
 
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
-		unsigned long orig_start = va->va_start;
-		unsigned long orig_end = va->va_end;
 		unsigned int vn_id = decode_vn_id(va->flags);
 
 		list_del_init(&va->list);
 
-		if (is_vmalloc_or_module_addr((void *)orig_start))
-			kasan_release_vmalloc(orig_start, orig_end,
-					      va->va_start, va->va_end);
-
 		atomic_long_sub(nr, &vmap_lazy_nr);
 		vn->nr_purged++;
 
@@ -4717,7 +4733,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 	}
 
@@ -4767,7 +4784,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 		kfree(vms[area]);
 	}
<snip>

--
Uladzislau Rezki
diff mbox series

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d0cbdd7c1e5b..380bdc317c8d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2193,6 +2193,15 @@  static void purge_vmap_node(struct work_struct *work)
 	struct vmap_area *va, *n_va;
 	LIST_HEAD(local_list);
 
+	/*
+	 * Add a preemption point when KASAN is enabled. Each vmap_area of
+	 * a vmap node has to flush the TLB when its mapping is released,
+	 * which might be time-consuming work if lots of vmap nodes have
+	 * populated purge lists.
+	 */
+	if (kasan_enabled())
+		cond_resched();
+
 	vn->nr_purged = 0;
 
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {