diff mbox series

[RFC,1,1/1] sched/numa: Hot VMA and shared VMA optimization

Message ID 88d16815ef4cc2b6c08b4bb713b25421b5589bc7.1710829750.git.raghavendra.kt@amd.com (mailing list archive)
State New
Headers show
Series [RFC,1,1/1] sched/numa: Hot VMA and shared VMA optimization | expand

Commit Message

Raghavendra K T March 22, 2024, 1:41 p.m. UTC
Optimizations are based on history of PIDs accessing VMA.

- Increase tasks' access history windows (PeterZ) from 2 to 4.
( This patch is from Peter Zijlstra <peterz@infradead.org>)

Idea: A task is allowed to scan a VMA if:
- VMA was very recently accessed as indicated by the latest
  access PIDs information (hot VMA).
- VMA is shared by more than 2 tasks. Here whole history of VMA's
access PIDs is considered using bitmap_weight().

Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
I will split the patchset and post it if we find this patchset useful
going forward. The first patch is from PeterZ.

 include/linux/mm.h       | 12 ++++++---
 include/linux/mm_types.h | 11 +++++---
 kernel/sched/fair.c      | 58 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 69 insertions(+), 12 deletions(-)

Comments

Chen Yu June 25, 2024, 2:20 p.m. UTC | #1
Hi Raghavendra,

On 2024-03-22 at 19:11:12 +0530, Raghavendra K T wrote:
> Optimizations are based on history of PIDs accessing VMA.
> 
> - Increase tasks' access history windows (PeterZ) from 2 to 4.
> ( This patch is from Peter Zijlstra <peterz@infradead.org>)
> 
> Idea: A task is allowed to scan a VMA if:
> - VMA was very recently accessed as indicated by the latest
>   access PIDs information (hot VMA).
> - VMA is shared by more than 2 tasks. Here whole history of VMA's
> access PIDs is considered using bitmap_weight().
> 
> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
> ---
> I will split the patset and post if we find this pathset useful
> going further. First patch is from PeterZ.
> 

This is a good direction, I think. We did an initial test using autonumabench
THREADLOCAL on a 240-CPU, 2-node system. It seems that this patch does not
show an obvious difference, but it shows a more stable result (less run-to-run
variance). We'll enable Sub-NUMA Clustering to see if there is any difference.
My understanding is that, if we can extend NR_ACCESS_PID_HIST further,
THREADLOCAL could see more benefit, as each thread has its own VMA. Or maybe
making the length of the VMA access history adaptive (rather than a fixed 4)
could be more flexible.
                                          numa_scan_orig    numa_scan_4_history
Min       syst-NUMA01_THREADLOCAL      388.47 (   0.00%)      397.43 (  -2.31%)
Min       elsp-NUMA01_THREADLOCAL       40.27 (   0.00%)       38.94 (   3.30%)
Amean     syst-NUMA01_THREADLOCAL      467.62 (   0.00%)      459.10 (   1.82%)
Amean     elsp-NUMA01_THREADLOCAL       42.20 (   0.00%)       44.84 (  -6.26%)
Stddev    syst-NUMA01_THREADLOCAL       74.11 (   0.00%)       60.90 (  17.81%)
CoeffVar  syst-NUMA01_THREADLOCAL       15.85 (   0.00%)       13.27 (  16.29%)
Max       syst-NUMA01_THREADLOCAL      535.36 (   0.00%)      519.21 (   3.02%)
Max       elsp-NUMA01_THREADLOCAL       43.96 (   0.00%)       56.33 ( -28.14%)
BAmean-50 syst-NUMA01_THREADLOCAL      388.47 (   0.00%)      397.43 (  -2.31%)
BAmean-50 elsp-NUMA01_THREADLOCAL       40.27 (   0.00%)       38.94 (   3.30%)
BAmean-95 syst-NUMA01_THREADLOCAL      433.75 (   0.00%)      429.05 (   1.08%)
BAmean-95 elsp-NUMA01_THREADLOCAL       41.31 (   0.00%)       39.09 (   5.39%)
BAmean-99 syst-NUMA01_THREADLOCAL      433.75 (   0.00%)      429.05 (   1.08%)
BAmean-99 elsp-NUMA01_THREADLOCAL       41.31 (   0.00%)       39.09 (   5.39%)

thanks,
Chenyu
Raghavendra K T June 26, 2024, 7:42 a.m. UTC | #2
On 6/25/2024 7:50 PM, Chen Yu wrote:
> Hi Raghavendra,
> 
> On 2024-03-22 at 19:11:12 +0530, Raghavendra K T wrote:
>> Optimizations are based on history of PIDs accessing VMA.
>>
>> - Increase tasks' access history windows (PeterZ) from 2 to 4.
>> ( This patch is from Peter Zijlstra <peterz@infradead.org>)
>>
>> Idea: A task is allowed to scan a VMA if:
>> - VMA was very recently accessed as indicated by the latest
>>    access PIDs information (hot VMA).
>> - VMA is shared by more than 2 tasks. Here whole history of VMA's
>> access PIDs is considered using bitmap_weight().
>>
>> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
>> ---
>> I will split the patset and post if we find this pathset useful
>> going further. First patch is from PeterZ.
>>
> 
> This is a good direction I think. We did an initial test using autonumabench
> THREADLOCAL on a 240 CPUs 2 nodes system. It seems that this patch does not
> show obvious difference, but it shows a more stable result(less run-to-run
> variance). We'll enable the Sub-Numa-Cluster to see if there is any difference.
> My understanding is that, if we can extend the NR_ACCESS_PID_HIST further,
> the THREADLOCAL could see more benefits, as each thread has its own VMA. Or maybe
> make the length of VMA access history adaptive(rather than a fixed 4) could be
> more flexible.
>                                            numa_scan_orig    numa_scan_4_history
> Min       syst-NUMA01_THREADLOCAL      388.47 (   0.00%)      397.43 (  -2.31%)
> Min       elsp-NUMA01_THREADLOCAL       40.27 (   0.00%)       38.94 (   3.30%)
> Amean     syst-NUMA01_THREADLOCAL      467.62 (   0.00%)      459.10 (   1.82%)
> Amean     elsp-NUMA01_THREADLOCAL       42.20 (   0.00%)       44.84 (  -6.26%)
> Stddev    syst-NUMA01_THREADLOCAL       74.11 (   0.00%)       60.90 (  17.81%)
> CoeffVar  syst-NUMA01_THREADLOCAL       15.85 (   0.00%)       13.27 (  16.29%)
> Max       syst-NUMA01_THREADLOCAL      535.36 (   0.00%)      519.21 (   3.02%)
> Max       elsp-NUMA01_THREADLOCAL       43.96 (   0.00%)       56.33 ( -28.14%)
> BAmean-50 syst-NUMA01_THREADLOCAL      388.47 (   0.00%)      397.43 (  -2.31%)
> BAmean-50 elsp-NUMA01_THREADLOCAL       40.27 (   0.00%)       38.94 (   3.30%)
> BAmean-95 syst-NUMA01_THREADLOCAL      433.75 (   0.00%)      429.05 (   1.08%)
> BAmean-95 elsp-NUMA01_THREADLOCAL       41.31 (   0.00%)       39.09 (   5.39%)
> BAmean-99 syst-NUMA01_THREADLOCAL      433.75 (   0.00%)      429.05 (   1.08%)
> BAmean-99 elsp-NUMA01_THREADLOCAL       41.31 (   0.00%)       39.09 (   5.39%)
> 

Thanks for the test and report. I will split the patches and also test
for N=6,8
(perhaps on top of your patch, to make sure we see further benefits).

Making it adaptive may be a little difficult. It is hard for me to imagine
how to assess dynamically which size is doing better. (I need to think
about this.)

Thanks and Regards
- Raghu
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..1bf1df064b60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1744,10 +1744,14 @@  static inline int folio_xchg_access_time(struct folio *folio, int time)
 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 {
 	unsigned int pid_bit;
-
-	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
-		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
+	unsigned long *pids, pid_idx;
+
+	if (vma->numab_state) {
+		pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+		pid_idx = READ_ONCE(vma->numab_state->pids_active_idx);
+		pids = vma->numab_state->pids_active + pid_idx;
+		if (!test_bit(pid_bit, pids))
+			__set_bit(pid_bit, pids);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..050ceef1e9d5 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -574,6 +574,7 @@  struct vma_lock {
 	struct rw_semaphore lock;
 };
 
+#define NR_ACCESS_PID_HIST	4
 struct vma_numab_state {
 	/*
 	 * Initialised as time in 'jiffies' after which VMA
@@ -588,17 +589,21 @@  struct vma_numab_state {
 	 */
 	unsigned long pids_active_reset;
 
+	/* Points to current active PID tracking index. */
+	unsigned long pids_active_idx;
+
 	/*
 	 * Approximate tracking of PIDs that trapped a NUMA hinting
 	 * fault. May produce false positives due to hash collisions.
 	 *
-	 *   [0] Previous PID tracking
-	 *   [1] Current PID tracking
+	 *   [pids_active_idx - 1] Previous PID tracking
+	 *   [pids_active_idx] Current PID tracking
 	 *
+	 * Whole array is used in a rotating manner to track latest PIDs.
 	 * Window moves after next_pid_reset has expired approximately
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
-	unsigned long pids_active[2];
+	unsigned long pids_active[NR_ACCESS_PID_HIST];
 
 	/* MM scan sequence ID when scan first started after VMA creation */
 	int start_scan_seq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..ed329b2f4d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3157,9 +3157,44 @@  static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
+static inline bool vma_test_access_pid_history(struct vm_area_struct *vma)
+{
+	unsigned int i, pid_bit;
+	unsigned long pids = 0;
+
+	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+
+	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+		pids  |= vma->numab_state->pids_active[i];
+
+	return test_bit(pid_bit, &pids);
+}
+
+static inline bool vma_accessed_recent(struct vm_area_struct *vma)
+{
+	unsigned long *pids, pid_idx;
+
+	pid_idx = vma->numab_state->pids_active_idx;
+	pids = vma->numab_state->pids_active + pid_idx;
+
+	return (bitmap_weight(pids, BITS_PER_LONG) >= 1);
+}
+
+#define SHARED_VMA_THRESH	3
+
+static inline bool vma_shared_access(struct vm_area_struct *vma)
+{
+	int i;
+	unsigned long pids = 0;
+
+	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+		pids  |= vma->numab_state->pids_active[i];
+
+	return (bitmap_weight(&pids, BITS_PER_LONG) >= SHARED_VMA_THRESH);
+}
+
 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	unsigned long pids;
 	/*
 	 * Allow unconditional access first two times, so that all the (pages)
 	 * of VMAs get prot_none fault introduced irrespective of accesses.
@@ -3169,8 +3204,16 @@  static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
 		return true;
 
-	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
-	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+	/* Check if the current task had historically accessed VMA. */
+	if (vma_test_access_pid_history(vma))
+		return true;
+
+	/* Check at least one task had accessed VMA recently. */
+	if (vma_accessed_recent(vma))
+		return true;
+
+	/* Check if VMA is shared by many tasks. */
+	if (vma_shared_access(vma))
 		return true;
 
 	/*
@@ -3202,6 +3245,7 @@  static void task_numa_work(struct callback_head *work)
 	unsigned long nr_pte_updates = 0;
 	long pages, virtpages;
 	struct vma_iterator vmi;
+	unsigned long pid_idx;
 	bool vma_pids_skipped;
 	bool vma_pids_forced = false;
 
@@ -3341,8 +3385,12 @@  static void task_numa_work(struct callback_head *work)
 				time_after(jiffies, vma->numab_state->pids_active_reset)) {
 			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
-			vma->numab_state->pids_active[1] = 0;
+
+			pid_idx = vma->numab_state->pids_active_idx;
+			pid_idx = (pid_idx + 1) % NR_ACCESS_PID_HIST;
+
+			vma->numab_state->pids_active_idx = pid_idx;
+			vma->numab_state->pids_active[pid_idx] = 0;
 		}
 
 		/* Do not rescan VMAs twice within the same sequence. */