[mm-unstable,v7,03/18] mm/khugepaged: add struct collapse_control

Message ID 20220706235936.2197195-4-zokeefe@google.com (mailing list archive)
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe July 6, 2022, 11:59 p.m. UTC
Modularize hugepage collapse by introducing struct collapse_control.
This structure serves to describe the properties of the requested
collapse, as well as serve as a local scratch pad to use during the
collapse itself.

Start by moving global per-node khugepaged statistics into this
new structure.  Note that this structure is still statically allocated
since CONFIG_NODES_SHIFT might be arbitrarily large, and stack-allocating
a MAX_NUMNODES-sized array could cause -Wframe-larger-than= errors.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 87 ++++++++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 37 deletions(-)
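
At a glance, the new struct is threaded through khugepaged's scan path as
follows (summarized from the diff below; khugepaged itself keeps using a
single, statically allocated instance):

    khugepaged()
      -> khugepaged_do_scan(&khugepaged_collapse_control)
           -> khugepaged_scan_mm_slot(pages, &hpage, cc)
                -> khugepaged_scan_pmd(mm, vma, address, hpage, cc)   /* anon       */
                -> khugepaged_scan_file(mm, file, pgoff, hpage, cc)   /* shmem/file */

Both scan functions reset and fill cc->node_load[], consult
khugepaged_scan_abort(node, cc), and pick the allocation node via
khugepaged_find_target_node(cc).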

Comments

Andrew Morton July 8, 2022, 9:01 p.m. UTC | #1
On Wed,  6 Jul 2022 16:59:21 -0700 "Zach O'Keefe" <zokeefe@google.com> wrote:

> Modularize hugepage collapse by introducing struct collapse_control.
> This structure serves to describe the properties of the requested
> collapse, as well as serve as a local scratch pad to use during the
> collapse itself.
> 
> Start by moving global per-node khugepaged statistics into this
> new structure.  Note that this structure is still statically allocated
> since CONFIG_NODES_SHIFT might be arbitrarily large, and stack-allocating
> a MAX_NUMNODES-sized array could cause -Wframe-larger-than= errors.
> 
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> ---
>  mm/khugepaged.c | 87 ++++++++++++++++++++++++++++---------------------
>  1 file changed, 50 insertions(+), 37 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 196eaadbf415..f1ef02d9fe07 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -85,6 +85,14 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
>  
>  #define MAX_PTE_MAPPED_THP 8
>  
> +struct collapse_control {
> +	/* Num pages scanned per node */
> +	int node_load[MAX_NUMNODES];

Does this actually need to be 32-bit?  Looking at the current code I'm
suspecting that khugepaged_node_load[] could be a ushort?

[And unsigned int would be more appropriate, but we always do that :(]
Zach O'Keefe July 11, 2022, 6:29 p.m. UTC | #2
On Fri, Jul 8, 2022 at 2:01 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Wed,  6 Jul 2022 16:59:21 -0700 "Zach O'Keefe" <zokeefe@google.com> wrote:
>
> > Modularize hugepage collapse by introducing struct collapse_control.
> > This structure serves to describe the properties of the requested
> > collapse, as well as serve as a local scratch pad to use during the
> > collapse itself.
> >
> > Start by moving global per-node khugepaged statistics into this
> > new structure.  Note that this structure is still statically allocated
> > since CONFIG_NODES_SHIFT might be arbitrarily large, and stack-allocating
> > a MAX_NUMNODES-sized array could cause -Wframe-larger-than= errors.
> >
> > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > ---
> >  mm/khugepaged.c | 87 ++++++++++++++++++++++++++++---------------------
> >  1 file changed, 50 insertions(+), 37 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 196eaadbf415..f1ef02d9fe07 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -85,6 +85,14 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
> >
> >  #define MAX_PTE_MAPPED_THP 8
> >
> > +struct collapse_control {
> > +     /* Num pages scanned per node */
> > +     int node_load[MAX_NUMNODES];
>
> Does this actually need to be 32-bit?  Looking at the current code I'm
> suspecting that khugepaged_node_load[] could be a ushort?
>
> [And unsigned int would be more appropriate, but we always do that :(]
>

Hey Andrew,

Thanks for taking the time to review, and good catch - I don't think
we need 32 bits.

Minimally, we just need to be able to hold the maximum value of
HPAGE_PMD_NR = 1 << (PMD_SHIFT - PAGE_SHIFT).

I'm not sure what arch/config options (that also use THP) produce the
minimum/maximum value here. I looked through most of the archs that
define PMD_SHIFT, and couldn't find an example where we'd need > 16
bits, with most cases still requiring > 8 bits. All the various
configs do get complicated though.

Is it acceptable to use u16, with an #error if HPAGE_PMD_ORDER >= 16?
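
Something like the following (untested sketch; exactly how the compile-time
check gets expressed would depend on how PMD_SHIFT expands on each arch):

#if HPAGE_PMD_ORDER >= 16
#error "u16 node_load[] cannot hold HPAGE_PMD_NR"
#endif

struct collapse_control {
	/* Num pages scanned per node */
	u16 node_load[MAX_NUMNODES];

	/* Last target selected in khugepaged_find_target_node() */
	int last_target_node;
};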

Thanks,
Zach
Andrew Morton July 11, 2022, 6:45 p.m. UTC | #3
On Mon, 11 Jul 2022 11:29:13 -0700 "Zach O'Keefe" <zokeefe@google.com> wrote:

> On Fri, Jul 8, 2022 at 2:01 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Wed,  6 Jul 2022 16:59:21 -0700 "Zach O'Keefe" <zokeefe@google.com> wrote:
> >
> > > Modularize hugepage collapse by introducing struct collapse_control.
> > > This structure serves to describe the properties of the requested
> > > collapse, as well as serve as a local scratch pad to use during the
> > > collapse itself.
> > >
> > > Start by moving global per-node khugepaged statistics into this
> > > new structure.  Note that this structure is still statically allocated
> > > since CONFIG_NODES_SHIFT might be arbitrarily large, and stack-allocating
> > > a MAX_NUMNODES-sized array could cause -Wframe-larger-than= errors.
> > >
> > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > ---
> > >  mm/khugepaged.c | 87 ++++++++++++++++++++++++++++---------------------
> > >  1 file changed, 50 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 196eaadbf415..f1ef02d9fe07 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -85,6 +85,14 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
> > >
> > >  #define MAX_PTE_MAPPED_THP 8
> > >
> > > +struct collapse_control {
> > > +     /* Num pages scanned per node */
> > > +     int node_load[MAX_NUMNODES];
> >
> > Does this actually need to be 32-bit?  Looking at the current code I'm
> > suspecting that khugepaged_node_load[] could be a ushort?
> >
> > [And unsigned int would be more appropriate, but we always do that :(]
> >
> 
> Hey Andrew,
> 
> Thanks for taking the time to review, and good catch - I don't think
> we need 32 bits.
> 
> Minimally, we just need to be able to hold the maximum value of
> HPAGE_PMD_NR = 1 << (PMD_SHIFT - PAGE_SHIFT).
> 
> I'm not sure what arch/config options (that also use THP) produce the
> minimum/maximum value here. I looked through most of the archs that
> define PMD_SHIFT, and couldn't find an example where we'd need > 16
> bits, with most cases still requiring > 8 bits. All the various
> configs do get complicated though.
> 
> Is it acceptable to use u16, with an #error if HPAGE_PMD_ORDER >= 16?

It might be ;)

It was just a thought - perhaps something which you or someone else
might choose to look at, but I don't think this work needs to be part
of the current series, unless the current series consumes egregious
amounts of memory.
Yang Shi July 11, 2022, 9:51 p.m. UTC | #4
On Mon, Jul 11, 2022 at 11:29 AM Zach O'Keefe <zokeefe@google.com> wrote:
>
> On Fri, Jul 8, 2022 at 2:01 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Wed,  6 Jul 2022 16:59:21 -0700 "Zach O'Keefe" <zokeefe@google.com> wrote:
> >
> > > Modularize hugepage collapse by introducing struct collapse_control.
> > > This structure serves to describe the properties of the requested
> > > collapse, as well as serve as a local scratch pad to use during the
> > > collapse itself.
> > >
> > > Start by moving global per-node khugepaged statistics into this
> > > new structure.  Note that this structure is still statically allocated
> > > since CONFIG_NODES_SHIFT might be arbitrarily large, and stack-allocating
> > > a MAX_NUMNODES-sized array could cause -Wframe-larger-than= errors.
> > >
> > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > ---
> > >  mm/khugepaged.c | 87 ++++++++++++++++++++++++++++---------------------
> > >  1 file changed, 50 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 196eaadbf415..f1ef02d9fe07 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -85,6 +85,14 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
> > >
> > >  #define MAX_PTE_MAPPED_THP 8
> > >
> > > +struct collapse_control {
> > > +     /* Num pages scanned per node */
> > > +     int node_load[MAX_NUMNODES];
> >
> > Does this actually need to be 32-bit?  Looking at the current code I'm
> > suspecting that khugepaged_node_load[] could be a ushort?
> >
> > [And unsigned int would be more appropriate, but we always do that :(]
> >
>
> Hey Andrew,
>
> Thanks for taking the time to review, and good catch - I don't think
> we need 32 bits.
>
> Minimally, we just need to be able to hold the maximum value of
> HPAGE_PMD_NR = 1 << (PMD_SHIFT - PAGE_SHIFT).
>
> I'm not sure what arch/config options (that also use THP) produce the
> minimum/maximum value here. I looked through most of the archs that
> define PMD_SHIFT, and couldn't find an example where we'd need > 16
> bits, with most cases still requiring > 8 bits. All the various
> configs do get complicated though.
>
> Is it acceptable to use u16, with an #error if HPAGE_PMD_ORDER >= 16?

Fine with me.

>
> Thanks,
> Zach
Zach O'Keefe July 12, 2022, 2:17 p.m. UTC | #5
On Jul 11 11:45, Andrew Morton wrote:
> On Mon, 11 Jul 2022 11:29:13 -0700 "Zach O'Keefe" <zokeefe@google.com> wrote:
> 
> > On Fri, Jul 8, 2022 at 2:01 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > On Wed,  6 Jul 2022 16:59:21 -0700 "Zach O'Keefe" <zokeefe@google.com> wrote:
> > >
> > > > Modularize hugepage collapse by introducing struct collapse_control.
> > > > This structure serves to describe the properties of the requested
> > > > collapse, as well as serve as a local scratch pad to use during the
> > > > collapse itself.
> > > >
> > > > Start by moving global per-node khugepaged statistics into this
> > > > new structure.  Note that this structure is still statically allocated
> > > > since CONFIG_NODES_SHIFT might be arbitrarily large, and stack-allocating
> > > > a MAX_NUMNODES-sized array could cause -Wframe-larger-than= errors.
> > > >
> > > > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > > > ---
> > > >  mm/khugepaged.c | 87 ++++++++++++++++++++++++++++---------------------
> > > >  1 file changed, 50 insertions(+), 37 deletions(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index 196eaadbf415..f1ef02d9fe07 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -85,6 +85,14 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
> > > >
> > > >  #define MAX_PTE_MAPPED_THP 8
> > > >
> > > > +struct collapse_control {
> > > > +     /* Num pages scanned per node */
> > > > +     int node_load[MAX_NUMNODES];
> > >
> > > Does this actually need to be 32-bit?  Looking at the current code I'm
> > > suspecting that khugepaged_node_load[] could be a ushort?
> > >
> > > [And unsigned int would be more appropriate, but we always do that :(]
> > >
> > 
> > Hey Andrew,
> > 
> > Thanks for taking the time to review, and good catch - I don't think
> > we need 32 bits.
> > 
> > Minimally, we just need to be able to hold the maximum value of
> > HPAGE_PMD_NR = 1 << (PMD_SHIFT - PAGE_SHIFT).
> > 
> > I'm not sure what arch/config options (that also use THP) produce the
> > minimum/maximum value here. I looked through most of the archs that
> > define PMD_SHIFT, and couldn't find an example where we'd need > 16
> > bits, with most cases still requiring > 8 bits. All the various
> > configs do get complicated though.
> > 
> > Is it acceptable to use u16, with an #error if HPAGE_PMD_ORDER >= 16?
> 
> It might be ;)
> 
> It was just a thought - perhaps something which you or someone else
> might choose to look at, but I don't think this work needs to be part
> of the current series, unless the current series consumes egregious
> amounts of memory.
> 

I think it makes sense. The reason we moved this struct to kmalloc was that
MAX_NUMNODES can be pretty large - so we might as well save a few bytes for a
pretty small change. Yang seems good with it, anyway :)
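
For a rough sense of scale (illustrative numbers only; the real values depend
on CONFIG_NODES_SHIFT):

	CONFIG_NODES_SHIFT = 10  =>  MAX_NUMNODES = 1 << 10 = 1024
	int node_load[MAX_NUMNODES] : 1024 * 4 bytes = 4096 bytes per collapse_control
	u16 node_load[MAX_NUMNODES] : 1024 * 2 bytes = 2048 bytes per collapse_control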

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 196eaadbf415..f1ef02d9fe07 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -85,6 +85,14 @@  static struct kmem_cache *mm_slot_cache __read_mostly;
 
 #define MAX_PTE_MAPPED_THP 8
 
+struct collapse_control {
+	/* Num pages scanned per node */
+	int node_load[MAX_NUMNODES];
+
+	/* Last target selected in khugepaged_find_target_node() */
+	int last_target_node;
+};
+
 /**
  * struct mm_slot - hash lookup from mm to mm_slot
  * @hash: hash collision list
@@ -735,9 +743,12 @@  static void khugepaged_alloc_sleep(void)
 	remove_wait_queue(&khugepaged_wait, &wait);
 }
 
-static int khugepaged_node_load[MAX_NUMNODES];
 
-static bool khugepaged_scan_abort(int nid)
+struct collapse_control khugepaged_collapse_control = {
+	.last_target_node = NUMA_NO_NODE,
+};
+
+static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
 {
 	int i;
 
@@ -749,11 +760,11 @@  static bool khugepaged_scan_abort(int nid)
 		return false;
 
 	/* If there is a count for this node already, it must be acceptable */
-	if (khugepaged_node_load[nid])
+	if (cc->node_load[nid])
 		return false;
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (!khugepaged_node_load[i])
+		if (!cc->node_load[i])
 			continue;
 		if (node_distance(nid, i) > node_reclaim_distance)
 			return true;
@@ -772,32 +783,31 @@  static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
 }
 
 #ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
+static int khugepaged_find_target_node(struct collapse_control *cc)
 {
-	static int last_khugepaged_target_node = NUMA_NO_NODE;
 	int nid, target_node = 0, max_value = 0;
 
 	/* find first node with max normal pages hit */
 	for (nid = 0; nid < MAX_NUMNODES; nid++)
-		if (khugepaged_node_load[nid] > max_value) {
-			max_value = khugepaged_node_load[nid];
+		if (cc->node_load[nid] > max_value) {
+			max_value = cc->node_load[nid];
 			target_node = nid;
 		}
 
 	/* do some balance if several nodes have the same hit record */
-	if (target_node <= last_khugepaged_target_node)
-		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
-				nid++)
-			if (max_value == khugepaged_node_load[nid]) {
+	if (target_node <= cc->last_target_node)
+		for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
+		     nid++)
+			if (max_value == cc->node_load[nid]) {
 				target_node = nid;
 				break;
 			}
 
-	last_khugepaged_target_node = target_node;
+	cc->last_target_node = target_node;
 	return target_node;
 }
 #else
-static int khugepaged_find_target_node(void)
+static int khugepaged_find_target_node(struct collapse_control *cc)
 {
 	return 0;
 }
@@ -1075,10 +1085,9 @@  static void collapse_huge_page(struct mm_struct *mm,
 	return;
 }
 
-static int khugepaged_scan_pmd(struct mm_struct *mm,
-			       struct vm_area_struct *vma,
-			       unsigned long address,
-			       struct page **hpage)
+static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			       unsigned long address, struct page **hpage,
+			       struct collapse_control *cc)
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
@@ -1098,7 +1107,7 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 		goto out;
 	}
 
-	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+	memset(cc->node_load, 0, sizeof(cc->node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
@@ -1164,16 +1173,16 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 
 		/*
 		 * Record which node the original page is from and save this
-		 * information to khugepaged_node_load[].
+		 * information to cc->node_load[].
 		 * Khugepaged will allocate hugepage from the node has the max
 		 * hit record.
 		 */
 		node = page_to_nid(page);
-		if (khugepaged_scan_abort(node)) {
+		if (khugepaged_scan_abort(node, cc)) {
 			result = SCAN_SCAN_ABORT;
 			goto out_unmap;
 		}
-		khugepaged_node_load[node]++;
+		cc->node_load[node]++;
 		if (!PageLRU(page)) {
 			result = SCAN_PAGE_LRU;
 			goto out_unmap;
@@ -1224,7 +1233,7 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret) {
-		node = khugepaged_find_target_node();
+		node = khugepaged_find_target_node(cc);
 		/* collapse_huge_page will return with the mmap_lock released */
 		collapse_huge_page(mm, address, hpage, node,
 				referenced, unmapped);
@@ -1879,8 +1888,9 @@  static void collapse_file(struct mm_struct *mm,
 	/* TODO: tracepoints */
 }
 
-static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
+				 pgoff_t start, struct page **hpage,
+				 struct collapse_control *cc)
 {
 	struct page *page = NULL;
 	struct address_space *mapping = file->f_mapping;
@@ -1891,7 +1901,7 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 
 	present = 0;
 	swap = 0;
-	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+	memset(cc->node_load, 0, sizeof(cc->node_load));
 	rcu_read_lock();
 	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
 		if (xas_retry(&xas, page))
@@ -1916,11 +1926,11 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 		}
 
 		node = page_to_nid(page);
-		if (khugepaged_scan_abort(node)) {
+		if (khugepaged_scan_abort(node, cc)) {
 			result = SCAN_SCAN_ABORT;
 			break;
 		}
-		khugepaged_node_load[node]++;
+		cc->node_load[node]++;
 
 		if (!PageLRU(page)) {
 			result = SCAN_PAGE_LRU;
@@ -1953,7 +1963,7 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-			node = khugepaged_find_target_node();
+			node = khugepaged_find_target_node(cc);
 			collapse_file(mm, file, start, hpage, node);
 		}
 	}
@@ -1961,8 +1971,9 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 	/* TODO: tracepoints */
 }
 #else
-static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm, struct file *file,
+				 pgoff_t start, struct page **hpage,
+				 struct collapse_control *cc)
 {
 	BUILD_BUG();
 }
@@ -1973,7 +1984,8 @@  static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 #endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
-					    struct page **hpage)
+					    struct page **hpage,
+					    struct collapse_control *cc)
 	__releases(&khugepaged_mm_lock)
 	__acquires(&khugepaged_mm_lock)
 {
@@ -2050,12 +2062,13 @@  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 
 				mmap_read_unlock(mm);
 				ret = 1;
-				khugepaged_scan_file(mm, file, pgoff, hpage);
+				khugepaged_scan_file(mm, file, pgoff, hpage,
+						     cc);
 				fput(file);
 			} else {
 				ret = khugepaged_scan_pmd(mm, vma,
 						khugepaged_scan.address,
-						hpage);
+						hpage, cc);
 			}
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2111,7 +2124,7 @@  static int khugepaged_wait_event(void)
 		kthread_should_stop();
 }
 
-static void khugepaged_do_scan(void)
+static void khugepaged_do_scan(struct collapse_control *cc)
 {
 	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
@@ -2132,7 +2145,7 @@  static void khugepaged_do_scan(void)
 		if (khugepaged_has_work() &&
 		    pass_through_head < 2)
 			progress += khugepaged_scan_mm_slot(pages - progress,
-							    &hpage);
+							    &hpage, cc);
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
@@ -2188,7 +2201,7 @@  static int khugepaged(void *none)
 	set_user_nice(current, MAX_NICE);
 
 	while (!kthread_should_stop()) {
-		khugepaged_do_scan();
+		khugepaged_do_scan(&khugepaged_collapse_control);
 		khugepaged_wait_work();
 	}