diff mbox series

[v2,2/4] mm/mempolicy: Support memory hotplug in weighted interleave

Message ID 20250312075628.648-2-rakie.kim@sk.com (mailing list archive)
State New
Headers show
Series [v2,1/4] mm/mempolicy: Fix memory leaks in mempolicy_sysfs_init() | expand

Commit Message

Rakie Kim March 12, 2025, 7:56 a.m. UTC
The weighted interleave policy distributes page allocations across multiple
NUMA nodes based on their performance weight, thereby optimizing memory
bandwidth utilization. The weight values for each node are configured
through sysfs.

Previously, the sysfs entries for configuring weighted interleave were only
created during initialization. This approach had several limitations:
- Sysfs entries were generated for all possible nodes at boot time,
  including nodes without memory, leading to unnecessary sysfs creation.
- Some memory devices transition to an online state after initialization,
  but the existing implementation failed to create sysfs entries for
  these dynamically added nodes. As a result, memory hotplugged nodes
  were not properly recognized by the weighed interleave mechanism.

To resolve these issues, this patch introduces two key improvements:
1) At initialization, only nodes that are online and have memory are
   recognized, preventing the creation of unnecessary sysfs entries.
2) Nodes that become available after initialization are dynamically
   detected and integrated through the memory hotplug mechanism.

With this enhancement, the weighted interleave policy now properly supports
memory hotplug, ensuring that newly added nodes are recognized and sysfs
entries are created accordingly.

Signed-off-by: Rakie Kim <rakie.kim@sk.com>
---
 mm/mempolicy.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

Comments

Gregory Price March 12, 2025, 4:03 p.m. UTC | #1
On Wed, Mar 12, 2025 at 04:56:25PM +0900, Rakie Kim wrote:
> The weighted interleave policy distributes page allocations across multiple
> NUMA nodes based on their performance weight, thereby optimizing memory
> bandwidth utilization. The weight values for each node are configured
> through sysfs.
> 
> Previously, the sysfs entries for configuring weighted interleave were only
> created during initialization. This approach had several limitations:
> - Sysfs entries were generated for all possible nodes at boot time,
>   including nodes without memory, leading to unnecessary sysfs creation.

It's not that it's unnecessary, it's that it allowed for configuration
of nodes which may not have memory now but may have memory in the
future.  This was not well documented.

> - Some memory devices transition to an online state after initialization,
>   but the existing implementation failed to create sysfs entries for
>   these dynamically added nodes. As a result, memory hotplugged nodes
>   were not properly recognized by the weighed interleave mechanism.
> 

The current system creates 1 node per N_POSSIBLE nodes, and since nodes
can't transition between possible and not-possible your claims here are
contradictory.

I think you mean that simply switching from N_POSSIBLE to N_MEMORY is
insufficient since nodes may transition in and out of the N_MEMORY
state.  Therefore this patch utilizes a hotplug callback to add and
remove sysfs entries based on whether a node is in the N_MEMORY set.

> To resolve these issues, this patch introduces two key improvements:
> 1) At initialization, only nodes that are online and have memory are
>    recognized, preventing the creation of unnecessary sysfs entries.
> 2) Nodes that become available after initialization are dynamically
>    detected and integrated through the memory hotplug mechanism.
> 
> With this enhancement, the weighted interleave policy now properly supports
> memory hotplug, ensuring that newly added nodes are recognized and sysfs
> entries are created accordingly.
>

It doesn't "support memory hotplug" so much as it "Minimizes weighted
interleave to exclude memoryless nodes".  

> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> ---
>  mm/mempolicy.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 42 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 1691748badb2..94efff89e0be 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -113,6 +113,7 @@
>  #include <asm/tlbflush.h>
>  #include <asm/tlb.h>
>  #include <linux/uaccess.h>
> +#include <linux/memory.h>
>  
>  #include "internal.h"
>  
> @@ -3489,9 +3490,38 @@ static int add_weight_node(int nid, struct kobject *wi_kobj)
>  	return 0;
>  }
>  
> +struct kobject *wi_kobj;
> +
> +static int wi_node_notifier(struct notifier_block *nb,
> +			       unsigned long action, void *data)
> +{
> +	int err;
> +	struct memory_notify *arg = data;
> +	int nid = arg->status_change_nid;
> +
> +	if (nid < 0)
> +		goto notifier_end;
> +
> +	switch(action) {
> +	case MEM_ONLINE:
> +		err = add_weight_node(nid, wi_kobj);
> +		if (err) {
> +			pr_err("failed to add sysfs [node%d]\n", nid);
> +			kobject_put(wi_kobj);
> +			return NOTIFY_BAD;
> +		}
> +		break;
> +	case MEM_OFFLINE:
> +		sysfs_wi_node_release(node_attrs[nid], wi_kobj);
> +		break;
> +	}

I'm fairly certain this logic is wrong.  If I add two memory blocks and
then remove one, would this logic not remove the sysfs entries despite
there being a block remaining?

> +
> +notifier_end:
> +	return NOTIFY_OK;
> +}
> +
>  static int add_weighted_interleave_group(struct kobject *root_kobj)
>  {
> -	struct kobject *wi_kobj;
>  	int nid, err;
>  
>  	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
> @@ -3505,16 +3535,23 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
>  		return err;
>  	}
>  
> -	for_each_node_state(nid, N_POSSIBLE) {
> +	for_each_online_node(nid) {
> +		if (!node_state(nid, N_MEMORY))

Rather than online node, why not just add for each N_MEMORY node -
regardless of if its memory is online or not?  If the memory is offline,
then it will be excluded from the weighted interleave mechanism by
nature of the node being invalid for allocations anyway.

> +			continue;
> +
>  		err = add_weight_node(nid, wi_kobj);
>  		if (err) {
>  			pr_err("failed to add sysfs [node%d]\n", nid);
> -			break;
> +			goto err_out;
>  		}
>  	}
> -	if (err)
> -		kobject_put(wi_kobj);
> +
> +	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
>  	return 0;
> +
> +err_out:
> +	kobject_put(wi_kobj);
> +	return err;
>  }
>  
>  static void mempolicy_kobj_release(struct kobject *kobj)
> -- 
> 2.34.1
>
diff mbox series

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1691748badb2..94efff89e0be 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,6 +113,7 @@ 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <linux/uaccess.h>
+#include <linux/memory.h>
 
 #include "internal.h"
 
@@ -3489,9 +3490,38 @@  static int add_weight_node(int nid, struct kobject *wi_kobj)
 	return 0;
 }
 
+struct kobject *wi_kobj;
+
+static int wi_node_notifier(struct notifier_block *nb,
+			       unsigned long action, void *data)
+{
+	int err;
+	struct memory_notify *arg = data;
+	int nid = arg->status_change_nid;
+
+	if (nid < 0)
+		goto notifier_end;
+
+	switch(action) {
+	case MEM_ONLINE:
+		err = add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			kobject_put(wi_kobj);
+			return NOTIFY_BAD;
+		}
+		break;
+	case MEM_OFFLINE:
+		sysfs_wi_node_release(node_attrs[nid], wi_kobj);
+		break;
+	}
+
+notifier_end:
+	return NOTIFY_OK;
+}
+
 static int add_weighted_interleave_group(struct kobject *root_kobj)
 {
-	struct kobject *wi_kobj;
 	int nid, err;
 
 	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
@@ -3505,16 +3535,23 @@  static int add_weighted_interleave_group(struct kobject *root_kobj)
 		return err;
 	}
 
-	for_each_node_state(nid, N_POSSIBLE) {
+	for_each_online_node(nid) {
+		if (!node_state(nid, N_MEMORY))
+			continue;
+
 		err = add_weight_node(nid, wi_kobj);
 		if (err) {
 			pr_err("failed to add sysfs [node%d]\n", nid);
-			break;
+			goto err_out;
 		}
 	}
-	if (err)
-		kobject_put(wi_kobj);
+
+	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
 	return 0;
+
+err_out:
+	kobject_put(wi_kobj);
+	return err;
 }
 
 static void mempolicy_kobj_release(struct kobject *kobj)