diff mbox series

[v4,3/3] mm/mempolicy: Support memory hotplug in weighted interleave

Message ID 20250401090901.1050-4-rakie.kim@sk.com
State Superseded
Headers show
Series Enhance sysfs handling for memory hotplug in weighted interleave | expand

Commit Message

Rakie Kim April 1, 2025, 9:08 a.m. UTC
The weighted interleave policy distributes page allocations across multiple
NUMA nodes based on their performance weight, thereby improving memory
bandwidth utilization. The weight values for each node are configured
through sysfs.

Previously, sysfs entries for configuring weighted interleave were created
for all possible nodes (N_POSSIBLE) at initialization, including nodes that
might not have memory. However, not all nodes in N_POSSIBLE are usable at
runtime, as some may remain memoryless or offline. As a result, sysfs
entries were created for unusable nodes, which could lead to
misconfiguration.

To address this issue, this patch modifies the sysfs creation logic to:
1) Limit sysfs entries to nodes that are online and have memory, avoiding
   the creation of sysfs entries for nodes that cannot be used.
2) Support memory hotplug by dynamically adding and removing sysfs entries
   based on whether a node transitions into or out of the N_MEMORY state.

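Additionally, the patch adds a mutex (kobj_lock) to struct sysfs_wi_group
that is held while a node's attribute slot is checked and claimed on add,
and while it is cleared on removal. When a node goes offline and no longer
has memory, its sysfs attribute is removed and freed, so no stale entry
persists; a request to add a node whose entry already exists is treated as
a no-op instead of creating a redundant entry.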

By making these changes, the weighted interleave policy exposes sysfs
entries only for nodes that currently have memory, so that only usable
nodes can be configured for interleaving, and the set of entries adapts
dynamically to memory hotplug events.

Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
---
 mm/mempolicy.c | 113 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 23 deletions(-)

Comments

Gregory Price April 1, 2025, 8:32 p.m. UTC | #1
On Tue, Apr 01, 2025 at 06:08:59PM +0900, Rakie Kim wrote:
>  static void sysfs_wi_release(struct kobject *wi_kobj)
> @@ -3464,35 +3477,84 @@ static const struct kobj_type wi_ktype = {
>  
>  static int sysfs_wi_node_add(int nid)
>  {
... snip ..
> +	mutex_lock(&wi_group->kobj_lock);
> +	if (wi_group->nattrs[nid]) {
> +		mutex_unlock(&wi_group->kobj_lock);
> +		pr_info("Node [%d] already exists\n", nid);
> +		kfree(new_attr);
> +		kfree(name);
> +		return 0;
> +	}
>  
> -	if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) {
> -		kfree(node_attr->kobj_attr.attr.name);
> -		kfree(node_attr);
> -		pr_err("failed to add attribute to weighted_interleave\n");
> -		return -ENOMEM;
> +	wi_group->nattrs[nid] = new_attr;
> +	mutex_unlock(&wi_group->kobj_lock);
> +

Shouldn't all of this
vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
> +	sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr);
> +	wi_group->nattrs[nid]->kobj_attr.attr.name = name;
> +	wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644;
> +	wi_group->nattrs[nid]->kobj_attr.show = node_show;
> +	wi_group->nattrs[nid]->kobj_attr.store = node_store;
> +	wi_group->nattrs[nid]->nid = nid;
> +
> +	ret = sysfs_create_file(&wi_group->wi_kobj,
> +				&wi_group->nattrs[nid]->kobj_attr.attr);
> +	if (ret) {
> +		kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
> +		kfree(wi_group->nattrs[nid]);
> +		wi_group->nattrs[nid] = NULL;
> +		pr_err("Failed to add attribute to weighted_interleave: %d\n", ret);
>  	}
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Be happening inside the lock as well?

> +
> +	switch(action) {
> +	case MEM_ONLINE:
> +		if (node_state(nid, N_MEMORY)) {

Hm, I see the issue here, ok, this node_state check isn't needed, as it
will always be true.  So this function needs to handle duplicates still.
                          vvv 
> +			err = sysfs_wi_node_add(nid);
> +			if (err) {
> +				pr_err("failed to add sysfs [node%d]\n", nid);
> +				return NOTIFY_BAD;
> +			}
> +		}
> +		break;
> +	case MEM_OFFLINE:
> +		if (!node_state(nid, N_MEMORY))

This check is good for the time being.

> +			sysfs_wi_node_release(nid);
> +		break;
> +	}
> +
> +notifier_end:
> +	return NOTIFY_OK;
>  }
>  
> 

But really I think we probably just want to change to build on top of this:
https://lore.kernel.org/all/20250401092716.537512-2-osalvador@suse.de/

And use register_node_notifier with NODE_BECAME_MEMORYLESS and NODE_BECAME_MEM_AWARE

~Gregory
Rakie Kim April 2, 2025, 1:28 a.m. UTC | #2
On Tue, 1 Apr 2025 16:32:42 -0400 Gregory Price <gourry@gourry.net> wrote:
> On Tue, Apr 01, 2025 at 06:08:59PM +0900, Rakie Kim wrote:
> >  static void sysfs_wi_release(struct kobject *wi_kobj)
> > @@ -3464,35 +3477,84 @@ static const struct kobj_type wi_ktype = {
> >  
> >  static int sysfs_wi_node_add(int nid)
> >  {
> ... snip ..
> > +	mutex_lock(&wi_group->kobj_lock);
> > +	if (wi_group->nattrs[nid]) {
> > +		mutex_unlock(&wi_group->kobj_lock);
> > +		pr_info("Node [%d] already exists\n", nid);
> > +		kfree(new_attr);
> > +		kfree(name);
> > +		return 0;
> > +	}
> >  
> > -	if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) {
> > -		kfree(node_attr->kobj_attr.attr.name);
> > -		kfree(node_attr);
> > -		pr_err("failed to add attribute to weighted_interleave\n");
> > -		return -ENOMEM;
> > +	wi_group->nattrs[nid] = new_attr;
> > +	mutex_unlock(&wi_group->kobj_lock);
> > +
> 
> Shouldn't all of this
> vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
> > +	sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr);
> > +	wi_group->nattrs[nid]->kobj_attr.attr.name = name;
> > +	wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644;
> > +	wi_group->nattrs[nid]->kobj_attr.show = node_show;
> > +	wi_group->nattrs[nid]->kobj_attr.store = node_store;
> > +	wi_group->nattrs[nid]->nid = nid;
> > +
> > +	ret = sysfs_create_file(&wi_group->wi_kobj,
> > +				&wi_group->nattrs[nid]->kobj_attr.attr);
> > +	if (ret) {
> > +		kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
> > +		kfree(wi_group->nattrs[nid]);
> > +		wi_group->nattrs[nid] = NULL;
> > +		pr_err("Failed to add attribute to weighted_interleave: %d\n", ret);
> >  	}
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> Be happening inside the lock as well?

I agree that applying your suggestion would make the code more robust.
I will update the patch to follow your recommendation.

> 
> > +
> > +	switch(action) {
> > +	case MEM_ONLINE:
> > +		if (node_state(nid, N_MEMORY)) {
> 
> Hm, I see the issue here, ok, this node_state check isn't needed, as it
> will always be true.  So this function needs to handle duplicates still.

Yes, you're right. The `node_state(nid, N_MEMORY)` check I added here is
redundant because it will always be true in this context, so I will remove
it. `sysfs_wi_node_add()` already returns early when the node's entry
exists, so duplicate add requests will still be handled.

>                           vvv 
> > +			err = sysfs_wi_node_add(nid);
> > +			if (err) {
> > +				pr_err("failed to add sysfs [node%d]\n", nid);
> > +				return NOTIFY_BAD;
> > +			}
> > +		}
> > +		break;
> > +	case MEM_OFFLINE:
> > +		if (!node_state(nid, N_MEMORY))
> 
> This check is good for the time being.

This check looks appropriate for now and I'll keep it as-is.

> 
> > +			sysfs_wi_node_release(nid);
> > +		break;
> > +	}
> > +
> > +notifier_end:
> > +	return NOTIFY_OK;
> >  }
> >  
> > 
> 
> But really I think we probably just want to change to build on top of this:
> https://lore.kernel.org/all/20250401092716.537512-2-osalvador@suse.de/
> 
> And use register_node_notifier with NODE_BECAME_MEMORYLESS and NODE_BECAME_MEM_AWARE
> 
> ~Gregory

Thank you for sharing the link regarding `node_notify`. I agree that the
mechanism you pointed out would be a better fit for this patch.

By using `register_node_notifier` with `NODE_BECAME_MEMORYLESS` and
`NODE_BECAME_MEM_AWARE`, we can avoid unnecessary callbacks and implement
this functionality more efficiently.

However, I think it would be better to apply the current patch first and
then update it to use `node_notify` once that support is finalized and
available upstream.

Rakie
diff mbox series

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3092a792bd28..fa755d20780c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,6 +113,7 @@ 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <linux/uaccess.h>
+#include <linux/memory.h>
 
 #include "internal.h"
 
@@ -3390,6 +3391,7 @@  struct iw_node_attr {
 
 struct sysfs_wi_group {
 	struct kobject wi_kobj;
+	struct mutex kobj_lock;
 	struct iw_node_attr *nattrs[];
 };
 
@@ -3439,13 +3441,24 @@  static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 static void sysfs_wi_node_release(int nid)
 {
-	if (!wi_group->nattrs[nid])
+	struct iw_node_attr *attr;
+
+	if (nid < 0 || nid >= nr_node_ids)
+		return;
+
+	mutex_lock(&wi_group->kobj_lock);
+	attr = wi_group->nattrs[nid];
+	if (!attr) {
+		mutex_unlock(&wi_group->kobj_lock);
 		return;
+	}
 
-	sysfs_remove_file(&wi_group->wi_kobj,
-			  &wi_group->nattrs[nid]->kobj_attr.attr);
-	kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
-	kfree(wi_group->nattrs[nid]);
+	wi_group->nattrs[nid] = NULL;
+	mutex_unlock(&wi_group->kobj_lock);
+
+	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+	kfree(attr->kobj_attr.attr.name);
+	kfree(attr);
 }
 
 static void sysfs_wi_release(struct kobject *wi_kobj)
@@ -3464,35 +3477,84 @@  static const struct kobj_type wi_ktype = {
 
 static int sysfs_wi_node_add(int nid)
 {
-	struct iw_node_attr *node_attr;
+	int ret = 0;
 	char *name;
+	struct iw_node_attr *new_attr = NULL;
 
-	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
-	if (!node_attr)
+	if (nid < 0 || nid >= nr_node_ids) {
+		pr_err("Invalid node id: %d\n", nid);
+		return -EINVAL;
+	}
+
+	new_attr = kzalloc(sizeof(struct iw_node_attr), GFP_KERNEL);
+	if (!new_attr)
 		return -ENOMEM;
 
 	name = kasprintf(GFP_KERNEL, "node%d", nid);
 	if (!name) {
-		kfree(node_attr);
+		kfree(new_attr);
 		return -ENOMEM;
 	}
 
-	sysfs_attr_init(&node_attr->kobj_attr.attr);
-	node_attr->kobj_attr.attr.name = name;
-	node_attr->kobj_attr.attr.mode = 0644;
-	node_attr->kobj_attr.show = node_show;
-	node_attr->kobj_attr.store = node_store;
-	node_attr->nid = nid;
+	mutex_lock(&wi_group->kobj_lock);
+	if (wi_group->nattrs[nid]) {
+		mutex_unlock(&wi_group->kobj_lock);
+		pr_info("Node [%d] already exists\n", nid);
+		kfree(new_attr);
+		kfree(name);
+		return 0;
+	}
 
-	if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) {
-		kfree(node_attr->kobj_attr.attr.name);
-		kfree(node_attr);
-		pr_err("failed to add attribute to weighted_interleave\n");
-		return -ENOMEM;
+	wi_group->nattrs[nid] = new_attr;
+	mutex_unlock(&wi_group->kobj_lock);
+
+	sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr);
+	wi_group->nattrs[nid]->kobj_attr.attr.name = name;
+	wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644;
+	wi_group->nattrs[nid]->kobj_attr.show = node_show;
+	wi_group->nattrs[nid]->kobj_attr.store = node_store;
+	wi_group->nattrs[nid]->nid = nid;
+
+	ret = sysfs_create_file(&wi_group->wi_kobj,
+				&wi_group->nattrs[nid]->kobj_attr.attr);
+	if (ret) {
+		kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
+		kfree(wi_group->nattrs[nid]);
+		wi_group->nattrs[nid] = NULL;
+		pr_err("Failed to add attribute to weighted_interleave: %d\n", ret);
 	}
 
-	wi_group->nattrs[nid] = node_attr;
-	return 0;
+	return ret;
+}
+
+static int wi_node_notifier(struct notifier_block *nb,
+			       unsigned long action, void *data)
+{
+	int err;
+	struct memory_notify *arg = data;
+	int nid = arg->status_change_nid;
+
+	if (nid < 0)
+		goto notifier_end;
+
+	switch(action) {
+	case MEM_ONLINE:
+		if (node_state(nid, N_MEMORY)) {
+			err = sysfs_wi_node_add(nid);
+			if (err) {
+				pr_err("failed to add sysfs [node%d]\n", nid);
+				return NOTIFY_BAD;
+			}
+		}
+		break;
+	case MEM_OFFLINE:
+		if (!node_state(nid, N_MEMORY))
+			sysfs_wi_node_release(nid);
+		break;
+	}
+
+notifier_end:
+	return NOTIFY_OK;
 }
 
 static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
@@ -3504,13 +3566,17 @@  static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
 		       GFP_KERNEL);
 	if (!wi_group)
 		return -ENOMEM;
+	mutex_init(&wi_group->kobj_lock);
 
 	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
 				   "weighted_interleave");
 	if (err)
 		goto err_out;
 
-	for_each_node_state(nid, N_POSSIBLE) {
+	for_each_online_node(nid) {
+		if (!node_state(nid, N_MEMORY))
+			continue;
+
 		err = sysfs_wi_node_add(nid);
 		if (err) {
 			pr_err("failed to add sysfs [node%d]\n", nid);
@@ -3518,6 +3584,7 @@  static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
 		}
 	}
 
+	hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
 	return 0;
 
 err_out: