Message ID | 20250402014906.1086-4-rakie.kim@sk.com (mailing list archive) |
---|---|
State | New |
Series | Enhance sysfs handling for memory hotplug in weighted interleave |
Hi Rakie,

This is to fix the following broken status.
https://lore.kernel.org/linux-mm/b8ac8654-92bd-4c08-a3fc-e28a7be5e0e6@sk.com

So we must add the following tag for this patch.
Fixes: fa3bea4e1f82 ("mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for
weighted interleaving")

Hi Gregory,

This patch is already in Andrew's tree. Is the current version okay for you?

Hi Andrew,

I'm not sure if this is going to the final version but could you please add this
patch to stable with Cc: <stable@vger.kernel.org>?
We might need to bring the whole series to avoid conflicts to stable tree.

Thanks,
Honggyu

On 4/2/2025 10:49 AM, Rakie Kim wrote:
> The weighted interleave policy distributes page allocations across multiple
> NUMA nodes based on their performance weight, thereby improving memory
> bandwidth utilization. The weight values for each node are configured
> through sysfs.
>
> Previously, sysfs entries for configuring weighted interleave were created
> for all possible nodes (N_POSSIBLE) at initialization, including nodes that
> might not have memory. However, not all nodes in N_POSSIBLE are usable at
> runtime, as some may remain memoryless or offline.
> This led to sysfs entries being created for unusable nodes, causing
> potential misconfiguration issues.
>
> To address this issue, this patch modifies the sysfs creation logic to:
> 1) Limit sysfs entries to nodes that are online and have memory, avoiding
> the creation of sysfs entries for nodes that cannot be used.
> 2) Support memory hotplug by dynamically adding and removing sysfs entries
> based on whether a node transitions into or out of the N_MEMORY state.
>
> Additionally, the patch ensures that sysfs attributes are properly managed
> when nodes go offline, preventing stale or redundant entries from persisting
> in the system.
>
> By making these changes, the weighted interleave policy now manages its
> sysfs entries more efficiently, ensuring that only relevant nodes are
> considered for interleaving, and dynamically adapting to memory hotplug
> events.
>
> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>
> ---
>  mm/mempolicy.c | 110 ++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 87 insertions(+), 23 deletions(-)
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 3092a792bd28..ea4f3f3f2cdd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -113,6 +113,7 @@
>  #include <asm/tlbflush.h>
>  #include <asm/tlb.h>
>  #include <linux/uaccess.h>
> +#include <linux/memory.h>
>
>  #include "internal.h"
>
> @@ -3390,6 +3391,7 @@ struct iw_node_attr {
>
>  struct sysfs_wi_group {
>      struct kobject wi_kobj;
> +    struct mutex kobj_lock;
>      struct iw_node_attr *nattrs[];
>  };
>
> @@ -3439,13 +3441,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
>
>  static void sysfs_wi_node_release(int nid)
>  {
> -    if (!wi_group->nattrs[nid])
> +    struct iw_node_attr *attr;
> +
> +    if (nid < 0 || nid >= nr_node_ids)
> +        return;
> +
> +    mutex_lock(&wi_group->kobj_lock);
> +    attr = wi_group->nattrs[nid];
> +    if (!attr) {
> +        mutex_unlock(&wi_group->kobj_lock);
>          return;
> +    }
> +
> +    wi_group->nattrs[nid] = NULL;
> +    mutex_unlock(&wi_group->kobj_lock);
>
> -    sysfs_remove_file(&wi_group->wi_kobj,
> -              &wi_group->nattrs[nid]->kobj_attr.attr);
> -    kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
> -    kfree(wi_group->nattrs[nid]);
> +    sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
> +    kfree(attr->kobj_attr.attr.name);
> +    kfree(attr);
>  }
>
>  static void sysfs_wi_release(struct kobject *wi_kobj)
> @@ -3464,35 +3477,81 @@ static const struct kobj_type wi_ktype = {
>
>  static int sysfs_wi_node_add(int nid)
>  {
> -    struct iw_node_attr *node_attr;
> +    int ret = 0;
>      char *name;
> +    struct iw_node_attr *new_attr = NULL;
>
> -    node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
> -    if (!node_attr)
> +    if (nid < 0 || nid >= nr_node_ids) {
> +        pr_err("Invalid node id: %d\n", nid);
> +        return -EINVAL;
> +    }
> +
> +    new_attr = kzalloc(sizeof(struct iw_node_attr), GFP_KERNEL);
> +    if (!new_attr)
>          return -ENOMEM;
>
>      name = kasprintf(GFP_KERNEL, "node%d", nid);
>      if (!name) {
> -        kfree(node_attr);
> +        kfree(new_attr);
>          return -ENOMEM;
>      }
>
> -    sysfs_attr_init(&node_attr->kobj_attr.attr);
> -    node_attr->kobj_attr.attr.name = name;
> -    node_attr->kobj_attr.attr.mode = 0644;
> -    node_attr->kobj_attr.show = node_show;
> -    node_attr->kobj_attr.store = node_store;
> -    node_attr->nid = nid;
> +    mutex_lock(&wi_group->kobj_lock);
> +    if (wi_group->nattrs[nid]) {
> +        mutex_unlock(&wi_group->kobj_lock);
> +        pr_info("Node [%d] already exists\n", nid);
> +        kfree(new_attr);
> +        kfree(name);
> +        return 0;
> +    }
> +    wi_group->nattrs[nid] = new_attr;
>
> -    if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) {
> -        kfree(node_attr->kobj_attr.attr.name);
> -        kfree(node_attr);
> -        pr_err("failed to add attribute to weighted_interleave\n");
> -        return -ENOMEM;
> +    sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr);
> +    wi_group->nattrs[nid]->kobj_attr.attr.name = name;
> +    wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644;
> +    wi_group->nattrs[nid]->kobj_attr.show = node_show;
> +    wi_group->nattrs[nid]->kobj_attr.store = node_store;
> +    wi_group->nattrs[nid]->nid = nid;
> +
> +    ret = sysfs_create_file(&wi_group->wi_kobj,
> +                &wi_group->nattrs[nid]->kobj_attr.attr);
> +    if (ret) {
> +        kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
> +        kfree(wi_group->nattrs[nid]);
> +        wi_group->nattrs[nid] = NULL;
> +        pr_err("Failed to add attribute to weighted_interleave: %d\n", ret);
>      }
> +    mutex_unlock(&wi_group->kobj_lock);
>
> -    wi_group->nattrs[nid] = node_attr;
> -    return 0;
> +    return ret;
> +}
> +
> +static int wi_node_notifier(struct notifier_block *nb,
> +               unsigned long action, void *data)
> +{
> +    int err;
> +    struct memory_notify *arg = data;
> +    int nid = arg->status_change_nid;
> +
> +    if (nid < 0)
> +        goto notifier_end;
> +
> +    switch(action) {
> +    case MEM_ONLINE:
> +        err = sysfs_wi_node_add(nid);
> +        if (err) {
> +            pr_err("failed to add sysfs [node%d]\n", nid);
> +            return NOTIFY_BAD;
> +        }
> +        break;
> +    case MEM_OFFLINE:
> +        if (!node_state(nid, N_MEMORY))
> +            sysfs_wi_node_release(nid);
> +        break;
> +    }
> +
> +notifier_end:
> +    return NOTIFY_OK;
>  }
>
>  static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
> @@ -3504,13 +3563,17 @@ static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
>                    GFP_KERNEL);
>      if (!wi_group)
>          return -ENOMEM;
> +    mutex_init(&wi_group->kobj_lock);
>
>      err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
>                     "weighted_interleave");
>      if (err)
>          goto err_out;
>
> -    for_each_node_state(nid, N_POSSIBLE) {
> +    for_each_online_node(nid) {
> +        if (!node_state(nid, N_MEMORY))
> +            continue;
> +
>          err = sysfs_wi_node_add(nid);
>          if (err) {
>              pr_err("failed to add sysfs [node%d]\n", nid);
> @@ -3518,6 +3581,7 @@ static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
>          }
>      }
>
> +    hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
>      return 0;
>
>  err_out:
On Wed, 2 Apr 2025 13:18:51 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:

> This is to fix the following broken status.
> https://lore.kernel.org/linux-mm/b8ac8654-92bd-4c08-a3fc-e28a7be5e0e6@sk.com
>
> So we must add the following tag for this patch.
> Fixes: fa3bea4e1f82 ("mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for
> weighted interleaving")
>
>
> Hi Gregory,
>
> This patch is already in Andrew's tree. Is the current version okay for you?
>
>
> Hi Andrew,
>
> I'm not sure if this is going to the final version but could you please add this
> patch to stable with Cc: <stable@vger.kernel.org>?
> We might need to bring the whole series to avoid conflicts to stable tree.

This is all rather confused.

Do we need to backport all three patches into -stable? If so, all three
should have Fixes:. Preferably they all have the same Fixes: so we
aren't backporting untested patch combinations.

I presently have:

Subject: mm/mempolicy: fix memory leaks in weighted interleave sysfs
Fixes: dce41f5ae253 ("mm/mempolicy: implement the sysfs-based weighted_interleave interface")
Cc: <stable@vger.kernel.org>

Subject: mm/mempolicy: support dynamic sysfs updates for weighted interleave
Cc: <stable@vger.kernel.org>

Subject: mm/mempolicy: support memory hotplug in weighted interleave
Fixes: fa3bea4e1f82 ("mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving")
Cc: <stable@vger.kernel.org>

[2/3] doesn't look like a fix.

Perhaps the whole thing should be redone: a two-patch bugfix series, each
with a Fixes: and a cc:stable and feature patch with neither a Fixes: nor a
cc:stable.
On Tue, Apr 01, 2025 at 09:34:39PM -0700, Andrew Morton wrote:
> > I'm not sure if this is going to the final version but could you please add this
> > patch to stable with Cc: <stable@vger.kernel.org>?
> > We might need to bring the whole series to avoid conflicts to stable tree.
>
> This is all rather confused.
>
> Do we need to backport all three patches into -stable? If so, all three
> should have Fixes:. Preferably they all have the same Fixes: so we
> aren't backporting untested patch combinations.
>

I'd just as soon leave the entire thing out of stable backports.

Only the first patch actually fixes a bug (very small memory leak
during system tear down, so it's not even a critical bug). This could
be added to stable.

Patches 2 and 3 aren't fixes, they're desired behavioral changes
to the feature. An interface being confusing doesn't mean it's broken.

~Gregory
On Wed, Apr 02, 2025 at 01:15:48AM -0400, Gregory Price wrote:
> On Tue, Apr 01, 2025 at 09:34:39PM -0700, Andrew Morton wrote:
> >
> > Do we need to backport all three patches into -stable? If so, all three
> > should have Fixes:. Preferably they all have the same Fixes: so we
> > aren't backporting untested patch combinations.
> >
>
> Patches 2 and 3 aren't fixes, they're desired behavioral changes
> to the feature. An interface being confusing doesn't mean it's broken.
>

Just some added clarity:

The behavioral change in sysfs goes from N_POSSIBLE nodes always being
exposed - to sysfs entries coming and going as nodes transition between
having memory and not having memory.

I can see the argument for wanting this backported as a fix to ensure
userspace software is consistent across LTS.

So for backporting: all or nothing - and all the fixes tags should be the same.

~Gregory
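To make the user-visible difference Gregory describes concrete, here is a small sketch (an illustration, not from the series) of a tool enumerating the weighted_interleave directory. Before the series the listing contained a node<N> entry for every possible node; with the series applied, entries appear and disappear as nodes gain or lose memory, so user space must not assume a fixed set.

/* Sketch: list the node<N> entries currently exposed under
 * /sys/kernel/mm/mempolicy/weighted_interleave/.  With this series the
 * set changes at runtime as nodes move into and out of N_MEMORY.
 */
#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *dir = "/sys/kernel/mm/mempolicy/weighted_interleave";
    struct dirent *de;
    DIR *d = opendir(dir);

    if (!d) {
        perror(dir);
        return 1;
    }
    while ((de = readdir(d)) != NULL) {
        if (strncmp(de->d_name, "node", 4) == 0)
            printf("%s\n", de->d_name);  /* node currently has memory */
    }
    closedir(d);
    return 0;
}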
Hi Gregory,

On 4/2/2025 2:15 PM, Gregory Price wrote:
> On Tue, Apr 01, 2025 at 09:34:39PM -0700, Andrew Morton wrote:
>>> I'm not sure if this is going to the final version but could you please add this
>>> patch to stable with Cc: <stable@vger.kernel.org>?
>>> We might need to bring the whole series to avoid conflicts to stable tree.
>>
>> This is all rather confused.
>>
>> Do we need to backport all three patches into -stable? If so, all three
>> should have Fixes:. Preferably they all have the same Fixes: so we
>> aren't backporting untested patch combinations.
>>
>
> I'd just as soon leave the entire thing out of stable backports.
>
> Only the first patch actually fixes a bug (very small memory leak
> during system tear down, so it's not even a critical bug). This could
> be added to stable.
>
> Patches 2 and 3 aren't fixes, they're desired behavioral changes
> to the feature. An interface being confusing doesn't mean it's broken.

I think those are fixes but I also agree that these are behavioral changes
as well. So I'm fine leaving these out of stable backports.

Hi Andrew,

I won't ask for changes so please keep the current status as is.
Sorry for the confusion.

Thanks,
Honggyu

>
> ~Gregory
On Wed, Apr 02, 2025 at 10:49:04AM +0900, Rakie Kim wrote:
> The weighted interleave policy distributes page allocations across multiple
> NUMA nodes based on their performance weight, thereby improving memory
> bandwidth utilization. The weight values for each node are configured
> through sysfs.
>
> Previously, sysfs entries for configuring weighted interleave were created
> for all possible nodes (N_POSSIBLE) at initialization, including nodes that
> might not have memory. However, not all nodes in N_POSSIBLE are usable at
> runtime, as some may remain memoryless or offline.
> This led to sysfs entries being created for unusable nodes, causing
> potential misconfiguration issues.
>
> To address this issue, this patch modifies the sysfs creation logic to:
> 1) Limit sysfs entries to nodes that are online and have memory, avoiding
> the creation of sysfs entries for nodes that cannot be used.
> 2) Support memory hotplug by dynamically adding and removing sysfs entries
> based on whether a node transitions into or out of the N_MEMORY state.
>
> Additionally, the patch ensures that sysfs attributes are properly managed
> when nodes go offline, preventing stale or redundant entries from persisting
> in the system.
>
> By making these changes, the weighted interleave policy now manages its
> sysfs entries more efficiently, ensuring that only relevant nodes are
> considered for interleaving, and dynamically adapting to memory hotplug
> events.
>
> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> Signed-off-by: Yunjeong Mun <yunjeong.mun@sk.com>

Sorry to jump in at v5, and maybe this was already discussed, but:

> +static int wi_node_notifier(struct notifier_block *nb,
> +               unsigned long action, void *data)
> +{
> +    int err;
> +    struct memory_notify *arg = data;
> +    int nid = arg->status_change_nid;
> +
> +    if (nid < 0)
> +        goto notifier_end;
> +
> +    switch(action) {
> +    case MEM_ONLINE:
> +        err = sysfs_wi_node_add(nid);
> +        if (err) {
> +            pr_err("failed to add sysfs [node%d]\n", nid);
> +            return NOTIFY_BAD;
> +        }
> +        break;
> +    case MEM_OFFLINE:
> +        if (!node_state(nid, N_MEMORY))
> +            sysfs_wi_node_release(nid);

This check is not needed.

If status_change_nid is different than NUMA_NO_NODE, it means that the memory
state of the numa node was effectively changed.
So doing:

 case MEM_OFFLINE:
     sysfs_wi_node_release(nid)

is enough.
On Wed, Apr 02, 2025 at 11:51:17AM +0200, Oscar Salvador wrote:
> > +    case MEM_OFFLINE:
> > +        if (!node_state(nid, N_MEMORY))
> > +            sysfs_wi_node_release(nid);
>
> This check is not needed.
>
> If status_change_nid is different than NUMA_NO_NODE, it means that the memory
> state of the numa node was effectively changed.
> So doing:
>
>  case MEM_OFFLINE:
>      sysfs_wi_node_release(nid)
>
> is enough.

offline_pages will call this callback unconditionally any time it's
called (and succeeds). If 2 dax devices online into the same node, it's
possible to offline one chunk of blocks without offlining the other
chunks of blocks - which is why we did the additional check.

~Gregory
On Wed, Apr 02, 2025 at 10:14:42AM -0400, Gregory Price wrote:
> On Wed, Apr 02, 2025 at 11:51:17AM +0200, Oscar Salvador wrote:
> > > +    case MEM_OFFLINE:
> > > +        if (!node_state(nid, N_MEMORY))
> > > +            sysfs_wi_node_release(nid);
> >
> > This check is not needed.
> >
> > If status_change_nid is different than NUMA_NO_NODE, it means that the memory
> > state of the numa node was effectively changed.
> > So doing:
> >
> >  case MEM_OFFLINE:
> >      sysfs_wi_node_release(nid)
> >
> > is enough.
>
> offline_pages will call this callback unconditionally any time it's
> called (and succeeds). If 2 dax devices online into the same node, it's

Yes, this callback will be called whenever {online,offline}_pages succeeds, but
status_change_nid will be != NUMA_NO_NODE IFF the node state changes.
And you already have the check

 if (nid < 0)
     goto out

at the beginning, which means that all {offline,online}_pages operations that
do not carry a numa node state change will be filtered out there.

Makes sense, or am I missing something?
On Wed, Apr 02, 2025 at 05:41:57PM +0200, Oscar Salvador wrote:
>
> Yes, this callback will be called whenever {online,offline}_pages succeeds, but
> status_change_nid will be != NUMA_NO_NODE IFF the node state changes.
> And you already have the check
>
>  if (nid < 0)
>      goto out
>
> at the beginning, which means that all {offline,online}_pages operations that
> do not carry a numa node state change will be filtered out there.
>
> Makes sense, or am I missing something?
>

Ah, you're quite right. That was difficult to see on the surface, so the
check is in fact superfluous. No need for an extra version, we can just add
a patch to squash and drop it.

~Gregory
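For reference, a sketch of what the notifier looks like once the check agreed above to be superfluous is dropped - this reflects the simplification discussed here, not the code as posted in this v5:

/* Sketch of the simplified callback per the discussion above: rely on
 * status_change_nid (nid < 0 is already filtered out), so MEM_OFFLINE no
 * longer re-checks node_state(nid, N_MEMORY).
 */
static int wi_node_notifier(struct notifier_block *nb,
                            unsigned long action, void *data)
{
    struct memory_notify *arg = data;
    int nid = arg->status_change_nid;
    int err;

    if (nid < 0)
        return NOTIFY_OK;  /* no node-level memory state change */

    switch (action) {
    case MEM_ONLINE:
        err = sysfs_wi_node_add(nid);
        if (err) {
            pr_err("failed to add sysfs [node%d]\n", nid);
            return NOTIFY_BAD;
        }
        break;
    case MEM_OFFLINE:
        /* the node just lost its last memory block, drop its entry */
        sysfs_wi_node_release(nid);
        break;
    }

    return NOTIFY_OK;
}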
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3092a792bd28..ea4f3f3f2cdd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,6 +113,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <linux/uaccess.h>
+#include <linux/memory.h>
 
 #include "internal.h"
 
@@ -3390,6 +3391,7 @@ struct iw_node_attr {
 
 struct sysfs_wi_group {
     struct kobject wi_kobj;
+    struct mutex kobj_lock;
     struct iw_node_attr *nattrs[];
 };
 
@@ -3439,13 +3441,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 static void sysfs_wi_node_release(int nid)
 {
-    if (!wi_group->nattrs[nid])
+    struct iw_node_attr *attr;
+
+    if (nid < 0 || nid >= nr_node_ids)
+        return;
+
+    mutex_lock(&wi_group->kobj_lock);
+    attr = wi_group->nattrs[nid];
+    if (!attr) {
+        mutex_unlock(&wi_group->kobj_lock);
         return;
+    }
+
+    wi_group->nattrs[nid] = NULL;
+    mutex_unlock(&wi_group->kobj_lock);
 
-    sysfs_remove_file(&wi_group->wi_kobj,
-              &wi_group->nattrs[nid]->kobj_attr.attr);
-    kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
-    kfree(wi_group->nattrs[nid]);
+    sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+    kfree(attr->kobj_attr.attr.name);
+    kfree(attr);
 }
 
 static void sysfs_wi_release(struct kobject *wi_kobj)
@@ -3464,35 +3477,81 @@ static const struct kobj_type wi_ktype = {
 
 static int sysfs_wi_node_add(int nid)
 {
-    struct iw_node_attr *node_attr;
+    int ret = 0;
     char *name;
+    struct iw_node_attr *new_attr = NULL;
 
-    node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
-    if (!node_attr)
+    if (nid < 0 || nid >= nr_node_ids) {
+        pr_err("Invalid node id: %d\n", nid);
+        return -EINVAL;
+    }
+
+    new_attr = kzalloc(sizeof(struct iw_node_attr), GFP_KERNEL);
+    if (!new_attr)
         return -ENOMEM;
 
     name = kasprintf(GFP_KERNEL, "node%d", nid);
     if (!name) {
-        kfree(node_attr);
+        kfree(new_attr);
         return -ENOMEM;
     }
 
-    sysfs_attr_init(&node_attr->kobj_attr.attr);
-    node_attr->kobj_attr.attr.name = name;
-    node_attr->kobj_attr.attr.mode = 0644;
-    node_attr->kobj_attr.show = node_show;
-    node_attr->kobj_attr.store = node_store;
-    node_attr->nid = nid;
+    mutex_lock(&wi_group->kobj_lock);
+    if (wi_group->nattrs[nid]) {
+        mutex_unlock(&wi_group->kobj_lock);
+        pr_info("Node [%d] already exists\n", nid);
+        kfree(new_attr);
+        kfree(name);
+        return 0;
+    }
+    wi_group->nattrs[nid] = new_attr;
 
-    if (sysfs_create_file(&wi_group->wi_kobj, &node_attr->kobj_attr.attr)) {
-        kfree(node_attr->kobj_attr.attr.name);
-        kfree(node_attr);
-        pr_err("failed to add attribute to weighted_interleave\n");
-        return -ENOMEM;
+    sysfs_attr_init(&wi_group->nattrs[nid]->kobj_attr.attr);
+    wi_group->nattrs[nid]->kobj_attr.attr.name = name;
+    wi_group->nattrs[nid]->kobj_attr.attr.mode = 0644;
+    wi_group->nattrs[nid]->kobj_attr.show = node_show;
+    wi_group->nattrs[nid]->kobj_attr.store = node_store;
+    wi_group->nattrs[nid]->nid = nid;
+
+    ret = sysfs_create_file(&wi_group->wi_kobj,
+                &wi_group->nattrs[nid]->kobj_attr.attr);
+    if (ret) {
+        kfree(wi_group->nattrs[nid]->kobj_attr.attr.name);
+        kfree(wi_group->nattrs[nid]);
+        wi_group->nattrs[nid] = NULL;
+        pr_err("Failed to add attribute to weighted_interleave: %d\n", ret);
     }
+    mutex_unlock(&wi_group->kobj_lock);
 
-    wi_group->nattrs[nid] = node_attr;
-    return 0;
+    return ret;
+}
+
+static int wi_node_notifier(struct notifier_block *nb,
+               unsigned long action, void *data)
+{
+    int err;
+    struct memory_notify *arg = data;
+    int nid = arg->status_change_nid;
+
+    if (nid < 0)
+        goto notifier_end;
+
+    switch(action) {
+    case MEM_ONLINE:
+        err = sysfs_wi_node_add(nid);
+        if (err) {
+            pr_err("failed to add sysfs [node%d]\n", nid);
+            return NOTIFY_BAD;
+        }
+        break;
+    case MEM_OFFLINE:
+        if (!node_state(nid, N_MEMORY))
+            sysfs_wi_node_release(nid);
+        break;
+    }
+
+notifier_end:
+    return NOTIFY_OK;
 }
 
 static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
@@ -3504,13 +3563,17 @@ static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
                   GFP_KERNEL);
     if (!wi_group)
         return -ENOMEM;
+    mutex_init(&wi_group->kobj_lock);
 
     err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
                    "weighted_interleave");
     if (err)
         goto err_out;
 
-    for_each_node_state(nid, N_POSSIBLE) {
+    for_each_online_node(nid) {
+        if (!node_state(nid, N_MEMORY))
+            continue;
+
         err = sysfs_wi_node_add(nid);
         if (err) {
             pr_err("failed to add sysfs [node%d]\n", nid);
@@ -3518,6 +3581,7 @@ static int add_weighted_interleave_group(struct kobject *mempolicy_kobj)
         }
     }
 
+    hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
     return 0;
 
 err_out: