@@ -20,6 +20,34 @@ Description: Weight configuration interface for nodeN
Minimum weight: 1
Maximum weight: 255
- Writing an empty string or `0` will reset the weight to the
- system default. The system default may be set by the kernel
- or drivers at boot or during hotplug events.
+		Writing invalid values (e.g. any value not in [1,255],
+		an empty string, ...) will return -EINVAL.
+
+		Writing a valid weight will also automatically switch the
+		system to manual mode.
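+
+		For example, assuming node0 is present, the following sets
+		its weight to 9 and switches the system to manual mode:
+
+		echo 9 > /sys/kernel/mm/mempolicy/weighted_interleave/node0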
+
+What: /sys/kernel/mm/mempolicy/weighted_interleave/auto
+Date: February 2025
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Auto-weighting configuration interface
+
+		Configuration mode for weighted interleave. Reading this file
+		returns 'true' if the system is in auto mode and 'false' if it
+		is in manual mode.
+
+ In auto mode, all node weights are re-calculated and overwritten
+ (visible via the nodeN interfaces) whenever new bandwidth data
+ is made available during either boot or hotplug events.
+
+ In manual mode, node weights can only be updated by the user.
+		Note that nodes onlined with previously set weights will
+		inherit those weights. Nodes whose weights were never set, or
+		that are onlined without bandwidth data, default to a weight
+		of 1.
+
+		Writing Y or 1 to the interface will enable auto mode, while
+		writing N or 0 will enable manual mode. All other strings are
+		rejected and will return -EINVAL.
+
+ Writing a new weight to a node directly via the nodeN interface
+ will also automatically update the system to manual mode.
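+
+		For example, auto mode can be re-enabled after a manual weight
+		change with:
+
+		echo Y > /sys/kernel/mm/mempolicy/weighted_interleave/auto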
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memory.h>
+#include <linux/mempolicy.h>
#include <linux/vmstat.h>
#include <linux/notifier.h>
#include <linux/node.h>
@@ -214,6 +215,14 @@ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord,
break;
}
}
+
+ /* When setting CPU access coordinates, update mempolicy */
+ if (access == ACCESS_COORDINATE_CPU) {
+ if (mempolicy_set_node_perf(nid, coord)) {
+ pr_info("failed to set mempolicy attrs for node %d\n",
+ nid);
+ }
+ }
}
EXPORT_SYMBOL_GPL(node_set_perf_attrs);
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
+#include <linux/node.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>
@@ -56,6 +57,11 @@ struct mempolicy {
} w;
};
+struct weighted_interleave_state {
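+	/* true when weights are derived automatically from node bandwidth data */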
+ bool mode_auto;
+	u8 iw_table[];	/* A NULL wi_state is treated as a table of all 1s */
+};
+
/*
* Support for managing mempolicy data objects (clone, copy, destroy)
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
@@ -178,6 +184,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
+extern int mempolicy_set_node_perf(unsigned int node,
+ struct access_coordinate *coords);
+
#else
struct mempolicy {};
@@ -109,6 +109,7 @@
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
@@ -139,31 +140,151 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (which cycle through
+ * nodes faster, giving a more even distribution) and large weights (which give
+ * smaller errors between actual bandwidth ratios and weight ratios). 32 has
+ * been found to be a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/* wi_state is RCU protected */
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * iw_table_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
*/
-static u8 __rcu *iw_table;
static DEFINE_MUTEX(iw_table_lock);
static u8 get_il_weight(int node)
{
- u8 *table;
- u8 weight;
+ u8 weight = 1;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- /* if no iw_table, use system default */
- weight = table ? table[node] : 1;
- /* if value in iw_table is 0, use system default */
- weight = weight ? weight : 1;
+ if (rcu_access_pointer(wi_state))
+ weight = rcu_dereference(wi_state)->iw_table[node];
rcu_read_unlock();
+
return weight;
}
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with iw_table_lock held.
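+ *
+ * Example with hypothetical numbers: bandwidths of 100 and 300 on a two-node
+ * system give shares of 25% and 75%, which scale to 8 and 24 out of
+ * weightiness=32 and reduce by their GCD (8) to final weights of 1 and 3.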
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, sum_iw = 0;
+ unsigned int scaling_factor = 1, iw_gcd = 1;
+ int nid;
+
+ /* Recalculate the bandwidth distribution given the new info */
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ for (nid = 0; nid < nr_node_ids; nid++) {
+ /* Set memoryless nodes' weights to 1 to prevent div/0 later */
+ if (!node_state(nid, N_MEMORY)) {
+ new_iw[nid] = 1;
+ continue;
+ }
+
+ scaling_factor = 100 * bw[nid];
+
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw > scaling_factor, then bw[nid] is less than
+ * 1% of the total bandwidth. Round up to 1%.
+ */
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ sum_iw += new_iw[nid];
+ }
+
+ /*
+ * Scale each node's share of the total bandwidth from percentages
+ * to whole numbers in the range [1, weightiness]
+ */
+ for_each_node_state(nid, N_MEMORY) {
+ scaling_factor = weightiness * new_iw[nid];
+ new_iw[nid] = max(scaling_factor / sum_iw, 1);
+ if (nid == 0)
+ iw_gcd = new_iw[0];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+
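+	/* Use the smaller of read and write bandwidth as the node's bandwidth */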
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&iw_table_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(unsigned int));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ /* wi_state not initialized yet; assume auto == true */
+ if (!rcu_access_pointer(wi_state))
+ goto reduce;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&iw_table_lock));
+ if (old_wi_state->mode_auto)
+ goto reduce;
+
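+	/* Manual mode: the new bandwidth was recorded, keep the user's weights */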
+ mutex_unlock(&iw_table_lock);
+ kfree(new_wi_state);
+ kfree(old_bw);
+ return 0;
+
+reduce:
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&iw_table_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ kfree(old_bw);
+
+ return 0;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -1988,34 +2109,33 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
u8 *table;
unsigned int weight_total = 0;
u8 weight;
- int nid;
+ int nid = 0;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
-	table = rcu_dereference(iw_table);
+	/* An uninitialized wi_state means every node weight defaults to 1 */
+	if (rcu_access_pointer(wi_state))
+		table = rcu_dereference(wi_state)->iw_table;
+	else
+		table = NULL;
/* calculate the total weight */
- for_each_node_mask(nid, nodemask) {
- /* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
- weight_total += weight;
- }
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
/* Calculate the node offset based on totals */
target = ilx % weight_total;
nid = first_node(nodemask);
while (target) {
/* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
+		weight = table ? table[nid] : 1;
if (target < weight)
break;
target -= weight;
nid = next_node_in(nid, nodemask);
}
rcu_read_unlock();
return nid;
}
@@ -2411,13 +2531,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
+ struct weighted_interleave_state *state;
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
- u8 *table, *weights, weight;
+ u8 *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
@@ -2467,17 +2588,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
return total_allocated;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- if (table)
- memcpy(weights, table, nr_node_ids);
- rcu_read_unlock();
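+	/* Snapshot the weight table; a missing wi_state means all weights are 1 */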
+ if (rcu_access_pointer(wi_state)) {
+ state = rcu_dereference(wi_state);
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
-	/* calculate total, detect system default usage */
+	/* calculate the total weight */
- for_each_node_mask(node, nodes) {
- if (!weights[node])
- weights[node] = 1;
+ for_each_node_mask(node, nodes)
weight_total += weights[node];
- }
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -3402,36 +3525,113 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
struct iw_node_attr *node_attr;
- u8 *new;
- u8 *old;
u8 weight = 0;
+ int i;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
-	if (count == 0 || sysfs_streq(buf, ""))
-		weight = 0;
-	else if (kstrtou8(buf, 0, &weight))
+	if (count == 0 || sysfs_streq(buf, "") ||
+	    kstrtou8(buf, 0, &weight) || weight == 0)
 		return -EINVAL;
- new = kzalloc(nr_node_ids, GFP_KERNEL);
- if (!new)
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
return -ENOMEM;
mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
+ if (rcu_access_pointer(wi_state)) {
+ old_wi_state = rcu_dereference_protected(wi_state,
lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
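+	/* Publish the new state; RCU readers see either the old or the new table */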
+ rcu_assign_pointer(wi_state, new_wi_state);
mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
return count;
}
static struct iw_node_attr **node_attrs;
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ if (rcu_access_pointer(wi_state))
+ wi_auto = rcu_dereference(wi_state)->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
+}
+
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
+ int i;
+
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
+	mutex_lock(&iw_table_lock);
+
+	/* Take the old state now so it can be freed once the new one is live */
+	if (rcu_access_pointer(wi_state))
+		old_wi_state = rcu_dereference_protected(wi_state,
+					lockdep_is_held(&iw_table_lock));
+
+	if (!input) {
+		/* Manual mode: keep the current weights (or default to 1) */
+		if (old_wi_state) {
+			memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+			       nr_node_ids * sizeof(u8));
+		} else {
+			for (i = 0; i < nr_node_ids; i++)
+				new_wi_state->iw_table[i] = 1;
+		}
+		goto update_wi_state;
+	}
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&iw_table_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
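+	/* Auto mode: recompute all weights from the recorded bandwidth data */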
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&iw_table_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
+
+static struct kobj_attribute wi_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
struct kobject *parent)
{
@@ -3489,6 +3689,15 @@ static int add_weight_node(int nid, struct kobject *wi_kobj)
return 0;
}
+static struct attribute *wi_default_attrs[] = {
+ &wi_attr.attr,
+ NULL
+};
+
+static const struct attribute_group wi_attr_group = {
+ .attrs = wi_default_attrs,
+};
+
static int add_weighted_interleave_group(struct kobject *root_kobj)
{
struct kobject *wi_kobj;
@@ -3505,6 +3714,13 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
return err;
}
+ err = sysfs_create_group(wi_kobj, &wi_attr_group);
+ if (err) {
+ pr_err("failed to add sysfs [auto]\n");
+ kobject_put(wi_kobj);
+ return err;
+ }
+
for_each_node_state(nid, N_POSSIBLE) {
err = add_weight_node(nid, wi_kobj);
if (err) {
@@ -3519,15 +3735,22 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
static void mempolicy_kobj_release(struct kobject *kobj)
{
- u8 *old;
+ struct weighted_interleave_state *old_wi_state;
mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
+ if (!rcu_access_pointer(wi_state)) {
+ mutex_unlock(&iw_table_lock);
+ goto out;
+ }
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&iw_table_lock));
+
+ rcu_assign_pointer(wi_state, NULL);
mutex_unlock(&iw_table_lock);
synchronize_rcu();
- kfree(old);
+ kfree(old_wi_state);
+out:
kfree(node_attrs);
kfree(kobj);
}