@@ -258,6 +258,13 @@ config ARCH_ENABLE_THP_MIGRATION
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+config SMO_NODE
+ bool "Enable per node control of Slab Movable Objects"
+ depends on SLUB && SYSFS
+ select SLUB_DEBUG
+ help
+ On NUMA systems, enable moving slab objects to and from a specified
+ node by writing to /sys/kernel/slab/<cache>/move.
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT
@@ -4336,6 +4336,106 @@ static void move_slab_page(struct page *page, void *scratch, int node)
s->migrate(s, vector, count, node, private);
}
+#ifdef CONFIG_SMO_NODE
+/*
+ * kmem_cache_move() - Attempt to move all slab objects.
+ * @s: The cache we are working on.
+ * @node: The node to move objects away from.
+ * @target_node: The node to move objects on to.
+ *
+ * Attempts to move all objects (partial slabs and full slabs) to target
+ * node.
+ *
+ * The work is done in three phases:
+ *  1. With the list_lock held, slab pages on @node's partial and full
+ *     lists that can be trylocked are marked frozen and moved to a private
+ *     list.  Empty partial slabs are discarded on the spot.  Pages that
+ *     cannot be trylocked are left where they are.
+ *  2. With the list_lock dropped, move_slab_page() is called for every
+ *     collected page that still has objects in use.
+ *  3. With the list_lock held again, each collected page is unfrozen and
+ *     either discarded (no objects in use) or put back on the full or
+ *     partial list according to its object count.
+ *
+ * If the cache has no migrate callback, or the scratch allocation fails,
+ * nothing is moved and only the slab count is returned.
+ *
+ * Context: Takes the list_lock.
+ * Return: The number of slabs remaining on node.
+ */
+static unsigned long kmem_cache_move(struct kmem_cache *s,
+ int node, int target_node)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+ LIST_HEAD(move_list);
+ struct page *page, *page2;
+ unsigned long flags;
+ void **scratch;
+
+ /* Objects can only be moved if the cache supplies a migrate callback. */
+ if (!s->migrate) {
+ pr_warn("%s SMO not enabled, cannot move objects\n", s->name);
+ goto out;
+ }
+
+ /* Scratch space handed to move_slab_page() for each page. */
+ scratch = alloc_scratch(s);
+ if (!scratch)
+ goto out;
+
+ /* Phase 1: collect candidate pages onto move_list under the list_lock. */
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ if (!slab_trylock(page))
+ /* Busy slab. Get out of the way */
+ continue;
+
+ if (page->inuse) {
+ list_move(&page->lru, &move_list);
+ /* Stop page being considered for allocations */
+ n->nr_partial--;
+ page->frozen = 1;
+
+ slab_unlock(page);
+ } else { /* Empty slab page */
+ /*
+ * NOTE(review): discard_slab() runs here with the
+ * list_lock held and interrupts disabled - confirm
+ * this is acceptable for the page free path.
+ */
+ list_del(&page->lru);
+ n->nr_partial--;
+ slab_unlock(page);
+ discard_slab(s, page);
+ }
+ }
+ /* Full slabs are taken as well; no nr_partial accounting applies to them. */
+ list_for_each_entry_safe(page, page2, &n->full, lru) {
+ if (!slab_trylock(page))
+ continue;
+
+ list_move(&page->lru, &move_list);
+ page->frozen = 1;
+ slab_unlock(page);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* Phase 2: move objects off the collected pages, list_lock not held. */
+ list_for_each_entry(page, &move_list, lru) {
+ if (page->inuse)
+ move_slab_page(page, scratch, target_node);
+ }
+ kfree(scratch);
+
+ /* Bail here to save taking the list_lock */
+ if (list_empty(&move_list))
+ goto out;
+
+ /* Inspect results and dispose of pages */
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry_safe(page, page2, &move_list, lru) {
+ list_del(&page->lru);
+ slab_lock(page);
+ page->frozen = 0;
+
+ if (page->inuse) {
+ /* Objects remain: put the page back on the matching list. */
+ if (page->inuse == page->objects) {
+ list_add(&page->lru, &n->full);
+ slab_unlock(page);
+ } else {
+ n->nr_partial++;
+ list_add_tail(&page->lru, &n->partial);
+ slab_unlock(page);
+ }
+ } else {
+ /* Every object was moved away: the page can be released. */
+ slab_unlock(page);
+ discard_slab(s, page);
+ }
+ }
+ spin_unlock_irqrestore(&n->list_lock, flags);
+out:
+ /* Reached on early exit too, so the count is reported even if nothing moved. */
+ return atomic_long_read(&n->nr_slabs);
+}
+#endif /* CONFIG_SMO_NODE */
+
/*
* kmem_cache_defrag() - Defragment node.
* @s: cache we are working on.
@@ -4450,6 +4550,32 @@ static unsigned long kmem_cache_defrag(struct kmem_cache *s,
return n->nr_partial;
}
+#ifdef CONFIG_SMO_NODE
+/*
+ * kmem_cache_move_to_node() - Gather a cache's objects onto one node.
+ * @s: The cache we are working on.
+ * @node: The node that should receive the objects.
+ *
+ * Walks every node in the N_NORMAL_MEMORY state other than @node and asks
+ * kmem_cache_move() to move that node's objects to @node.
+ *
+ * Return: The sum of the values returned by kmem_cache_move() for each
+ *         source node, i.e. the total number of slabs left on them.
+ */
+static unsigned long kmem_cache_move_to_node(struct kmem_cache *s, int node)
+{
+	unsigned long remaining = 0;
+	int src;
+
+	for_each_node_state(src, N_NORMAL_MEMORY) {
+		/* The destination itself is never used as a source. */
+		if (src != node)
+			remaining += kmem_cache_move(s, src, node);
+	}
+
+	return remaining;
+}
+#endif
+
/**
* kmem_defrag_slabs() - Defrag slab caches.
* @node: The node to defrag or -1 for all nodes.
@@ -5594,6 +5720,126 @@ static ssize_t shrink_store(struct kmem_cache *s,
}
SLAB_ATTR(shrink);
+#ifdef CONFIG_SMO_NODE
+/*
+ * move_show() - Read handler for the 'move' attribute.
+ * @s: The cache the attribute belongs to (unused).
+ * @buf: Output buffer (left untouched).
+ *
+ * Writes nothing to @buf and returns 0.
+ */
+static ssize_t move_show(struct kmem_cache *s, char *buf)
+{
+ return 0;
+}
+
+/*
+ * parse_move_store_input() - Parse buf getting integer arguments.
+ * @buf: Buffer to parse (NUL terminated).
+ * @length: Length of @buf.
+ * @arg0: Return parameter, first argument.
+ * @arg1: Return parameter, second argument.  Left untouched when the
+ *        input contains only one argument.
+ *
+ * Parses the input from user write to sysfs file 'move'.  Input string
+ * should contain either one or two node specifiers of form Nx where x
+ * is a non-negative integer specifying the NUMA node ID.  'N' or 'n' may
+ * be used, or omitted altogether.  A single trailing newline is accepted.
+ *
+ * e.g.
+ *     echo 'N1' > /sys/kernel/slab/<cache>/move
+ * or
+ *     echo 'N0 N2' > /sys/kernel/slab/<cache>/move
+ *
+ * Regex matching accepted forms: '[nN]?[0-9]+( [nN]?[0-9]+)?'
+ *
+ * FIXME: This is really fragile.  Input must be exactly correct,
+ *        spurious whitespace causes parse errors.
+ *
+ * @arg0 and @arg1 are only written when the whole input parsed cleanly.
+ *
+ * Return: 0 on success, -EINVAL if the input is missing, empty or holds a
+ *         negative node ID, -ENOMEM if the working copy cannot be
+ *         allocated, or the error returned by kstrtol().
+ */
+static ssize_t parse_move_store_input(const char *buf, size_t length,
+				      long *arg0, long *arg1)
+{
+	char *s, *save, *ptr;
+	bool have_second = false;
+	long first, second = -1;
+	size_t len;
+	int ret;
+
+	if (!buf || !length)
+		return -EINVAL;
+
+	s = kstrdup(buf, GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+	save = s;
+
+	/*
+	 * Index by the length of the copy rather than by @length: the copy
+	 * stops at the first NUL, so @length could point past its end.
+	 */
+	len = strlen(s);
+	if (len && s[len - 1] == '\n')
+		s[len - 1] = '\0';
+
+	ptr = strsep(&s, " ");
+	if (!ptr || *ptr == '\0') {
+		/* Nothing to parse is an error, not a silent success. */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (*ptr == 'N' || *ptr == 'n')
+		ptr++;
+	ret = kstrtol(ptr, 10, &first);
+	if (ret < 0)
+		goto out;
+	if (first < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* strsep() leaves @s pointing at the remainder, or NULL if none. */
+	if (s) {
+		if (*s == 'N' || *s == 'n')
+			s++;
+		ret = kstrtol(s, 10, &second);
+		if (ret < 0)
+			goto out;
+		if (second < 0) {
+			ret = -EINVAL;
+			goto out;
+		}
+		have_second = true;
+	}
+
+	*arg0 = first;
+	if (have_second)
+		*arg1 = second;
+	ret = 0;
+out:
+	kfree(save);
+	return ret;
+}
+
+/* Report whether @node is one of the nodes in the N_NORMAL_MEMORY state. */
+static bool is_valid_node(int node)
+{
+	bool found = false;
+	int candidate;
+
+	for_each_node_state(candidate, N_NORMAL_MEMORY) {
+		if (candidate == node) {
+			found = true;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * move_store() - Move objects between nodes.
+ * @s: The cache we are working on.
+ * @buf: String received.
+ * @length: Length of @buf.
+ *
+ * Writes to /sys/kernel/slab/<cache>/move are interpreted as follows:
+ *
+ *  echo "N1" > move       : Move all objects (from all nodes) to node 1.
+ *  echo "N0 N1" > move    : Move all objects from node 0 to node 1.
+ *
+ * 'N' may be omitted, e.g. echo "0 1" > move.
+ *
+ * A request naming a node that is not a valid memory node is rejected as
+ * a whole; in particular "N0 N9" with an invalid N9 is NOT treated as the
+ * single-argument form "N0".
+ *
+ * Return: @length if the request was accepted, the parser's error code if
+ *         the input could not be parsed, or -EINVAL if a node is invalid.
+ */
+static ssize_t move_store(struct kmem_cache *s, const char *buf, size_t length)
+{
+	long arg0 = -1;
+	long arg1 = -1;
+	int src, dst;
+	int ret;
+
+	ret = parse_move_store_input(buf, length, &arg0, &arg1);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Node IDs are ints.  Reject values that do not survive the
+	 * narrowing so that e.g. 1UL << 32 cannot alias node 0.
+	 */
+	src = arg0;
+	if (src != arg0 || !is_valid_node(src))
+		return -EINVAL;
+
+	/* -1 is the initial value, i.e. no second node was supplied. */
+	if (arg1 == -1) {
+		(void)kmem_cache_move_to_node(s, src);
+		return length;
+	}
+
+	dst = arg1;
+	if (dst != arg1 || !is_valid_node(dst))
+		return -EINVAL;
+
+	(void)kmem_cache_move(s, src, dst);
+	return length;
+}
+SLAB_ATTR(move);
+#endif /* CONFIG_SMO_NODE */
+
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
@@ -5718,6 +5964,9 @@ static struct attribute *slab_attrs[] = {
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
+#ifdef CONFIG_SMO_NODE
+ &move_attr.attr,
+#endif
&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
We have just implemented Slab Movable Objects (object migration). Currently object migration is used to defrag a cache. On NUMA systems it would be nice to be able to control the source and destination nodes when moving objects. Add CONFIG_SMO_NODE to guard this feature. CONFIG_SMO_NODE selects CONFIG_SLUB_DEBUG because we use the full list. Implement moving all objects (including those in full slabs) to a specific node. Expose this functionality to userspace via a sysfs entry. Add sysfs entry: /sys/kernel/slab/<cache>/move With this users get access to the following functionality: - Move all objects to specified node. echo "N1" > move - Move all objects from specified node to other specified node (from N1 -> to N2): echo "N1 N2" > move This also enables shrinking slabs on a specific node: echo "N1 N1" > move Signed-off-by: Tobin C. Harding <tobin@kernel.org> --- mm/Kconfig | 7 ++ mm/slub.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 256 insertions(+)