diff mbox series

[RFC,3/7] mm: memory_hotplug: decouple memory_block size from section size.

Message ID 20210506152623.178731-4-zi.yan@sent.com (mailing list archive)
State New, archived
Headers show
Series Memory hotplug/hotremove at subsection size | expand

Commit Message

Zi Yan May 6, 2021, 3:26 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

To enable subsection memory online/offline, we need to remove the
assumption that memory_block size is greater than or equal to section size.

The following changes are made:
1. use (start_pfn, nr_pages) pair to specify memory_block size instead of
   start_section_nr.
2. calculate memory_block id using phys / memory_block_size_bytes()
   instead of section number.

The minimum memory_block size is instead set to the smaller of 128MB (the
old x86_64 section size) and the section size.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 drivers/base/memory.c  | 176 ++++++++++++++++++++---------------------
 drivers/base/node.c    |   2 +-
 include/linux/memory.h |   8 +-
 mm/memory_hotplug.c    |   6 +-
 4 files changed, 98 insertions(+), 94 deletions(-)
diff mbox series

Patch

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b31b3af5c490..141431eb64a4 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -50,19 +50,15 @@  int mhp_online_type_from_str(const char *str)
 
 static int sections_per_block;
 
-static inline unsigned long memory_block_id(unsigned long section_nr)
+static inline unsigned long phys_to_block_id(unsigned long phys)
 {
-	return section_nr / sections_per_block;
+	return phys / memory_block_size_bytes();
 }
 
 static inline unsigned long pfn_to_block_id(unsigned long pfn)
 {
-	return memory_block_id(pfn_to_section_nr(pfn));
-}
-
-static inline unsigned long phys_to_block_id(unsigned long phys)
-{
-	return pfn_to_block_id(PFN_DOWN(phys));
+	/* calculate using memory_block_size_bytes() */
+	return phys_to_block_id(PFN_PHYS(pfn));
 }
 
 static int memory_subsys_online(struct device *dev);
@@ -118,7 +114,7 @@  static ssize_t phys_index_show(struct device *dev,
 	struct memory_block *mem = to_memory_block(dev);
 	unsigned long phys_index;
 
-	phys_index = mem->start_section_nr / sections_per_block;
+	phys_index = pfn_to_section_nr(mem->start_pfn);
 
 	return sysfs_emit(buf, "%08lx\n", phys_index);
 }
@@ -171,8 +167,8 @@  int memory_notify(unsigned long val, void *v)
 
 static int memory_block_online(struct memory_block *mem)
 {
-	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
-	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+	unsigned long start_pfn = mem->start_pfn;
+	unsigned long nr_pages = mem->nr_pages;
 	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
 	struct zone *zone;
 	int ret;
@@ -212,8 +208,8 @@  static int memory_block_online(struct memory_block *mem)
 
 static int memory_block_offline(struct memory_block *mem)
 {
-	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
-	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+	unsigned long start_pfn = mem->start_pfn;
+	unsigned long nr_pages = mem->nr_pages;
 	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
 	struct zone *zone;
 	int ret;
@@ -260,7 +256,7 @@  memory_block_action(struct memory_block *mem, unsigned long action)
 		break;
 	default:
 		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
-		     "%ld\n", __func__, mem->start_section_nr, action, action);
+		     "%ld\n", __func__, mem->start_pfn, mem->nr_pages, action);
 		ret = -EINVAL;
 	}
 
@@ -366,7 +362,7 @@  static ssize_t phys_device_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	struct memory_block *mem = to_memory_block(dev);
-	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+	unsigned long start_pfn = mem->start_pfn;
 
 	return sysfs_emit(buf, "%d\n",
 			  arch_get_memory_phys_device(start_pfn));
@@ -390,8 +386,8 @@  static ssize_t valid_zones_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	struct memory_block *mem = to_memory_block(dev);
-	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
-	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+	unsigned long start_pfn = mem->start_pfn;
+	unsigned long nr_pages = mem->nr_pages;
 	struct zone *default_zone;
 	int len = 0;
 	int nid;
@@ -575,16 +571,6 @@  static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 	return mem;
 }
 
-/*
- * Called under device_hotplug_lock.
- */
-struct memory_block *find_memory_block(struct mem_section *section)
-{
-	unsigned long block_id = memory_block_id(__section_nr(section));
-
-	return find_memory_block_by_id(block_id);
-}
-
 static struct attribute *memory_memblk_attrs[] = {
 	&dev_attr_phys_index.attr,
 	&dev_attr_state.attr,
@@ -614,7 +600,7 @@  int register_memory(struct memory_block *memory)
 	int ret;
 
 	memory->dev.bus = &memory_subsys;
-	memory->dev.id = memory->start_section_nr / sections_per_block;
+	memory->dev.id = memory->start_pfn / (memory_block_size_bytes() >> PAGE_SHIFT);
 	memory->dev.release = memory_block_release;
 	memory->dev.groups = memory_memblk_attr_groups;
 	memory->dev.offline = memory->state == MEM_OFFLINE;
@@ -633,57 +619,89 @@  int register_memory(struct memory_block *memory)
 	return ret;
 }
 
-static int init_memory_block(unsigned long block_id, unsigned long state,
+static void unregister_memory(struct memory_block *memory)
+{
+	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
+		return;
+
+	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
+
+	/* drop the ref. we got via find_memory_block_by_id() */
+	put_device(&memory->dev);
+	device_unregister(&memory->dev);
+}
+
+static int init_memory_blocks(unsigned long start_pfn, unsigned long num_pages, unsigned long state,
 			     unsigned long nr_vmemmap_pages)
 {
 	struct memory_block *mem;
 	int ret = 0;
+	unsigned long block_nr_pages = memory_block_size_bytes() / PAGE_SIZE;
+	unsigned long block_start_pfn;
 
-	mem = find_memory_block_by_id(block_id);
-	if (mem) {
-		put_device(&mem->dev);
-		return -EEXIST;
-	}
-	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
-	if (!mem)
-		return -ENOMEM;
-
-	mem->start_section_nr = block_id * sections_per_block;
-	mem->state = state;
-	mem->nid = NUMA_NO_NODE;
-	mem->nr_vmemmap_pages = nr_vmemmap_pages;
+	for (block_start_pfn = start_pfn; num_pages != 0; block_start_pfn += block_nr_pages) {
+		unsigned long block_id = pfn_to_block_id(block_start_pfn);
 
-	ret = register_memory(mem);
-
-	return ret;
+		mem = find_memory_block_by_id(block_id);
+		if (mem) {
+			put_device(&mem->dev);
+			return -EEXIST;
+		}
+		mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+		if (!mem)
+			return -ENOMEM;
+
+		mem->start_pfn = block_start_pfn;
+		mem->nr_pages = min(num_pages, block_nr_pages);
+		mem->state = state;
+		mem->nid = NUMA_NO_NODE;
+		mem->nr_vmemmap_pages = nr_vmemmap_pages;
+
+		ret = register_memory(mem);
+
+		if (ret) {
+			unsigned long unregister_block_pfn;
+
+			for (unregister_block_pfn = start_pfn;
+			     unregister_block_pfn < block_start_pfn;
+			     unregister_block_pfn += block_nr_pages) {
+				block_id = pfn_to_block_id(unregister_block_pfn);
+				mem = find_memory_block_by_id(block_id);
+				if (WARN_ON_ONCE(!mem))
+					continue;
+				unregister_memory(mem);
+			}
+			return ret;
+		}
+		if (num_pages > block_nr_pages)
+			num_pages -= block_nr_pages;
+		else
+			num_pages = 0;
+	}
+	return 0;
 }
 
-static int add_memory_block(unsigned long base_section_nr)
+static void add_whole_section_memory_block(unsigned long base_section_nr)
 {
-	int section_count = 0;
-	unsigned long nr;
+	int ret;
+	unsigned long start_pfn = section_nr_to_pfn(base_section_nr);
+	unsigned long nr_pages = 0;
+	struct mem_section *ms = __nr_to_section(base_section_nr);
 
-	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
-	     nr++)
-		if (present_section_nr(nr))
-			section_count++;
+	if (bitmap_full(ms->usage->subsection_map, SUBSECTIONS_PER_SECTION))
+		nr_pages = PAGES_PER_SECTION;
+	else
+		nr_pages = PAGES_PER_SUBSECTION *
+			bitmap_weight(ms->usage->subsection_map, SUBSECTIONS_PER_SECTION);
 
-	if (section_count == 0)
-		return 0;
-	return init_memory_block(memory_block_id(base_section_nr),
-				 MEM_ONLINE, 0);
-}
 
-static void unregister_memory(struct memory_block *memory)
-{
-	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
+	if (!nr_pages)
 		return;
 
-	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
-
-	/* drop the ref. we got via find_memory_block() */
-	put_device(&memory->dev);
-	device_unregister(&memory->dev);
+	ret = init_memory_blocks(start_pfn, nr_pages, MEM_ONLINE, 0);
+	if (ret)
+		panic("%s() failed to add memory block: %d\n", __func__,
+		      ret);
 }
 
 /*
@@ -696,31 +714,16 @@  static void unregister_memory(struct memory_block *memory)
 int create_memory_block_devices(unsigned long start, unsigned long size,
 				unsigned long vmemmap_pages)
 {
-	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
-	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
-	struct memory_block *mem;
-	unsigned long block_id;
+	unsigned long start_pfn = PFN_DOWN(start);
+	unsigned long end_pfn = PFN_DOWN(start + size);
 	int ret = 0;
 
 	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 			 !IS_ALIGNED(size, memory_block_size_bytes())))
 		return -EINVAL;
 
-	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
-		if (ret)
-			break;
-	}
-	if (ret) {
-		end_block_id = block_id;
-		for (block_id = start_block_id; block_id != end_block_id;
-		     block_id++) {
-			mem = find_memory_block_by_id(block_id);
-			if (WARN_ON_ONCE(!mem))
-				continue;
-			unregister_memory(mem);
-		}
-	}
+	ret = init_memory_blocks(start_pfn, end_pfn - start_pfn, MEM_OFFLINE, vmemmap_pages);
+
 	return ret;
 }
 
@@ -807,10 +810,7 @@  void __init memory_dev_init(void)
 	 */
 	for (nr = 0; nr <= __highest_present_section_nr;
 	     nr += sections_per_block) {
-		ret = add_memory_block(nr);
-		if (ret)
-			panic("%s() failed to add memory block: %d\n", __func__,
-			      ret);
+		add_whole_section_memory_block(nr);
 	}
 }
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 2c36f61d30bc..76d67b8ddf1b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -809,7 +809,7 @@  static int register_mem_block_under_node_early(struct memory_block *mem_blk,
 					       void *arg)
 {
 	unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
-	unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+	unsigned long start_pfn = mem_blk->start_pfn;
 	unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
 	int nid = *(int *)arg;
 	unsigned long pfn;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 97e92e8b556a..e9590c7c6a9e 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -21,10 +21,15 @@ 
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 
+#if SECTION_SIZE_BITS > 27  /* 128MB */
+#define MIN_MEMORY_BLOCK_SIZE     (1UL << 27)
+#else
 #define MIN_MEMORY_BLOCK_SIZE     (1UL << SECTION_SIZE_BITS)
+#endif
 
 struct memory_block {
-	unsigned long start_section_nr;
+	unsigned long start_pfn;
+	unsigned long nr_pages;
 	unsigned long state;		/* serialized by the dev->lock */
 	int online_type;		/* for passing data to online routine */
 	int nid;			/* NID for this memory block */
@@ -90,7 +95,6 @@  int create_memory_block_devices(unsigned long start, unsigned long size,
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block(struct mem_section *);
 typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
 			      void *arg, walk_memory_blocks_func_t func);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 70620d0dd923..6e93b0ecc5cb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1872,8 +1872,8 @@  static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	if (unlikely(ret)) {
 		phys_addr_t beginpa, endpa;
 
-		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
-		endpa = beginpa + memory_block_size_bytes() - 1;
+		beginpa = PFN_PHYS(mem->start_pfn);
+		endpa = beginpa + mem->nr_pages * PAGE_SIZE - 1;
 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
 			&beginpa, &endpa);
 
@@ -2079,7 +2079,7 @@  static int try_offline_memory_block(struct memory_block *mem, void *arg)
 	 * with multiple zones within one memory block will be rejected
 	 * by offlining code ... so we don't care about that.
 	 */
-	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
+	page = pfn_to_online_page(mem->start_pfn);
 	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
 		online_type = MMOP_ONLINE_MOVABLE;