diff mbox series

[v5,4/7] mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks

Message ID 20230725100212.531277-5-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series Add support for memmap on memory feature on ppc64 | expand

Commit Message

Aneesh Kumar K.V July 25, 2023, 10:02 a.m. UTC
Currently, memmap_on_memory feature is only supported with memory block
sizes that result in vmemmap pages covering full page blocks. This is
because memory onlining/offlining code requires applicable ranges to be
pageblock-aligned, for example, to set the migratetypes properly.

This patch helps to lift that restriction by reserving more pages than
required for vmemmap space. This keeps the start address pageblock
aligned across different memory block sizes. Using this facility implies
that the kernel will reserve some pages in every memory block. This
allows the memmap on memory feature to be useful with a wider range of
memory block sizes.

For example, with a 64K page size and a 256MiB memory block size, we
require 4 pages to map the vmemmap. To align things correctly, we end up
adding a reserve of 28 pages, i.e., for every 4096 pages, 28 pages get
reserved.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../admin-guide/mm/memory-hotplug.rst         |  12 ++
 mm/memory_hotplug.c                           | 121 ++++++++++++++++--
 2 files changed, 119 insertions(+), 14 deletions(-)

Comments

David Hildenbrand July 25, 2023, 6:06 p.m. UTC | #1
On 25.07.23 12:02, Aneesh Kumar K.V wrote:
> Currently, memmap_on_memory feature is only supported with memory block
> sizes that result in vmemmap pages covering full page blocks. This is
> because memory onlining/offlining code requires applicable ranges to be
> pageblock-aligned, for example, to set the migratetypes properly.
> 
> This patch helps to lift that restriction by reserving more pages than
> required for vmemmap space. This helps the start address to be page
> block aligned with different memory block sizes. Using this facility
> implies the kernel will be reserving some pages for every memoryblock.
> This allows the memmap on memory feature to be widely useful with
> different memory block size values.
> 
> For ex: with 64K page size and 256MiB memory block size, we require 4
> pages to map vmemmap pages, To align things correctly we end up adding a
> reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   .../admin-guide/mm/memory-hotplug.rst         |  12 ++
>   mm/memory_hotplug.c                           | 121 ++++++++++++++++--
>   2 files changed, 119 insertions(+), 14 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
> index bd77841041af..2994958c7ce8 100644
> --- a/Documentation/admin-guide/mm/memory-hotplug.rst
> +++ b/Documentation/admin-guide/mm/memory-hotplug.rst
> @@ -433,6 +433,18 @@ The following module parameters are currently defined:
>   				 memory in a way that huge pages in bigger
>   				 granularity cannot be formed on hotplugged
>   				 memory.
> +
> +				 With value "force" it could result in memory
> +				 wastage due to memmap size limitations. For
> +				 example, if the memmap for a memory block
> +				 requires 1 MiB, but the pageblock size is 2
> +				 MiB, 1 MiB of hotplugged memory will be wasted.
> +				 Note that there are still cases where the
> +				 feature cannot be enforced: for example, if the
> +				 memmap is smaller than a single page, or if the
> +				 architecture does not support the forced mode
> +				 in all configurations.
> +
>   ``online_policy``		 read-write: Set the basic policy used for
>   				 automatic zone selection when onlining memory
>   				 blocks without specifying a target zone.
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 457824a6ecb8..5b472e137898 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -41,17 +41,89 @@
>   #include "internal.h"
>   #include "shuffle.h"
>   
> +enum {
> +	MEMMAP_ON_MEMORY_DISABLE = 0,
> +	MEMMAP_ON_MEMORY_ENABLE,
> +	MEMMAP_ON_MEMORY_FORCE,
> +};
> +
> +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
> +
> +static inline unsigned long memory_block_memmap_pages(void)
> +{
> +	unsigned long memmap_size;
> +
> +	memmap_size = PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
> +	return memmap_size >> PAGE_SHIFT;

I'd really move a !page variant (memory_block_memmap_size()) to the 
previous patch and use it in mhp_supports_memmap_on_memory() and 
arch_supports_memmap_on_memory().

Then, in this patch, reuse that function in 
memory_block_memmap_on_memory_pages() and ...

> +}
> +
> +static inline unsigned long memory_block_memmap_on_memory_pages(void)
> +{
> +	unsigned long nr_pages = memory_block_memmap_pages();

... do here a

nr_pages = PHYS_PFN(memory_block_memmap_size());


Conceptually, it would be even cleaner to have here

nr_pages = PFN_UP(memory_block_memmap_size());

even though one can argue that mhp_supports_memmap_on_memory() will make 
sure that the unaligned value (memory_block_memmap_size()) covers full 
pages, but at least to me it looks cleaner that way. No strong opinion.


> +
> +	/*
> +	 * In "forced" memmap_on_memory mode, we add extra pages to align the
> +	 * vmemmap size to cover full pageblocks. That way, we can add memory
> +	 * even if the vmemmap size is not properly aligned, however, we might waste
> +	 * memory.
> +	 */
> +	if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
> +		return pageblock_align(nr_pages);
> +	return nr_pages;
> +}
> +
>   #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
>   /*
>    * memory_hotplug.memmap_on_memory parameter
>    */
> -static bool memmap_on_memory __ro_after_init;
> -module_param(memmap_on_memory, bool, 0444);
> -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
> +static int set_memmap_mode(const char *val, const struct kernel_param *kp)
> +{
> +	int ret, mode;
> +	bool enabled;
> +
> +	if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
> +		mode =  MEMMAP_ON_MEMORY_FORCE;
> +		goto matched;
> +	}

Avoid the goto + label

} else {
	ret = kstrtobool(val, &enabled);
	...
}

*((int *)kp->arg) =  mode;

> +
> +	ret = kstrtobool(val, &enabled);
> +	if (ret < 0)
> +		return ret;
> +	if (enabled)
> +		mode =  MEMMAP_ON_MEMORY_ENABLE;
> +	else
> +		mode =  MEMMAP_ON_MEMORY_DISABLE;
> +
> +matched:
> +	*((int *)kp->arg) =  mode;
> +	if (mode == MEMMAP_ON_MEMORY_FORCE) {
> +		unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
> +
> +		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
> +			memmap_pages - memory_block_memmap_pages());

pr_info_once() ?

> +	}
> +	return 0;
> +}
> +

[...]

>   	/*
>   	 * Besides having arch support and the feature enabled at runtime, we
> @@ -1294,10 +1366,28 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
>   	 *       altmap as an alternative source of memory, and we do not exactly
>   	 *       populate a single PMD.
>   	 */
> -	return mhp_memmap_on_memory() &&
> -	       size == memory_block_size_bytes() &&
> -	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
> -	       arch_supports_memmap_on_memory(size);
> +	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
> +		return false;
> +
> +	/*
> +	 * Make sure the vmemmap allocation is fully contained
> +	 * so that we always allocate vmemmap memory from altmap area.
> +	 */
> +	if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE))
> +		return false;
> +
> +	/*
> +	 * start pfn should be pageblock_nr_pages aligned for correctly
> +	 * setting migrate types
> +	 */
> +	if (!pageblock_aligned(memmap_pages))
> +		return false;
> +
> +	if (memmap_pages == PHYS_PFN(memory_block_size_bytes()))
> +		/* No effective hotplugged memory doesn't make sense. */
> +		return false;
> +
> +	return arch_supports_memmap_on_memory(size);
>   }
>   
>   /*
> @@ -1310,7 +1400,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>   {
>   	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>   	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
> -	struct vmem_altmap mhp_altmap = {};
> +	struct vmem_altmap mhp_altmap = {
> +		.base_pfn =  PHYS_PFN(res->start),
> +		.end_pfn  =  PHYS_PFN(res->end),

Is it required to set .end_pfn, and if so, shouldn't we also set it to 
base_pfn + memory_block_memmap_on_memory_pages()) ?

We also don't set it in the try_remove_memory() part.



With these things addressed, feel free to add

Acked-by: David Hildenbrand <david@redhat.com>
Aneesh Kumar K.V July 26, 2023, 4:25 a.m. UTC | #2
David Hildenbrand <david@redhat.com> writes:

> On 25.07.23 12:02, Aneesh Kumar K.V wrote:
>> Currently, memmap_on_memory feature is only supported with memory block
>> sizes that result in vmemmap pages covering full page blocks. This is
>> because memory onlining/offlining code requires applicable ranges to be
>> pageblock-aligned, for example, to set the migratetypes properly.
>> 
>> This patch helps to lift that restriction by reserving more pages than
>> required for vmemmap space. This helps the start address to be page
>> block aligned with different memory block sizes. Using this facility
>> implies the kernel will be reserving some pages for every memoryblock.
>> This allows the memmap on memory feature to be widely useful with
>> different memory block size values.
>> 
>> For ex: with 64K page size and 256MiB memory block size, we require 4
>> pages to map vmemmap pages, To align things correctly we end up adding a
>> reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   .../admin-guide/mm/memory-hotplug.rst         |  12 ++
>>   mm/memory_hotplug.c                           | 121 ++++++++++++++++--
>>   2 files changed, 119 insertions(+), 14 deletions(-)
>> 
>> diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
>> index bd77841041af..2994958c7ce8 100644
>> --- a/Documentation/admin-guide/mm/memory-hotplug.rst
>> +++ b/Documentation/admin-guide/mm/memory-hotplug.rst
>> @@ -433,6 +433,18 @@ The following module parameters are currently defined:
>>   				 memory in a way that huge pages in bigger
>>   				 granularity cannot be formed on hotplugged
>>   				 memory.
>> +
>> +				 With value "force" it could result in memory
>> +				 wastage due to memmap size limitations. For
>> +				 example, if the memmap for a memory block
>> +				 requires 1 MiB, but the pageblock size is 2
>> +				 MiB, 1 MiB of hotplugged memory will be wasted.
>> +				 Note that there are still cases where the
>> +				 feature cannot be enforced: for example, if the
>> +				 memmap is smaller than a single page, or if the
>> +				 architecture does not support the forced mode
>> +				 in all configurations.
>> +
>>   ``online_policy``		 read-write: Set the basic policy used for
>>   				 automatic zone selection when onlining memory
>>   				 blocks without specifying a target zone.
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 457824a6ecb8..5b472e137898 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -41,17 +41,89 @@
>>   #include "internal.h"
>>   #include "shuffle.h"
>>   
>> +enum {
>> +	MEMMAP_ON_MEMORY_DISABLE = 0,
>> +	MEMMAP_ON_MEMORY_ENABLE,
>> +	MEMMAP_ON_MEMORY_FORCE,
>> +};
>> +
>> +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
>> +
>> +static inline unsigned long memory_block_memmap_pages(void)
>> +{
>> +	unsigned long memmap_size;
>> +
>> +	memmap_size = PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
>> +	return memmap_size >> PAGE_SHIFT;
>
> I'd really move a !page variant (memory_block_memmap_size()) to the 
> previous patch and use it in mhp_supports_memmap_on_memory() and 
> arch_supports_memmap_on_memory().
>
> Then, in this patch, reuse that function in 
> memory_block_memmap_on_memory_pages() and ...
>
>> +}
>> +
>> +static inline unsigned long memory_block_memmap_on_memory_pages(void)
>> +{
>> +	unsigned long nr_pages = memory_block_memmap_pages();
>
> ... do here a
>
> nr_pages = PHYS_PFN(memory_block_memmap_size());
>
>
> Conceptually, it would be even cleaner to have here
>
> nr_pages = PFN_UP(memory_block_memmap_size());
>
> even though one can argue that mhp_supports_memmap_on_memory() will make 
> sure that the unaligned value (memory_block_memmap_size()) covers full 
> pages, but at least to me it looks cleaner that way. No strong opinion.
>
>
>> +
>> +	/*
>> +	 * In "forced" memmap_on_memory mode, we add extra pages to align the
>> +	 * vmemmap size to cover full pageblocks. That way, we can add memory
>> +	 * even if the vmemmap size is not properly aligned, however, we might waste
>> +	 * memory.
>> +	 */
>> +	if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
>> +		return pageblock_align(nr_pages);
>> +	return nr_pages;
>> +}
>> +
>>   #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
>>   /*
>>    * memory_hotplug.memmap_on_memory parameter
>>    */
>> -static bool memmap_on_memory __ro_after_init;
>> -module_param(memmap_on_memory, bool, 0444);
>> -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
>> +static int set_memmap_mode(const char *val, const struct kernel_param *kp)
>> +{
>> +	int ret, mode;
>> +	bool enabled;
>> +
>> +	if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
>> +		mode =  MEMMAP_ON_MEMORY_FORCE;
>> +		goto matched;
>> +	}
>
> Avoid the goto + label
>
> } else {
> 	ret = kstrtobool(val, &enabled);
> 	...
> }
>
> *((int *)kp->arg) =  mode;
>
>> +
>> +	ret = kstrtobool(val, &enabled);
>> +	if (ret < 0)
>> +		return ret;
>> +	if (enabled)
>> +		mode =  MEMMAP_ON_MEMORY_ENABLE;
>> +	else
>> +		mode =  MEMMAP_ON_MEMORY_DISABLE;
>> +
>> +matched:
>> +	*((int *)kp->arg) =  mode;
>> +	if (mode == MEMMAP_ON_MEMORY_FORCE) {
>> +		unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
>> +
>> +		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
>> +			memmap_pages - memory_block_memmap_pages());
>
> pr_info_once() ?
>
>> +	}
>> +	return 0;
>> +}
>> +
>
> [...]
>
>>   	/*
>>   	 * Besides having arch support and the feature enabled at runtime, we
>> @@ -1294,10 +1366,28 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
>>   	 *       altmap as an alternative source of memory, and we do not exactly
>>   	 *       populate a single PMD.
>>   	 */
>> -	return mhp_memmap_on_memory() &&
>> -	       size == memory_block_size_bytes() &&
>> -	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
>> -	       arch_supports_memmap_on_memory(size);
>> +	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
>> +		return false;
>> +
>> +	/*
>> +	 * Make sure the vmemmap allocation is fully contained
>> +	 * so that we always allocate vmemmap memory from altmap area.
>> +	 */
>> +	if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE))
>> +		return false;
>> +
>> +	/*
>> +	 * start pfn should be pageblock_nr_pages aligned for correctly
>> +	 * setting migrate types
>> +	 */
>> +	if (!pageblock_aligned(memmap_pages))
>> +		return false;
>> +
>> +	if (memmap_pages == PHYS_PFN(memory_block_size_bytes()))
>> +		/* No effective hotplugged memory doesn't make sense. */
>> +		return false;
>> +
>> +	return arch_supports_memmap_on_memory(size);
>>   }
>>   
>>   /*
>> @@ -1310,7 +1400,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>   {
>>   	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>   	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>> -	struct vmem_altmap mhp_altmap = {};
>> +	struct vmem_altmap mhp_altmap = {
>> +		.base_pfn =  PHYS_PFN(res->start),
>> +		.end_pfn  =  PHYS_PFN(res->end),
>
> Is it required to set .end_pfn, and if so, shouldn't we also set it to 
> base_pfn + memory_block_memmap_on_memory_pages()) ?
>

We use that in ppc64 for checking the altmap boundary condition. As we
discussed earlier, ppc64, due to vmemmap mapping size restrictions, can't
always allocate vmemmap pages from the altmap area even if requested. We
fall back to regular memory allocation in that case (only used now with
pmem). We use altmap.end_pfn for that boundary check. You can refer to
altmap_cross_boundary() for more details.

>
> We also don't set it on the try_remove_memory() part,.
>

We use altmap.end_pfn only in the allocation path. This confusion is
another motivation to embed the altmap in the memory block structure, as
done in patch 6.

>
>
> With these things addressed, feel free to add
>
> Acked-by: David Hildenbrand <david@redhat.com>
>

Thanks for all your help with review. Attaching the diff against
previous version taking care of the feedback above.

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 7e46369d57c9..2209d66a034e 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -176,11 +176,8 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
  * alignment requirement is met using altmap->reserve blocks.
  */
 #define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
-static inline bool arch_supports_memmap_on_memory(unsigned long size)
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
 {
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-	unsigned long vmemmap_size = nr_pages * sizeof(struct page);
-
 	if (!radix_enabled())
 		return false;
 	/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1c9069cc0263..cadf4209d7c4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -49,17 +49,14 @@ enum {
 
 static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
 
-static inline unsigned long memory_block_memmap_pages(void)
+static inline unsigned long memory_block_memmap_size(void)
 {
-	unsigned long memmap_size;
-
-	memmap_size = PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
-	return memmap_size >> PAGE_SHIFT;
+	return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
 }
 
 static inline unsigned long memory_block_memmap_on_memory_pages(void)
 {
-	unsigned long nr_pages = memory_block_memmap_pages();
+	unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
 
 	/*
 	 * In "forced" memmap_on_memory mode, we add extra pages to align the
@@ -83,18 +80,15 @@ static int set_memmap_mode(const char *val, const struct kernel_param *kp)
 
 	if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
 		mode =  MEMMAP_ON_MEMORY_FORCE;
-		goto matched;
+	} else {
+		ret = kstrtobool(val, &enabled);
+		if (ret < 0)
+			return ret;
+		if (enabled)
+			mode = MEMMAP_ON_MEMORY_ENABLE;
+		else
+			mode = MEMMAP_ON_MEMORY_DISABLE;
 	}
-
-	ret = kstrtobool(val, &enabled);
-	if (ret < 0)
-		return ret;
-	if (enabled)
-		mode =  MEMMAP_ON_MEMORY_ENABLE;
-	else
-		mode =  MEMMAP_ON_MEMORY_DISABLE;
-
-matched:
 	/*
 	 * Avoid changing memmap mode during hotplug.
 	 */
@@ -104,8 +98,8 @@ static int set_memmap_mode(const char *val, const struct kernel_param *kp)
 	if (mode == MEMMAP_ON_MEMORY_FORCE) {
 		unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
 
-		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
-			memmap_pages - memory_block_memmap_pages());
+		pr_info_once("Memory hotplug will reserve %ld pages in each memory block\n",
+			     memmap_pages - PFN_UP(memory_block_memmap_size()));
 	}
 	return 0;
 }
@@ -1327,11 +1321,8 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 }
 
 #ifndef arch_supports_memmap_on_memory
-static inline bool arch_supports_memmap_on_memory(unsigned long size)
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
 {
-	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
-	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
-
 	/*
 	 * As default, we want the vmemmap to span a complete PMD such that we
 	 * can map the vmemmap using a single PMD if supported by the
@@ -1343,8 +1334,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long size)
 
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
-	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+	unsigned long vmemmap_size = memory_block_memmap_size();
 	unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
 
 	/*
@@ -1394,7 +1384,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
 		/* No effective hotplugged memory doesn't make sense. */
 		return false;
 
-	return arch_supports_memmap_on_memory(size);
+	return arch_supports_memmap_on_memory(vmemmap_size);
 }
 
 /*
David Hildenbrand July 26, 2023, 9:04 a.m. UTC | #3
>>>    /*
>>> @@ -1310,7 +1400,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>>    {
>>>    	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>>    	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>>> -	struct vmem_altmap mhp_altmap = {};
>>> +	struct vmem_altmap mhp_altmap = {
>>> +		.base_pfn =  PHYS_PFN(res->start),
>>> +		.end_pfn  =  PHYS_PFN(res->end),
>>
>> Is it required to set .end_pfn, and if so, shouldn't we also set it to
>> base_pfn + memory_block_memmap_on_memory_pages()) ?
>>
> 
> We use that in ppc64 for checking altmap boundary condition. As we
> discussed earlier, ppc64 due to vmemmap mapping size restrictions can't
> always allocate vmemmap pages from altmap area even if requested. We
> fallback to regular memory alocation in that case (only used now with
> pmem). We use altmap.end_pfn for that boundary check. You can refer to
> altmap_cross_boundary() for more details.

But even then, setting the end to the end of the resource size is wrong, 
no? We don't want anybody to allocate beyond base_pfn + 
memory_block_memmap_on_memory_pages().


Apart from that, LGTM.
David Hildenbrand July 26, 2023, 4:39 p.m. UTC | #4
On 26.07.23 11:57, Aneesh Kumar K V wrote:
> On 7/26/23 2:34 PM, David Hildenbrand wrote:
>>
>>>>>     /*
>>>>> @@ -1310,7 +1400,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>>>>>     {
>>>>>         struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>>>>         enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>>>>> -    struct vmem_altmap mhp_altmap = {};
>>>>> +    struct vmem_altmap mhp_altmap = {
>>>>> +        .base_pfn =  PHYS_PFN(res->start),
>>>>> +        .end_pfn  =  PHYS_PFN(res->end),
>>>>
>>>> Is it required to set .end_pfn, and if so, shouldn't we also set it to
>>>> base_pfn + memory_block_memmap_on_memory_pages()) ?
>>>>
>>>
>>> We use that in ppc64 for checking altmap boundary condition. As we
>>> discussed earlier, ppc64 due to vmemmap mapping size restrictions can't
>>> always allocate vmemmap pages from altmap area even if requested. We
>>> fallback to regular memory alocation in that case (only used now with
>>> pmem). We use altmap.end_pfn for that boundary check. You can refer to
>>> altmap_cross_boundary() for more details.
>>
>> But even then, setting the end to the end of the resource size is wrong, no? We don't want anybody to allocate beyond base_pfn + memory_block_memmap_on_memory_pages().
>>
> 
> altmap.end is the end pfn of the resource

Oh, thanks for pointing that out. I wonder why the altmap even has to 
care about that ...
diff mbox series

Patch

diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index bd77841041af..2994958c7ce8 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -433,6 +433,18 @@  The following module parameters are currently defined:
 				 memory in a way that huge pages in bigger
 				 granularity cannot be formed on hotplugged
 				 memory.
+
+				 With value "force" it could result in memory
+				 wastage due to memmap size limitations. For
+				 example, if the memmap for a memory block
+				 requires 1 MiB, but the pageblock size is 2
+				 MiB, 1 MiB of hotplugged memory will be wasted.
+				 Note that there are still cases where the
+				 feature cannot be enforced: for example, if the
+				 memmap is smaller than a single page, or if the
+				 architecture does not support the forced mode
+				 in all configurations.
+
 ``online_policy``		 read-write: Set the basic policy used for
 				 automatic zone selection when onlining memory
 				 blocks without specifying a target zone.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 457824a6ecb8..5b472e137898 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,89 @@ 
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+	MEMMAP_ON_MEMORY_DISABLE = 0,
+	MEMMAP_ON_MEMORY_ENABLE,
+	MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_pages(void)
+{
+	unsigned long memmap_size;
+
+	memmap_size = PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+	return memmap_size >> PAGE_SHIFT;
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+	unsigned long nr_pages = memory_block_memmap_pages();
+
+	/*
+	 * In "forced" memmap_on_memory mode, we add extra pages to align the
+	 * vmemmap size to cover full pageblocks. That way, we can add memory
+	 * even if the vmemmap size is not properly aligned, however, we might waste
+	 * memory.
+	 */
+	if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+		return pageblock_align(nr_pages);
+	return nr_pages;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+	int ret, mode;
+	bool enabled;
+
+	if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+		mode =  MEMMAP_ON_MEMORY_FORCE;
+		goto matched;
+	}
+
+	ret = kstrtobool(val, &enabled);
+	if (ret < 0)
+		return ret;
+	if (enabled)
+		mode =  MEMMAP_ON_MEMORY_ENABLE;
+	else
+		mode =  MEMMAP_ON_MEMORY_DISABLE;
+
+matched:
+	*((int *)kp->arg) =  mode;
+	if (mode == MEMMAP_ON_MEMORY_FORCE) {
+		unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
+
+		pr_info("Memory hotplug will reserve %ld pages in each memory block\n",
+			memmap_pages - memory_block_memmap_pages());
+	}
+	return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+	if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
+		return sprintf(buffer,  "force\n");
+	return param_get_bool(buffer, kp);
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+	.set = set_memmap_mode,
+	.get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+		 "With value \"force\" it could result in memory wastage due "
+		 "to memmap size limitations (Y/N/force)");
 
 static inline bool mhp_memmap_on_memory(void)
 {
-	return memmap_on_memory;
+	return memmap_mode != MEMMAP_ON_MEMORY_DISABLE;
 }
 #else
 static inline bool mhp_memmap_on_memory(void)
@@ -1266,7 +1338,7 @@  static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
 	unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
 	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
-	unsigned long remaining_size = size - vmemmap_size;
+	unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
 
 	/*
 	 * Besides having arch support and the feature enabled at runtime, we
@@ -1294,10 +1366,28 @@  static bool mhp_supports_memmap_on_memory(unsigned long size)
 	 *       altmap as an alternative source of memory, and we do not exactly
 	 *       populate a single PMD.
 	 */
-	return mhp_memmap_on_memory() &&
-	       size == memory_block_size_bytes() &&
-	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
-	       arch_supports_memmap_on_memory(size);
+	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+		return false;
+
+	/*
+	 * Make sure the vmemmap allocation is fully contained
+	 * so that we always allocate vmemmap memory from altmap area.
+	 */
+	if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE))
+		return false;
+
+	/*
+	 * start pfn should be pageblock_nr_pages aligned for correctly
+	 * setting migrate types
+	 */
+	if (!pageblock_aligned(memmap_pages))
+		return false;
+
+	if (memmap_pages == PHYS_PFN(memory_block_size_bytes()))
+		/* No effective hotplugged memory doesn't make sense. */
+		return false;
+
+	return arch_supports_memmap_on_memory(size);
 }
 
 /*
@@ -1310,7 +1400,10 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
 	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
 	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
-	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap mhp_altmap = {
+		.base_pfn =  PHYS_PFN(res->start),
+		.end_pfn  =  PHYS_PFN(res->end),
+	};
 	struct memory_group *group = NULL;
 	u64 start, size;
 	bool new_node = false;
@@ -1355,8 +1448,7 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 */
 	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
 		if (mhp_supports_memmap_on_memory(size)) {
-			mhp_altmap.free = PHYS_PFN(size);
-			mhp_altmap.base_pfn = PHYS_PFN(start);
+			mhp_altmap.free = memory_block_memmap_on_memory_pages();
 			params.altmap = &mhp_altmap;
 		}
 		/* fallback to not using altmap  */
@@ -1368,8 +1460,7 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		goto error;
 
 	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
-					  group);
+	ret = create_memory_block_devices(start, size, mhp_altmap.free, group);
 	if (ret) {
 		arch_remove_memory(start, size, NULL);
 		goto error;
@@ -2095,6 +2186,8 @@  static int __ref try_remove_memory(u64 start, u64 size)
 			 * right thing if we used vmem_altmap when hot-adding
 			 * the range.
 			 */
+			mhp_altmap.base_pfn = PHYS_PFN(start);
+			mhp_altmap.free = nr_vmemmap_pages;
 			mhp_altmap.alloc = nr_vmemmap_pages;
 			altmap = &mhp_altmap;
 		}