diff mbox series

[v2,08/22] memblock: Introduce a generic phys_addr_to_target_node()

Message ID 159457120893.754248.7783260004248722175.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Superseded
Headers show
Series device-dax: Support sub-dividing soft-reserved ranges | expand

Commit Message

Dan Williams July 12, 2020, 4:26 p.m. UTC
Similar to how generic memory_add_physaddr_to_nid() interrogates
memblock data for numa information, introduce
get_reserved_pfn_range_from_nid() to enable the same operation for
reserved memory ranges. Example memory ranges that are reserved, but
still have associated numa-info are persistent memory or Soft Reserved
(EFI_MEMORY_SP) memory.

Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Jia He <justin.he@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/memblock.h |    4 +++
 include/linux/mm.h       |    2 +
 include/linux/numa.h     |    2 +
 mm/memblock.c            |   22 ++++++++++++++--
 mm/page_alloc.c          |   63 +++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 87 insertions(+), 6 deletions(-)

Comments

Mike Rapoport July 13, 2020, 7:03 a.m. UTC | #1
Hi Dan,

On Sun, Jul 12, 2020 at 09:26:48AM -0700, Dan Williams wrote:
> Similar to how generic memory_add_physaddr_to_nid() interrogates
> memblock data for numa information, introduce
> get_reserved_pfn_range_from_nid() to enable the same operation for
> reserved memory ranges. Example memory ranges that are reserved, but
> still have associated numa-info are persistent memory or Soft Reserved
> (EFI_MEMORY_SP) memory.

Here again, I would prefer to add a weak default for
phys_to_target_node() because the "generic" implementation is not really
generic.

The fallback to reserved ranges is x86 specfic because on x86 most of the
reserved areas is not in memblock.memory. AFAIK, no other architecture
does this.

And x86 anyway has implementation of phys_to_target_node().

> Cc: Mike Rapoport <rppt@linux.ibm.com>
> Cc: Jia He <justin.he@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  include/linux/memblock.h |    4 +++
>  include/linux/mm.h       |    2 +
>  include/linux/numa.h     |    2 +
>  mm/memblock.c            |   22 ++++++++++++++--
>  mm/page_alloc.c          |   63 +++++++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 87 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 017fae833d4a..0655e8376c72 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -234,6 +234,10 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
>  	for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
>  	     i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
>  
> +#define for_each_reserved_pfn_range(i, nid, p_start, p_end, p_nid)		\
> +	for (i = -1, __next_reserved_pfn_range(&i, nid, p_start, p_end, p_nid); \
> +	     i >= 0; __next_reserved_pfn_range(&i, nid, p_start, p_end, p_nid))
> +
>  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
>  void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
>  				  unsigned long *out_spfn,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1e76ee5da20b..82dac9f42c46 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2438,6 +2438,8 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
>  
>  extern void get_pfn_range_for_nid(unsigned int nid,
>  			unsigned long *start_pfn, unsigned long *end_pfn);
> +extern void get_reserved_pfn_range_for_nid(unsigned int nid,
> +			unsigned long *start_pfn, unsigned long *end_pfn);
>  extern unsigned long find_min_pfn_with_active_regions(void);
>  extern void sparse_memory_present_with_active_regions(int nid);
>  
> diff --git a/include/linux/numa.h b/include/linux/numa.h
> index 5d25c5de1322..52b2430bc759 100644
> --- a/include/linux/numa.h
> +++ b/include/linux/numa.h
> @@ -19,7 +19,7 @@ int numa_map_to_online_node(int node);
>  
>  /*
>   * Optional architecture specific implementation, users need a "depends
> - * on $ARCH"
> + * on $ARCH" or depends on CONFIG_MEMBLOCK_NUMA_INFO
>   */
>  int phys_to_target_node(phys_addr_t addr);
>  #else
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 39aceafc57f6..43c3abab705e 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -1200,11 +1200,11 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
>  /*
>   * Common iterator interface used to define for_each_mem_pfn_range().
>   */
> -void __init_memblock __next_mem_pfn_range(int *idx, int nid,
> +static void __init_memblock __next_memblock_pfn_range(int *idx, int nid,
>  				unsigned long *out_start_pfn,
> -				unsigned long *out_end_pfn, int *out_nid)
> +				unsigned long *out_end_pfn, int *out_nid,
> +				struct memblock_type *type)
>  {
> -	struct memblock_type *type = &memblock.memory;
>  	struct memblock_region *r;
>  	int r_nid;
>  
> @@ -1230,6 +1230,22 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
>  		*out_nid = r_nid;
>  }
>  
> +void __init_memblock __next_mem_pfn_range(int *idx, int nid,
> +				unsigned long *out_start_pfn,
> +				unsigned long *out_end_pfn, int *out_nid)
> +{
> +	__next_memblock_pfn_range(idx, nid, out_start_pfn, out_end_pfn, out_nid,
> +				  &memblock.memory);
> +}
> +
> +void __init_memblock __next_reserved_pfn_range(int *idx, int nid,
> +				unsigned long *out_start_pfn,
> +				unsigned long *out_end_pfn, int *out_nid)
> +{
> +	__next_memblock_pfn_range(idx, nid, out_start_pfn, out_end_pfn, out_nid,
> +				  &memblock.reserved);
> +}
> +
>  /**
>   * memblock_set_node - set node ID on memblock regions
>   * @base: base of area to set node ID for
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index df8bd169dbb4..94ad77c0c338 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6369,12 +6369,39 @@ void __init_or_memblock get_pfn_range_for_nid(unsigned int nid,
>  		*start_pfn = 0;
>  }
>  
> +/**
> + * get_reserved_pfn_range_for_nid - Return the start and end page frames for a node
> + * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
> + * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
> + * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
> + *
> + * Mostly identical to get_pfn_range_for_nid() except it operates on
> + * reserved ranges rather than online memory.
> + */
> +void __init_or_memblock get_reserved_pfn_range_for_nid(unsigned int nid,
> +			unsigned long *start_pfn, unsigned long *end_pfn)
> +{
> +	unsigned long this_start_pfn, this_end_pfn;
> +	int i;
> +
> +	*start_pfn = -1UL;
> +	*end_pfn = 0;
> +
> +	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
> +		*start_pfn = min(*start_pfn, this_start_pfn);
> +		*end_pfn = max(*end_pfn, this_end_pfn);
> +	}
> +
> +	if (*start_pfn == -1UL)
> +		*start_pfn = 0;
> +}
> +
>  /*
>   * Generic implementation of memory_add_physaddr_to_nid() depends on
>   * architecture using memblock data for numa information.
>   */
>  #ifdef CONFIG_MEMBLOCK_NUMA_INFO
> -int __init_or_memblock memory_add_physaddr_to_nid(u64 addr)
> +static int __init_or_memblock __memory_add_physaddr_to_nid(u64 addr)
>  {
>  	unsigned long start_pfn, end_pfn, pfn = PHYS_PFN(addr);
>  	int nid;
> @@ -6384,10 +6411,42 @@ int __init_or_memblock memory_add_physaddr_to_nid(u64 addr)
>  		if (pfn >= start_pfn && pfn <= end_pfn)
>  			return nid;
>  	}
> +	return NUMA_NO_NODE;
> +}
> +
> +int __init_or_memblock memory_add_physaddr_to_nid(u64 addr)
> +{
> +	int nid = __memory_add_physaddr_to_nid(addr);
> +
>  	/* Default to node0 as not all callers are prepared for this to fail */
> -	return 0;
> +	if (nid == NUMA_NO_NODE)
> +		return 0;
> +	return nid;
>  }
>  EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
> +
> +int __init_or_memblock phys_to_target_node(u64 addr)
> +{
> +	unsigned long start_pfn, end_pfn, pfn = PHYS_PFN(addr);
> +	int nid = __memory_add_physaddr_to_nid(addr);
> +
> +	if (nid != NUMA_NO_NODE)
> +		return nid;
> +
> +	/*
> +	 * Search reserved memory ranges since the memory address does
> +	 * not appear to be online
> +	 */
> +	for_each_possible_node(nid) {
> +		if (node_online(nid))
> +			continue;
> +		get_reserved_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
> +		if (pfn >= start_pfn && pfn <= end_pfn)
> +			return nid;
> +	}
> +	return NUMA_NO_NODE;
> +}
> +EXPORT_SYMBOL_GPL(phys_to_target_node);
>  #endif /* CONFIG_MEMBLOCK_NUMA_INFO */
>  
>  /*
>
Dan Williams July 13, 2020, 3:48 p.m. UTC | #2
On Mon, Jul 13, 2020 at 12:04 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
>
> Hi Dan,
>
> On Sun, Jul 12, 2020 at 09:26:48AM -0700, Dan Williams wrote:
> > Similar to how generic memory_add_physaddr_to_nid() interrogates
> > memblock data for numa information, introduce
> > get_reserved_pfn_range_from_nid() to enable the same operation for
> > reserved memory ranges. Example memory ranges that are reserved, but
> > still have associated numa-info are persistent memory or Soft Reserved
> > (EFI_MEMORY_SP) memory.
>
> Here again, I would prefer to add a weak default for
> phys_to_target_node() because the "generic" implementation is not really
> generic.
>
> The fallback to reserved ranges is x86 specfic because on x86 most of the
> reserved areas is not in memblock.memory. AFAIK, no other architecture
> does this.

True, I was pre-fetching ARM using the new EFI "Special Purpose"
memory attribute. However, until that becomes something that platforms
deploy in practice I'm ok with not solving that problem for now.

> And x86 anyway has implementation of phys_to_target_node().

Sure, let's go with the default stub for non-x86.

Justin, do you think it would make sense to fold your dax_kmem
enabling for arm64 series into my enabling of dax_hmem for all
memory-hotplug archs?
Jia He July 14, 2020, 1:36 a.m. UTC | #3
Hi Dan

> -----Original Message-----
> From: Dan Williams <dan.j.williams@intel.com>
> Sent: Monday, July 13, 2020 11:48 PM
> To: Mike Rapoport <rppt@linux.ibm.com>
> Cc: linux-nvdimm <linux-nvdimm@lists.01.org>; Justin He
> <Justin.He@arm.com>; Will Deacon <will@kernel.org>; David Hildenbrand
> <david@redhat.com>; Andrew Morton <akpm@linux-foundation.org>; Peter
> Zijlstra <peterz@infradead.org>; Vishal L Verma <vishal.l.verma@intel.com>;
> Dave Hansen <dave.hansen@linux.intel.com>; Ard Biesheuvel
> <ard.biesheuvel@linaro.org>; Linux MM <linux-mm@kvack.org>; Linux Kernel
> Mailing List <linux-kernel@vger.kernel.org>; Linux ACPI <linux-
> acpi@vger.kernel.org>; Christoph Hellwig <hch@lst.de>; Joao Martins
> <joao.m.martins@oracle.com>
> Subject: Re: [PATCH v2 08/22] memblock: Introduce a generic
> phys_addr_to_target_node()
>
> On Mon, Jul 13, 2020 at 12:04 AM Mike Rapoport <rppt@linux.ibm.com> wrote:
> >
> > Hi Dan,
> >
> > On Sun, Jul 12, 2020 at 09:26:48AM -0700, Dan Williams wrote:
> > > Similar to how generic memory_add_physaddr_to_nid() interrogates
> > > memblock data for numa information, introduce
> > > get_reserved_pfn_range_from_nid() to enable the same operation for
> > > reserved memory ranges. Example memory ranges that are reserved, but
> > > still have associated numa-info are persistent memory or Soft Reserved
> > > (EFI_MEMORY_SP) memory.
> >
> > Here again, I would prefer to add a weak default for
> > phys_to_target_node() because the "generic" implementation is not really
> > generic.
> >
> > The fallback to reserved ranges is x86 specfic because on x86 most of
> the
> > reserved areas is not in memblock.memory. AFAIK, no other architecture
> > does this.
>
> True, I was pre-fetching ARM using the new EFI "Special Purpose"
> memory attribute. However, until that becomes something that platforms
> deploy in practice I'm ok with not solving that problem for now.
>
> > And x86 anyway has implementation of phys_to_target_node().
>
> Sure, let's go with the default stub for non-x86.
>
> Justin, do you think it would make sense to fold your dax_kmem
> enabling for arm64 series into my enabling of dax_hmem for all
> memory-hotplug archs?

It is ok with me, thanks for the folding 
diff mbox series

Patch

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 017fae833d4a..0655e8376c72 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -234,6 +234,10 @@  void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 	for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
 	     i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
 
+#define for_each_reserved_pfn_range(i, nid, p_start, p_end, p_nid)		\
+	for (i = -1, __next_reserved_pfn_range(&i, nid, p_start, p_end, p_nid); \
+	     i >= 0; __next_reserved_pfn_range(&i, nid, p_start, p_end, p_nid))
+
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
 				  unsigned long *out_spfn,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1e76ee5da20b..82dac9f42c46 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2438,6 +2438,8 @@  extern unsigned long absent_pages_in_range(unsigned long start_pfn,
 
 extern void get_pfn_range_for_nid(unsigned int nid,
 			unsigned long *start_pfn, unsigned long *end_pfn);
+extern void get_reserved_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn);
 extern unsigned long find_min_pfn_with_active_regions(void);
 extern void sparse_memory_present_with_active_regions(int nid);
 
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 5d25c5de1322..52b2430bc759 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -19,7 +19,7 @@  int numa_map_to_online_node(int node);
 
 /*
  * Optional architecture specific implementation, users need a "depends
- * on $ARCH"
+ * on $ARCH" or depends on CONFIG_MEMBLOCK_NUMA_INFO
  */
 int phys_to_target_node(phys_addr_t addr);
 #else
diff --git a/mm/memblock.c b/mm/memblock.c
index 39aceafc57f6..43c3abab705e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1200,11 +1200,11 @@  void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
 /*
  * Common iterator interface used to define for_each_mem_pfn_range().
  */
-void __init_memblock __next_mem_pfn_range(int *idx, int nid,
+static void __init_memblock __next_memblock_pfn_range(int *idx, int nid,
 				unsigned long *out_start_pfn,
-				unsigned long *out_end_pfn, int *out_nid)
+				unsigned long *out_end_pfn, int *out_nid,
+				struct memblock_type *type)
 {
-	struct memblock_type *type = &memblock.memory;
 	struct memblock_region *r;
 	int r_nid;
 
@@ -1230,6 +1230,22 @@  void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r_nid;
 }
 
+void __init_memblock __next_mem_pfn_range(int *idx, int nid,
+				unsigned long *out_start_pfn,
+				unsigned long *out_end_pfn, int *out_nid)
+{
+	__next_memblock_pfn_range(idx, nid, out_start_pfn, out_end_pfn, out_nid,
+				  &memblock.memory);
+}
+
+void __init_memblock __next_reserved_pfn_range(int *idx, int nid,
+				unsigned long *out_start_pfn,
+				unsigned long *out_end_pfn, int *out_nid)
+{
+	__next_memblock_pfn_range(idx, nid, out_start_pfn, out_end_pfn, out_nid,
+				  &memblock.reserved);
+}
+
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df8bd169dbb4..94ad77c0c338 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6369,12 +6369,39 @@  void __init_or_memblock get_pfn_range_for_nid(unsigned int nid,
 		*start_pfn = 0;
 }
 
+/**
+ * get_reserved_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * Mostly identical to get_pfn_range_for_nid() except it operates on
+ * reserved ranges rather than online memory.
+ */
+void __init_or_memblock get_reserved_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned long this_start_pfn, this_end_pfn;
+	int i;
+
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+		*start_pfn = min(*start_pfn, this_start_pfn);
+		*end_pfn = max(*end_pfn, this_end_pfn);
+	}
+
+	if (*start_pfn == -1UL)
+		*start_pfn = 0;
+}
+
 /*
  * Generic implementation of memory_add_physaddr_to_nid() depends on
  * architecture using memblock data for numa information.
  */
 #ifdef CONFIG_MEMBLOCK_NUMA_INFO
-int __init_or_memblock memory_add_physaddr_to_nid(u64 addr)
+static int __init_or_memblock __memory_add_physaddr_to_nid(u64 addr)
 {
 	unsigned long start_pfn, end_pfn, pfn = PHYS_PFN(addr);
 	int nid;
@@ -6384,10 +6411,42 @@  int __init_or_memblock memory_add_physaddr_to_nid(u64 addr)
 		if (pfn >= start_pfn && pfn <= end_pfn)
 			return nid;
 	}
+	return NUMA_NO_NODE;
+}
+
+int __init_or_memblock memory_add_physaddr_to_nid(u64 addr)
+{
+	int nid = __memory_add_physaddr_to_nid(addr);
+
 	/* Default to node0 as not all callers are prepared for this to fail */
-	return 0;
+	if (nid == NUMA_NO_NODE)
+		return 0;
+	return nid;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+int __init_or_memblock phys_to_target_node(u64 addr)
+{
+	unsigned long start_pfn, end_pfn, pfn = PHYS_PFN(addr);
+	int nid = __memory_add_physaddr_to_nid(addr);
+
+	if (nid != NUMA_NO_NODE)
+		return nid;
+
+	/*
+	 * Search reserved memory ranges since the memory address does
+	 * not appear to be online
+	 */
+	for_each_possible_node(nid) {
+		if (node_online(nid))
+			continue;
+		get_reserved_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+		if (pfn >= start_pfn && pfn <= end_pfn)
+			return nid;
+	}
+	return NUMA_NO_NODE;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
 #endif /* CONFIG_MEMBLOCK_NUMA_INFO */
 
 /*