diff mbox series

[v3] memblock: make memblock_find_in_range method private

Message ID 20210803064218.6611-1-rppt@kernel.org (mailing list archive)
State New, archived
Headers show
Series [v3] memblock: make memblock_find_in_range method private | expand

Commit Message

Mike Rapoport Aug. 3, 2021, 6:42 a.m. UTC
From: Mike Rapoport <rppt@linux.ibm.com>

There are a lot of uses of memblock_find_in_range() along with
memblock_reserve() from the times memblock allocation APIs did not exist.

memblock_find_in_range() is the very core of memblock allocations, so any
future changes to its internal behaviour would mandate updates of all the
users outside memblock.

Replace the calls to memblock_find_in_range() with an equivalent calls to
memblock_phys_alloc() and memblock_phys_alloc_range() and make
memblock_find_in_range() private method of memblock.

This simplifies the callers, ensures that (unlikely) errors in
memblock_reserve() are handled and improves maintainability of
memblock_find_in_range().

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
v3:
* simplify check for exact crash kerenl allocation on arm, per Rob
* make crash_max unsigned long long on arm64, per Rob
 
v2: https://lore.kernel.org/lkml/20210802063737.22733-1-rppt@kernel.org 
* don't change error message in arm::reserve_crashkernel(), per Russell

v1: https://lore.kernel.org/lkml/20210730104039.7047-1-rppt@kernel.org

 arch/arm/kernel/setup.c           | 20 +++++---------
 arch/arm64/kvm/hyp/reserved_mem.c |  9 +++----
 arch/arm64/mm/init.c              | 36 ++++++++-----------------
 arch/mips/kernel/setup.c          | 14 +++++-----
 arch/riscv/mm/init.c              | 44 ++++++++++---------------------
 arch/s390/kernel/setup.c          | 10 ++++---
 arch/x86/kernel/aperture_64.c     |  5 ++--
 arch/x86/mm/init.c                | 21 +++++++++------
 arch/x86/mm/numa.c                |  5 ++--
 arch/x86/mm/numa_emulation.c      |  5 ++--
 arch/x86/realmode/init.c          |  2 +-
 drivers/acpi/tables.c             |  5 ++--
 drivers/base/arch_numa.c          |  5 +---
 drivers/of/of_reserved_mem.c      | 12 ++++++---
 include/linux/memblock.h          |  2 --
 mm/memblock.c                     |  2 +-
 16 files changed, 79 insertions(+), 118 deletions(-)


base-commit: ff1176468d368232b684f75e82563369208bc371

Comments

Catalin Marinas Aug. 3, 2021, 6:05 p.m. UTC | #1
On Tue, Aug 03, 2021 at 09:42:18AM +0300, Mike Rapoport wrote:
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 8490ed2917ff..0bffd2d1854f 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
>  static void __init reserve_crashkernel(void)
>  {
>  	unsigned long long crash_base, crash_size;
> +	unsigned long long crash_max = arm64_dma_phys_limit;
>  	int ret;
>  
>  	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> @@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void)
>  
>  	crash_size = PAGE_ALIGN(crash_size);
>  
> -	if (crash_base == 0) {
> -		/* Current arm64 boot protocol requires 2MB alignment */
> -		crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
> -				crash_size, SZ_2M);
> -		if (crash_base == 0) {
> -			pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> -				crash_size);
> -			return;
> -		}
> -	} else {
> -		/* User specifies base address explicitly. */
> -		if (!memblock_is_region_memory(crash_base, crash_size)) {
> -			pr_warn("cannot reserve crashkernel: region is not memory\n");
> -			return;
> -		}
> +	/* User specifies base address explicitly. */
> +	if (crash_base)
> +		crash_max = crash_base + crash_size;
>  
> -		if (memblock_is_region_reserved(crash_base, crash_size)) {
> -			pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
> -			return;
> -		}
> -
> -		if (!IS_ALIGNED(crash_base, SZ_2M)) {
> -			pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
> -			return;
> -		}
> +	/* Current arm64 boot protocol requires 2MB alignment */
> +	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> +					       crash_base, crash_max);
> +	if (!crash_base) {
> +		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> +			crash_size);
> +		return;
>  	}
> -	memblock_reserve(crash_base, crash_size);

We'll miss a bit on debug information provided to the user in case of a
wrong crash_base/size option on the command line. Not sure we care much,
though the alignment would probably be useful (maybe we document it
somewhere).

What I haven't checked is whether memblock_phys_alloc_range() aims to
get a 2MB aligned end (size) as well. If crash_size is not 2MB aligned,
crash_max wouldn't be either and the above could fail. We only care
about the crash_base to be aligned but the memblock_phys_alloc_range()
doc says that both the start and size would be aligned to this.
Mike Rapoport Aug. 3, 2021, 7:07 p.m. UTC | #2
On Tue, Aug 03, 2021 at 07:05:26PM +0100, Catalin Marinas wrote:
> On Tue, Aug 03, 2021 at 09:42:18AM +0300, Mike Rapoport wrote:
> > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> > index 8490ed2917ff..0bffd2d1854f 100644
> > --- a/arch/arm64/mm/init.c
> > +++ b/arch/arm64/mm/init.c
> > @@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
> >  static void __init reserve_crashkernel(void)
> >  {
> >  	unsigned long long crash_base, crash_size;
> > +	unsigned long long crash_max = arm64_dma_phys_limit;
> >  	int ret;
> >  
> >  	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> > @@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void)
> >  
> >  	crash_size = PAGE_ALIGN(crash_size);
> >  
> > -	if (crash_base == 0) {
> > -		/* Current arm64 boot protocol requires 2MB alignment */
> > -		crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
> > -				crash_size, SZ_2M);
> > -		if (crash_base == 0) {
> > -			pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > -				crash_size);
> > -			return;
> > -		}
> > -	} else {
> > -		/* User specifies base address explicitly. */
> > -		if (!memblock_is_region_memory(crash_base, crash_size)) {
> > -			pr_warn("cannot reserve crashkernel: region is not memory\n");
> > -			return;
> > -		}
> > +	/* User specifies base address explicitly. */
> > +	if (crash_base)
> > +		crash_max = crash_base + crash_size;
> >  
> > -		if (memblock_is_region_reserved(crash_base, crash_size)) {
> > -			pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
> > -			return;
> > -		}
> > -
> > -		if (!IS_ALIGNED(crash_base, SZ_2M)) {
> > -			pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
> > -			return;
> > -		}
> > +	/* Current arm64 boot protocol requires 2MB alignment */
> > +	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> > +					       crash_base, crash_max);
> > +	if (!crash_base) {
> > +		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > +			crash_size);
> > +		return;
> >  	}
> > -	memblock_reserve(crash_base, crash_size);
> 
> We'll miss a bit on debug information provided to the user in case of a
> wrong crash_base/size option on the command line. Not sure we care much,
> though the alignment would probably be useful (maybe we document it
> somewhere).

It is already documented:

Documentation/admin-guide/kdump/kdump.rst:
   On arm64, use "crashkernel=Y[@X]".  Note that the start address of
   the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000).
 
> What I haven't checked is whether memblock_phys_alloc_range() aims to
> get a 2MB aligned end (size) as well. If crash_size is not 2MB aligned,
> crash_max wouldn't be either and the above could fail. We only care
> about the crash_base to be aligned but the memblock_phys_alloc_range()
> doc says that both the start and size would be aligned to this.

The doc lies :)

memblock_phys_alloc_range() boils down to 

	for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
					NULL) {

		/* clamp this_{start,end} to the user defined limits */

		cand = round_down(this_end - size, align);
		if (cand >= this_start)
			return cand;
	}
Catalin Marinas Aug. 4, 2021, 10:02 a.m. UTC | #3
On Tue, Aug 03, 2021 at 10:07:37PM +0300, Mike Rapoport wrote:
> On Tue, Aug 03, 2021 at 07:05:26PM +0100, Catalin Marinas wrote:
> > On Tue, Aug 03, 2021 at 09:42:18AM +0300, Mike Rapoport wrote:
> > > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> > > index 8490ed2917ff..0bffd2d1854f 100644
> > > --- a/arch/arm64/mm/init.c
> > > +++ b/arch/arm64/mm/init.c
> > > @@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
> > >  static void __init reserve_crashkernel(void)
> > >  {
> > >  	unsigned long long crash_base, crash_size;
> > > +	unsigned long long crash_max = arm64_dma_phys_limit;
> > >  	int ret;
> > >  
> > >  	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> > > @@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void)
> > >  
> > >  	crash_size = PAGE_ALIGN(crash_size);
> > >  
> > > -	if (crash_base == 0) {
> > > -		/* Current arm64 boot protocol requires 2MB alignment */
> > > -		crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
> > > -				crash_size, SZ_2M);
> > > -		if (crash_base == 0) {
> > > -			pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > > -				crash_size);
> > > -			return;
> > > -		}
> > > -	} else {
> > > -		/* User specifies base address explicitly. */
> > > -		if (!memblock_is_region_memory(crash_base, crash_size)) {
> > > -			pr_warn("cannot reserve crashkernel: region is not memory\n");
> > > -			return;
> > > -		}
> > > +	/* User specifies base address explicitly. */
> > > +	if (crash_base)
> > > +		crash_max = crash_base + crash_size;
> > >  
> > > -		if (memblock_is_region_reserved(crash_base, crash_size)) {
> > > -			pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
> > > -			return;
> > > -		}
> > > -
> > > -		if (!IS_ALIGNED(crash_base, SZ_2M)) {
> > > -			pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
> > > -			return;
> > > -		}
> > > +	/* Current arm64 boot protocol requires 2MB alignment */
> > > +	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> > > +					       crash_base, crash_max);
> > > +	if (!crash_base) {
> > > +		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > > +			crash_size);
> > > +		return;
> > >  	}
> > > -	memblock_reserve(crash_base, crash_size);
> > 
> > We'll miss a bit on debug information provided to the user in case of a
> > wrong crash_base/size option on the command line. Not sure we care much,
> > though the alignment would probably be useful (maybe we document it
> > somewhere).
> 
> It is already documented:
> 
> Documentation/admin-guide/kdump/kdump.rst:
>    On arm64, use "crashkernel=Y[@X]".  Note that the start address of
>    the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000).

Thanks for the pointer.

> > What I haven't checked is whether memblock_phys_alloc_range() aims to
> > get a 2MB aligned end (size) as well. If crash_size is not 2MB aligned,
> > crash_max wouldn't be either and the above could fail. We only care
> > about the crash_base to be aligned but the memblock_phys_alloc_range()
> > doc says that both the start and size would be aligned to this.
> 
> The doc lies :)
> 
> memblock_phys_alloc_range() boils down to 
> 
> 	for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
> 					NULL) {
> 
> 		/* clamp this_{start,end} to the user defined limits */
> 
> 		cand = round_down(this_end - size, align);
> 		if (cand >= this_start)
> 			return cand;
> 	}

Alright, it should work then. For arm64:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Guenter Roeck Aug. 9, 2021, 7:06 p.m. UTC | #4
On Tue, Aug 03, 2021 at 09:42:18AM +0300, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> There are a lot of uses of memblock_find_in_range() along with
> memblock_reserve() from the times memblock allocation APIs did not exist.
> 
> memblock_find_in_range() is the very core of memblock allocations, so any
> future changes to its internal behaviour would mandate updates of all the
> users outside memblock.
> 
> Replace the calls to memblock_find_in_range() with an equivalent calls to
> memblock_phys_alloc() and memblock_phys_alloc_range() and make
> memblock_find_in_range() private method of memblock.
> 
> This simplifies the callers, ensures that (unlikely) errors in
> memblock_reserve() are handled and improves maintainability of
> memblock_find_in_range().
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>

I see a number of crashes in next-20210806 when booting x86 images from efi.

[    0.000000] efi: EFI v2.70 by EDK II
[    0.000000] efi: SMBIOS=0x1fbcc000 ACPI=0x1fbfa000 ACPI 2.0=0x1fbfa014 MEMATTR=0x1f25f018
[    0.000000] SMBIOS 2.8 present.
[    0.000000] DMI: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[    0.000000] last_pfn = 0x1ff50 max_arch_pfn = 0x400000000
[    0.000000] x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WP  UC- WT
[    0.000000] Kernel panic - not syncing: alloc_low_pages: can not alloc memory
[    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc4-next-20210806 #1
[    0.000000] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[    0.000000] Call Trace:
[    0.000000]  ? dump_stack_lvl+0x57/0x7d
[    0.000000]  ? panic+0xfc/0x2c6
[    0.000000]  ? alloc_low_pages+0x117/0x156
[    0.000000]  ? phys_pmd_init+0x234/0x342
[    0.000000]  ? phys_pud_init+0x171/0x337
[    0.000000]  ? __kernel_physical_mapping_init+0xec/0x276
[    0.000000]  ? init_memory_mapping+0x1ea/0x2aa
[    0.000000]  ? init_range_memory_mapping+0xdf/0x12e
[    0.000000]  ? init_mem_mapping+0x1e9/0x26f
[    0.000000]  ? setup_arch+0x5ff/0xb6d
[    0.000000]  ? start_kernel+0x71/0x6b4
[    0.000000]  ? secondary_startup_64_no_verify+0xc2/0xcb

Bisect points to this patch. Reverting it fixes the problem. Key seems to
be the amount of memory configured in qemu; the problem is not seen if
there is 1G or more of memory, but it is seen with all test boots with
512M or 256M of memory. It is also seen with almost all 32-bit efi boots.

The problem is not seen when booting without efi.

Guenter

---
Bisect log:

# bad: [da454ebf578f6c542ba9f5b3ddb98db3ede109c1] Add linux-next specific files for 20210809
# good: [36a21d51725af2ce0700c6ebcb6b9594aac658a6] Linux 5.14-rc5
git bisect start 'HEAD' 'v5.14-rc5'
# good: [d22fda64bea5f33000e31e5b7e4ba876bca37436] Merge remote-tracking branch 'crypto/master'
git bisect good d22fda64bea5f33000e31e5b7e4ba876bca37436
# good: [b084da3a98fad27a39ed5ca64106b86df0417851] Merge remote-tracking branch 'irqchip/irq/irqchip-next'
git bisect good b084da3a98fad27a39ed5ca64106b86df0417851
# good: [a5383d1f57190a33c6afc25c62b9907d84ba2bc6] Merge remote-tracking branch 'staging/staging-next'
git bisect good a5383d1f57190a33c6afc25c62b9907d84ba2bc6
# good: [a439da3e6abeb054f4e6b0d37814e762b7340196] Merge remote-tracking branch 'seccomp/for-next/seccomp'
git bisect good a439da3e6abeb054f4e6b0d37814e762b7340196
# bad: [9801f3c0890c7b992b45a5c2afcb16c5cdc8388e] mm/idle_page_tracking: Make PG_idle reusable
git bisect bad 9801f3c0890c7b992b45a5c2afcb16c5cdc8388e
# good: [b4f7f4a9b542836683308d48ffdd18471c6f3e76] lazy-tlb-allow-lazy-tlb-mm-refcounting-to-be-configurable-fix
git bisect good b4f7f4a9b542836683308d48ffdd18471c6f3e76
# good: [e30842a48c36f094271eea0984bb861b49c49c87] mm/vmscan: add 'else' to remove check_pending label
git bisect good e30842a48c36f094271eea0984bb861b49c49c87
# bad: [65300b20a21214fb2043419d4e5da1d9947c6e15] mm/madvise: add MADV_WILLNEED to process_madvise()
git bisect bad 65300b20a21214fb2043419d4e5da1d9947c6e15
# bad: [7348da7a8c244d1a755bc5838b04cb9b1b6ee06c] memblock: make memblock_find_in_range method private
git bisect bad 7348da7a8c244d1a755bc5838b04cb9b1b6ee06c
# good: [98f8c467fe2ba8e553b450b2a3294d69f1f2027f] mm-mempolicy-convert-from-atomic_t-to-refcount_t-on-mempolicy-refcnt-fix
git bisect good 98f8c467fe2ba8e553b450b2a3294d69f1f2027f
# good: [760ded422ebe4f8899905b752d8378c44f2a78f3] mm/memplicy: add page allocation function for MPOL_PREFERRED_MANY policy
git bisect good 760ded422ebe4f8899905b752d8378c44f2a78f3
# good: [fbfa0492d9639b67119d3d94b7a6a3f85e064260] mm/mempolicy: advertise new MPOL_PREFERRED_MANY
git bisect good fbfa0492d9639b67119d3d94b7a6a3f85e064260
# good: [ff6d5759a871883aeea38309fb16d91666179328] mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies
git bisect good ff6d5759a871883aeea38309fb16d91666179328
# first bad commit: [7348da7a8c244d1a755bc5838b04cb9b1b6ee06c] memblock: make memblock_find_in_range method private
Mike Rapoport Aug. 10, 2021, 6:55 p.m. UTC | #5
On Mon, Aug 09, 2021 at 12:06:41PM -0700, Guenter Roeck wrote:
> On Tue, Aug 03, 2021 at 09:42:18AM +0300, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > There are a lot of uses of memblock_find_in_range() along with
> > memblock_reserve() from the times memblock allocation APIs did not exist.
> > 
> > memblock_find_in_range() is the very core of memblock allocations, so any
> > future changes to its internal behaviour would mandate updates of all the
> > users outside memblock.
> > 
> > Replace the calls to memblock_find_in_range() with an equivalent calls to
> > memblock_phys_alloc() and memblock_phys_alloc_range() and make
> > memblock_find_in_range() private method of memblock.
> > 
> > This simplifies the callers, ensures that (unlikely) errors in
> > memblock_reserve() are handled and improves maintainability of
> > memblock_find_in_range().
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> 
> I see a number of crashes in next-20210806 when booting x86 images from efi.
> 
> [    0.000000] efi: EFI v2.70 by EDK II
> [    0.000000] efi: SMBIOS=0x1fbcc000 ACPI=0x1fbfa000 ACPI 2.0=0x1fbfa014 MEMATTR=0x1f25f018
> [    0.000000] SMBIOS 2.8 present.
> [    0.000000] DMI: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> [    0.000000] last_pfn = 0x1ff50 max_arch_pfn = 0x400000000
> [    0.000000] x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WP  UC- WT
> [    0.000000] Kernel panic - not syncing: alloc_low_pages: can not alloc memory
> [    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc4-next-20210806 #1
> [    0.000000] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> [    0.000000] Call Trace:
> [    0.000000]  ? dump_stack_lvl+0x57/0x7d
> [    0.000000]  ? panic+0xfc/0x2c6
> [    0.000000]  ? alloc_low_pages+0x117/0x156
> [    0.000000]  ? phys_pmd_init+0x234/0x342
> [    0.000000]  ? phys_pud_init+0x171/0x337
> [    0.000000]  ? __kernel_physical_mapping_init+0xec/0x276
> [    0.000000]  ? init_memory_mapping+0x1ea/0x2aa
> [    0.000000]  ? init_range_memory_mapping+0xdf/0x12e
> [    0.000000]  ? init_mem_mapping+0x1e9/0x26f
> [    0.000000]  ? setup_arch+0x5ff/0xb6d
> [    0.000000]  ? start_kernel+0x71/0x6b4
> [    0.000000]  ? secondary_startup_64_no_verify+0xc2/0xcb
> 
> Bisect points to this patch. Reverting it fixes the problem. Key seems to
> be the amount of memory configured in qemu; the problem is not seen if
> there is 1G or more of memory, but it is seen with all test boots with
> 512M or 256M of memory. It is also seen with almost all 32-bit efi boots.
> 
> The problem is not seen when booting without efi.

It looks like this change uncovered a problem in
x86::memory_map_top_down(). 

The allocation in alloc_low_pages() is limited by min_pfn_mapped and
max_pfn_mapped. The min_pfn_mapped is updated at every iteration of the
loop in memory_map_top_down, but there is another loop in
init_range_memory_mapping() that maps several regions below the current
min_pfn_mapped without updating this variable.

The memory layout in qemu with 256M of RAM and EFI enabled, causes
exhaustion of the memory limited by min_pfn_mapped and max_pfn_mapped
before min_pfn_mapped is updated.

Before this commit there was unconditional "reservation" of 2M in the end
of the memory that moved the initial min_pfn_mapped below the memory
reserved by EFI. The addition of check for xen_domain() removed this
reservation for !XEN and made alloc_low_pages() use the range already busy
with EFI data.

The patch below moves the update of min_pfn_mapped near the update of
max_pfn_mapped so that every time a new range is mapped both limits will be
updated accordingly.

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1152a29ce109..be279f6e5a0a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
+#define DEBUG
 #include <linux/gfp.h>
 #include <linux/initrd.h>
 #include <linux/ioport.h>
@@ -485,6 +486,7 @@ static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
 	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);
 
 	max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+	min_pfn_mapped = min(min_pfn_mapped, start_pfn);
 
 	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
 		max_low_pfn_mapped = max(max_low_pfn_mapped,
@@ -643,7 +645,6 @@ static void __init memory_map_top_down(unsigned long map_start,
 		mapped_ram_size += init_range_memory_mapping(start,
 							last_start);
 		last_start = start;
-		min_pfn_mapped = last_start >> PAGE_SHIFT;
 		if (mapped_ram_size >= step_size)
 			step_size = get_new_step_size(step_size);
 	}
 

> Guenter
> 
> ---
> Bisect log:
> 
> # bad: [da454ebf578f6c542ba9f5b3ddb98db3ede109c1] Add linux-next specific files for 20210809
> # good: [36a21d51725af2ce0700c6ebcb6b9594aac658a6] Linux 5.14-rc5
> git bisect start 'HEAD' 'v5.14-rc5'
> # good: [d22fda64bea5f33000e31e5b7e4ba876bca37436] Merge remote-tracking branch 'crypto/master'
> git bisect good d22fda64bea5f33000e31e5b7e4ba876bca37436
> # good: [b084da3a98fad27a39ed5ca64106b86df0417851] Merge remote-tracking branch 'irqchip/irq/irqchip-next'
> git bisect good b084da3a98fad27a39ed5ca64106b86df0417851
> # good: [a5383d1f57190a33c6afc25c62b9907d84ba2bc6] Merge remote-tracking branch 'staging/staging-next'
> git bisect good a5383d1f57190a33c6afc25c62b9907d84ba2bc6
> # good: [a439da3e6abeb054f4e6b0d37814e762b7340196] Merge remote-tracking branch 'seccomp/for-next/seccomp'
> git bisect good a439da3e6abeb054f4e6b0d37814e762b7340196
> # bad: [9801f3c0890c7b992b45a5c2afcb16c5cdc8388e] mm/idle_page_tracking: Make PG_idle reusable
> git bisect bad 9801f3c0890c7b992b45a5c2afcb16c5cdc8388e
> # good: [b4f7f4a9b542836683308d48ffdd18471c6f3e76] lazy-tlb-allow-lazy-tlb-mm-refcounting-to-be-configurable-fix
> git bisect good b4f7f4a9b542836683308d48ffdd18471c6f3e76
> # good: [e30842a48c36f094271eea0984bb861b49c49c87] mm/vmscan: add 'else' to remove check_pending label
> git bisect good e30842a48c36f094271eea0984bb861b49c49c87
> # bad: [65300b20a21214fb2043419d4e5da1d9947c6e15] mm/madvise: add MADV_WILLNEED to process_madvise()
> git bisect bad 65300b20a21214fb2043419d4e5da1d9947c6e15
> # bad: [7348da7a8c244d1a755bc5838b04cb9b1b6ee06c] memblock: make memblock_find_in_range method private
> git bisect bad 7348da7a8c244d1a755bc5838b04cb9b1b6ee06c
> # good: [98f8c467fe2ba8e553b450b2a3294d69f1f2027f] mm-mempolicy-convert-from-atomic_t-to-refcount_t-on-mempolicy-refcnt-fix
> git bisect good 98f8c467fe2ba8e553b450b2a3294d69f1f2027f
> # good: [760ded422ebe4f8899905b752d8378c44f2a78f3] mm/memplicy: add page allocation function for MPOL_PREFERRED_MANY policy
> git bisect good 760ded422ebe4f8899905b752d8378c44f2a78f3
> # good: [fbfa0492d9639b67119d3d94b7a6a3f85e064260] mm/mempolicy: advertise new MPOL_PREFERRED_MANY
> git bisect good fbfa0492d9639b67119d3d94b7a6a3f85e064260
> # good: [ff6d5759a871883aeea38309fb16d91666179328] mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies
> git bisect good ff6d5759a871883aeea38309fb16d91666179328
> # first bad commit: [7348da7a8c244d1a755bc5838b04cb9b1b6ee06c] memblock: make memblock_find_in_range method private
Guenter Roeck Aug. 10, 2021, 7:21 p.m. UTC | #6
On 8/10/21 11:55 AM, Mike Rapoport wrote:
> On Mon, Aug 09, 2021 at 12:06:41PM -0700, Guenter Roeck wrote:
>> On Tue, Aug 03, 2021 at 09:42:18AM +0300, Mike Rapoport wrote:
>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>
>>> There are a lot of uses of memblock_find_in_range() along with
>>> memblock_reserve() from the times memblock allocation APIs did not exist.
>>>
>>> memblock_find_in_range() is the very core of memblock allocations, so any
>>> future changes to its internal behaviour would mandate updates of all the
>>> users outside memblock.
>>>
>>> Replace the calls to memblock_find_in_range() with an equivalent calls to
>>> memblock_phys_alloc() and memblock_phys_alloc_range() and make
>>> memblock_find_in_range() private method of memblock.
>>>
>>> This simplifies the callers, ensures that (unlikely) errors in
>>> memblock_reserve() are handled and improves maintainability of
>>> memblock_find_in_range().
>>>
>>> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
>>
>> I see a number of crashes in next-20210806 when booting x86 images from efi.
>>
>> [    0.000000] efi: EFI v2.70 by EDK II
>> [    0.000000] efi: SMBIOS=0x1fbcc000 ACPI=0x1fbfa000 ACPI 2.0=0x1fbfa014 MEMATTR=0x1f25f018
>> [    0.000000] SMBIOS 2.8 present.
>> [    0.000000] DMI: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
>> [    0.000000] last_pfn = 0x1ff50 max_arch_pfn = 0x400000000
>> [    0.000000] x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WP  UC- WT
>> [    0.000000] Kernel panic - not syncing: alloc_low_pages: can not alloc memory
>> [    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc4-next-20210806 #1
>> [    0.000000] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
>> [    0.000000] Call Trace:
>> [    0.000000]  ? dump_stack_lvl+0x57/0x7d
>> [    0.000000]  ? panic+0xfc/0x2c6
>> [    0.000000]  ? alloc_low_pages+0x117/0x156
>> [    0.000000]  ? phys_pmd_init+0x234/0x342
>> [    0.000000]  ? phys_pud_init+0x171/0x337
>> [    0.000000]  ? __kernel_physical_mapping_init+0xec/0x276
>> [    0.000000]  ? init_memory_mapping+0x1ea/0x2aa
>> [    0.000000]  ? init_range_memory_mapping+0xdf/0x12e
>> [    0.000000]  ? init_mem_mapping+0x1e9/0x26f
>> [    0.000000]  ? setup_arch+0x5ff/0xb6d
>> [    0.000000]  ? start_kernel+0x71/0x6b4
>> [    0.000000]  ? secondary_startup_64_no_verify+0xc2/0xcb
>>
>> Bisect points to this patch. Reverting it fixes the problem. Key seems to
>> be the amount of memory configured in qemu; the problem is not seen if
>> there is 1G or more of memory, but it is seen with all test boots with
>> 512M or 256M of memory. It is also seen with almost all 32-bit efi boots.
>>
>> The problem is not seen when booting without efi.
> 
> It looks like this change uncovered a problem in
> x86::memory_map_top_down().
> 
> The allocation in alloc_low_pages() is limited by min_pfn_mapped and
> max_pfn_mapped. The min_pfn_mapped is updated at every iteration of the
> loop in memory_map_top_down, but there is another loop in
> init_range_memory_mapping() that maps several regions below the current
> min_pfn_mapped without updating this variable.
> 
> The memory layout in qemu with 256M of RAM and EFI enabled, causes
> exhaustion of the memory limited by min_pfn_mapped and max_pfn_mapped
> before min_pfn_mapped is updated.
> 
> Before this commit there was unconditional "reservation" of 2M in the end
> of the memory that moved the initial min_pfn_mapped below the memory
> reserved by EFI. The addition of check for xen_domain() removed this
> reservation for !XEN and made alloc_low_pages() use the range already busy
> with EFI data.
> 
> The patch below moves the update of min_pfn_mapped near the update of
> max_pfn_mapped so that every time a new range is mapped both limits will be
> updated accordingly.
> 
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index 1152a29ce109..be279f6e5a0a 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -1,3 +1,4 @@
> +#define DEBUG
>   #include <linux/gfp.h>
>   #include <linux/initrd.h>
>   #include <linux/ioport.h>
> @@ -485,6 +486,7 @@ static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
>   	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);
>   
>   	max_pfn_mapped = max(max_pfn_mapped, end_pfn);
> +	min_pfn_mapped = min(min_pfn_mapped, start_pfn);
>   
>   	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
>   		max_low_pfn_mapped = max(max_low_pfn_mapped,
> @@ -643,7 +645,6 @@ static void __init memory_map_top_down(unsigned long map_start,
>   		mapped_ram_size += init_range_memory_mapping(start,
>   							last_start);
>   		last_start = start;
> -		min_pfn_mapped = last_start >> PAGE_SHIFT;
>   		if (mapped_ram_size >= step_size)
>   			step_size = get_new_step_size(step_size);
>   	}
>   

The offending patch was removed from next-20210810, but I applied the above change
to next-20210809 and it does indeed fix the problem. If it is added as separate patch,
please feel free to add

Tested-by: Guenter Roeck <linux@roeck-us.net>

Thanks,
Guenter
Mike Rapoport Aug. 11, 2021, 7:36 a.m. UTC | #7
On Tue, Aug 10, 2021 at 12:21:46PM -0700, Guenter Roeck wrote:
> On 8/10/21 11:55 AM, Mike Rapoport wrote:
> > On Mon, Aug 09, 2021 at 12:06:41PM -0700, Guenter Roeck wrote:
> > > On Tue, Aug 03, 2021 at 09:42:18AM +0300, Mike Rapoport wrote:
> > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > > 
> > > > There are a lot of uses of memblock_find_in_range() along with
> > > > memblock_reserve() from the times memblock allocation APIs did not exist.
> > > > 
> > > > memblock_find_in_range() is the very core of memblock allocations, so any
> > > > future changes to its internal behaviour would mandate updates of all the
> > > > users outside memblock.
> > > > 
> > > > Replace the calls to memblock_find_in_range() with an equivalent calls to
> > > > memblock_phys_alloc() and memblock_phys_alloc_range() and make
> > > > memblock_find_in_range() private method of memblock.
> > > > 
> > > > This simplifies the callers, ensures that (unlikely) errors in
> > > > memblock_reserve() are handled and improves maintainability of
> > > > memblock_find_in_range().
> > > > 
> > > > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > > 
> > > I see a number of crashes in next-20210806 when booting x86 images from efi.
> > > 
> > > [    0.000000] efi: EFI v2.70 by EDK II
> > > [    0.000000] efi: SMBIOS=0x1fbcc000 ACPI=0x1fbfa000 ACPI 2.0=0x1fbfa014 MEMATTR=0x1f25f018
> > > [    0.000000] SMBIOS 2.8 present.
> > > [    0.000000] DMI: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> > > [    0.000000] last_pfn = 0x1ff50 max_arch_pfn = 0x400000000
> > > [    0.000000] x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WP  UC- WT
> > > [    0.000000] Kernel panic - not syncing: alloc_low_pages: can not alloc memory
> > > [    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc4-next-20210806 #1
> > > [    0.000000] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> > > [    0.000000] Call Trace:
> > > [    0.000000]  ? dump_stack_lvl+0x57/0x7d
> > > [    0.000000]  ? panic+0xfc/0x2c6
> > > [    0.000000]  ? alloc_low_pages+0x117/0x156
> > > [    0.000000]  ? phys_pmd_init+0x234/0x342
> > > [    0.000000]  ? phys_pud_init+0x171/0x337
> > > [    0.000000]  ? __kernel_physical_mapping_init+0xec/0x276
> > > [    0.000000]  ? init_memory_mapping+0x1ea/0x2aa
> > > [    0.000000]  ? init_range_memory_mapping+0xdf/0x12e
> > > [    0.000000]  ? init_mem_mapping+0x1e9/0x26f
> > > [    0.000000]  ? setup_arch+0x5ff/0xb6d
> > > [    0.000000]  ? start_kernel+0x71/0x6b4
> > > [    0.000000]  ? secondary_startup_64_no_verify+0xc2/0xcb
> > > 
> > > Bisect points to this patch. Reverting it fixes the problem. Key seems to
> > > be the amount of memory configured in qemu; the problem is not seen if
> > > there is 1G or more of memory, but it is seen with all test boots with
> > > 512M or 256M of memory. It is also seen with almost all 32-bit efi boots.
> > > 
> > > The problem is not seen when booting without efi.
> > 
> > It looks like this change uncovered a problem in
> > x86::memory_map_top_down().
> > 
> > The allocation in alloc_low_pages() is limited by min_pfn_mapped and
> > max_pfn_mapped. The min_pfn_mapped is updated at every iteration of the
> > loop in memory_map_top_down, but there is another loop in
> > init_range_memory_mapping() that maps several regions below the current
> > min_pfn_mapped without updating this variable.
> > 
> > The memory layout in qemu with 256M of RAM and EFI enabled, causes
> > exhaustion of the memory limited by min_pfn_mapped and max_pfn_mapped
> > before min_pfn_mapped is updated.
> > 
> > Before this commit there was unconditional "reservation" of 2M in the end
> > of the memory that moved the initial min_pfn_mapped below the memory
> > reserved by EFI. The addition of check for xen_domain() removed this
> > reservation for !XEN and made alloc_low_pages() use the range already busy
> > with EFI data.
> > 
> > The patch below moves the update of min_pfn_mapped near the update of
> > max_pfn_mapped so that every time a new range is mapped both limits will be
> > updated accordingly.
> > 
> > diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> > index 1152a29ce109..be279f6e5a0a 100644
> > --- a/arch/x86/mm/init.c
> > +++ b/arch/x86/mm/init.c
> > @@ -1,3 +1,4 @@
> > +#define DEBUG
> >   #include <linux/gfp.h>
> >   #include <linux/initrd.h>
> >   #include <linux/ioport.h>
> > @@ -485,6 +486,7 @@ static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
> >   	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);
> >   	max_pfn_mapped = max(max_pfn_mapped, end_pfn);
> > +	min_pfn_mapped = min(min_pfn_mapped, start_pfn);
> >   	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
> >   		max_low_pfn_mapped = max(max_low_pfn_mapped,
> > @@ -643,7 +645,6 @@ static void __init memory_map_top_down(unsigned long map_start,
> >   		mapped_ram_size += init_range_memory_mapping(start,
> >   							last_start);
> >   		last_start = start;
> > -		min_pfn_mapped = last_start >> PAGE_SHIFT;
> >   		if (mapped_ram_size >= step_size)
> >   			step_size = get_new_step_size(step_size);
> >   	}
> 
> The offending patch was removed from next-20210810, but I applied the above change
> to next-20210809 and it does indeed fix the problem. If it is added as separate patch,
> please feel free to add
> 
> Tested-by: Guenter Roeck <linux@roeck-us.net>

Thanks!

I wonder now about that comment saying "xen has big range in reserved near
end of ram". Maybe it was the same issue with Xen and we can entirely drop
the reservation of the top 2M?

x86 folks, what do you say?
diff mbox series

Patch

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index f97eb2371672..284a80c0b6e1 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1012,31 +1012,25 @@  static void __init reserve_crashkernel(void)
 		unsigned long long lowmem_max = __pa(high_memory - 1) + 1;
 		if (crash_max > lowmem_max)
 			crash_max = lowmem_max;
-		crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max,
-						    crash_size, CRASH_ALIGN);
+
+		crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+						       CRASH_ALIGN, crash_max);
 		if (!crash_base) {
 			pr_err("crashkernel reservation failed - No suitable area found.\n");
 			return;
 		}
 	} else {
+		unsigned long long crash_max = crash_base + crash_size;
 		unsigned long long start;
 
-		start = memblock_find_in_range(crash_base,
-					       crash_base + crash_size,
-					       crash_size, SECTION_SIZE);
-		if (start != crash_base) {
+		start = memblock_phys_alloc_range(crash_size, SECTION_SIZE,
+						  crash_base, crash_max);
+		if (!start) {
 			pr_err("crashkernel reservation failed - memory is in use.\n");
 			return;
 		}
 	}
 
-	ret = memblock_reserve(crash_base, crash_size);
-	if (ret < 0) {
-		pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n",
-			(unsigned long)crash_base);
-		return;
-	}
-
 	pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
 		(unsigned long)(crash_size >> 20),
 		(unsigned long)(crash_base >> 20),
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
index d654921dd09b..578670e3f608 100644
--- a/arch/arm64/kvm/hyp/reserved_mem.c
+++ b/arch/arm64/kvm/hyp/reserved_mem.c
@@ -92,12 +92,10 @@  void __init kvm_hyp_reserve(void)
 	 * this is unmapped from the host stage-2, and fallback to PAGE_SIZE.
 	 */
 	hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
-	hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
-					      ALIGN(hyp_mem_size, PMD_SIZE),
-					      PMD_SIZE);
+	hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
+					   PMD_SIZE);
 	if (!hyp_mem_base)
-		hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
-						      hyp_mem_size, PAGE_SIZE);
+		hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);
 	else
 		hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
 
@@ -105,7 +103,6 @@  void __init kvm_hyp_reserve(void)
 		kvm_err("Failed to reserve hyp memory\n");
 		return;
 	}
-	memblock_reserve(hyp_mem_base, hyp_mem_size);
 
 	kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
 		 hyp_mem_base);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 8490ed2917ff..0bffd2d1854f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -74,6 +74,7 @@  phys_addr_t arm64_dma_phys_limit __ro_after_init;
 static void __init reserve_crashkernel(void)
 {
 	unsigned long long crash_base, crash_size;
+	unsigned long long crash_max = arm64_dma_phys_limit;
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
@@ -84,33 +85,18 @@  static void __init reserve_crashkernel(void)
 
 	crash_size = PAGE_ALIGN(crash_size);
 
-	if (crash_base == 0) {
-		/* Current arm64 boot protocol requires 2MB alignment */
-		crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
-				crash_size, SZ_2M);
-		if (crash_base == 0) {
-			pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
-				crash_size);
-			return;
-		}
-	} else {
-		/* User specifies base address explicitly. */
-		if (!memblock_is_region_memory(crash_base, crash_size)) {
-			pr_warn("cannot reserve crashkernel: region is not memory\n");
-			return;
-		}
+	/* User specifies base address explicitly. */
+	if (crash_base)
+		crash_max = crash_base + crash_size;
 
-		if (memblock_is_region_reserved(crash_base, crash_size)) {
-			pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
-			return;
-		}
-
-		if (!IS_ALIGNED(crash_base, SZ_2M)) {
-			pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
-			return;
-		}
+	/* Current arm64 boot protocol requires 2MB alignment */
+	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
+					       crash_base, crash_max);
+	if (!crash_base) {
+		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+			crash_size);
+		return;
 	}
-	memblock_reserve(crash_base, crash_size);
 
 	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
 		crash_base, crash_base + crash_size, crash_size >> 20);
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 23a140327a0b..f979adfd4fc2 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -452,8 +452,9 @@  static void __init mips_parse_crashkernel(void)
 		return;
 
 	if (crash_base <= 0) {
-		crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX,
-							crash_size, CRASH_ALIGN);
+		crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+						       CRASH_ALIGN,
+						       CRASH_ADDR_MAX);
 		if (!crash_base) {
 			pr_warn("crashkernel reservation failed - No suitable area found.\n");
 			return;
@@ -461,8 +462,9 @@  static void __init mips_parse_crashkernel(void)
 	} else {
 		unsigned long long start;
 
-		start = memblock_find_in_range(crash_base, crash_base + crash_size,
-						crash_size, 1);
+		start = memblock_phys_alloc_range(crash_size, 1,
+						  crash_base,
+						  crash_base + crash_size);
 		if (start != crash_base) {
 			pr_warn("Invalid memory region reserved for crash kernel\n");
 			return;
@@ -656,10 +658,6 @@  static void __init arch_mem_init(char **cmdline_p)
 	mips_reserve_vmcore();
 
 	mips_parse_crashkernel();
-#ifdef CONFIG_KEXEC
-	if (crashk_res.start != crashk_res.end)
-		memblock_reserve(crashk_res.start, resource_size(&crashk_res));
-#endif
 	device_tree_init();
 
 	/*
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index a14bf3910eec..88649337c568 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -812,38 +812,22 @@  static void __init reserve_crashkernel(void)
 
 	crash_size = PAGE_ALIGN(crash_size);
 
-	if (crash_base == 0) {
-		/*
-		 * Current riscv boot protocol requires 2MB alignment for
-		 * RV64 and 4MB alignment for RV32 (hugepage size)
-		 */
-		crash_base = memblock_find_in_range(search_start, search_end,
-						    crash_size, PMD_SIZE);
-
-		if (crash_base == 0) {
-			pr_warn("crashkernel: couldn't allocate %lldKB\n",
-				crash_size >> 10);
-			return;
-		}
-	} else {
-		/* User specifies base address explicitly. */
-		if (!memblock_is_region_memory(crash_base, crash_size)) {
-			pr_warn("crashkernel: requested region is not memory\n");
-			return;
-		}
-
-		if (memblock_is_region_reserved(crash_base, crash_size)) {
-			pr_warn("crashkernel: requested region is reserved\n");
-			return;
-		}
-
+	if (crash_base) {
+		search_start = crash_base;
+		search_end = crash_base + crash_size;
+	}
 
-		if (!IS_ALIGNED(crash_base, PMD_SIZE)) {
-			pr_warn("crashkernel: requested region is misaligned\n");
-			return;
-		}
+	/*
+	 * Current riscv boot protocol requires 2MB alignment for
+	 * RV64 and 4MB alignment for RV32 (hugepage size)
+	 */
+	crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE,
+					       search_start, search_end);
+	if (crash_base == 0) {
+		pr_warn("crashkernel: couldn't allocate %lldKB\n",
+			crash_size >> 10);
+		return;
 	}
-	memblock_reserve(crash_base, crash_size);
 
 	pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n",
 		crash_base, crash_base + crash_size, crash_size >> 20);
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index ff0f9e838916..3d9efee0f43c 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -626,8 +626,9 @@  static void __init reserve_crashkernel(void)
 			return;
 		}
 		low = crash_base ?: low;
-		crash_base = memblock_find_in_range(low, high, crash_size,
-						    KEXEC_CRASH_MEM_ALIGN);
+		crash_base = memblock_phys_alloc_range(crash_size,
+						       KEXEC_CRASH_MEM_ALIGN,
+						       low, high);
 	}
 
 	if (!crash_base) {
@@ -636,14 +637,15 @@  static void __init reserve_crashkernel(void)
 		return;
 	}
 
-	if (register_memory_notifier(&kdump_mem_nb))
+	if (register_memory_notifier(&kdump_mem_nb)) {
+		memblock_free(crash_base, crash_size);
 		return;
+	}
 
 	if (!OLDMEM_BASE && MACHINE_IS_VM)
 		diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size));
 	crashk_res.start = crash_base;
 	crashk_res.end = crash_base + crash_size - 1;
-	memblock_remove(crash_base, crash_size);
 	pr_info("Reserving %lluMB of memory at %lluMB "
 		"for crashkernel (System RAM: %luMB)\n",
 		crash_size >> 20, crash_base >> 20,
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 294ed4392a0e..10562885f5fc 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -109,14 +109,13 @@  static u32 __init allocate_aperture(void)
 	 * memory. Unfortunately we cannot move it up because that would
 	 * make the IOMMU useless.
 	 */
-	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
-				      aper_size, aper_size);
+	addr = memblock_phys_alloc_range(aper_size, aper_size,
+					 GART_MIN_ADDR, GART_MAX_ADDR);
 	if (!addr) {
 		pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
 		       addr, addr + aper_size - 1, aper_size >> 10);
 		return 0;
 	}
-	memblock_reserve(addr, aper_size);
 	pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
 		addr, addr + aper_size - 1, aper_size >> 10);
 	register_nosave_region(addr >> PAGE_SHIFT,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 75ef19aa8903..1152a29ce109 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -26,6 +26,7 @@ 
 #include <asm/pti.h>
 #include <asm/text-patching.h>
 #include <asm/memtype.h>
+#include <xen/xen.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -127,14 +128,12 @@  __ref void *alloc_low_pages(unsigned int num)
 		unsigned long ret = 0;
 
 		if (min_pfn_mapped < max_pfn_mapped) {
-			ret = memblock_find_in_range(
+			ret = memblock_phys_alloc_range(
+					PAGE_SIZE * num, PAGE_SIZE,
 					min_pfn_mapped << PAGE_SHIFT,
-					max_pfn_mapped << PAGE_SHIFT,
-					PAGE_SIZE * num , PAGE_SIZE);
+					max_pfn_mapped << PAGE_SHIFT);
 		}
-		if (ret)
-			memblock_reserve(ret, PAGE_SIZE * num);
-		else if (can_use_brk_pgt)
+		if (!ret && can_use_brk_pgt)
 			ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
 
 		if (!ret)
@@ -610,9 +609,15 @@  static void __init memory_map_top_down(unsigned long map_start,
 	unsigned long addr;
 	unsigned long mapped_ram_size = 0;
 
+	real_end = ALIGN_DOWN(map_end, PMD_SIZE);
+
 	/* xen has big range in reserved near end of ram, skip it at first.*/
-	addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
-	real_end = addr + PMD_SIZE;
+	if (xen_domain()) {
+		addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE,
+						 map_start, map_end);
+		memblock_free(addr, PMD_SIZE);
+		real_end = addr + PMD_SIZE;
+	}
 
 	/* step_size need to be small so pgt_buf from BRK could cover it */
 	step_size = PMD_SIZE;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index e94da744386f..a1b5c71099e6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -376,15 +376,14 @@  static int __init numa_alloc_distance(void)
 	cnt++;
 	size = cnt * cnt * sizeof(numa_distance[0]);
 
-	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
-				      size, PAGE_SIZE);
+	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
+					 PFN_PHYS(max_pfn_mapped));
 	if (!phys) {
 		pr_warn("Warning: can't allocate distance table!\n");
 		/* don't retry until explicitly reset */
 		numa_distance = (void *)1LU;
 		return -ENOMEM;
 	}
-	memblock_reserve(phys, size);
 
 	numa_distance = __va(phys);
 	numa_distance_cnt = cnt;
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 87d77cc52f86..737491b13728 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -447,13 +447,12 @@  void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	if (numa_dist_cnt) {
 		u64 phys;
 
-		phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
-					      phys_size, PAGE_SIZE);
+		phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
+						 PFN_PHYS(max_pfn_mapped));
 		if (!phys) {
 			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
 			goto no_emu;
 		}
-		memblock_reserve(phys, phys_size);
 		phys_dist = __va(phys);
 
 		for (i = 0; i < numa_dist_cnt; i++)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 6534c92d0f83..31b5856010cb 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -28,7 +28,7 @@  void __init reserve_real_mode(void)
 	WARN_ON(slab_is_available());
 
 	/* Has to be under 1M so we can execute real-mode AP code. */
-	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+	mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20);
 	if (!mem)
 		pr_info("No sub-1M memory is available for the trampoline\n");
 	else
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index a37a1532a575..f9383736fa0f 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -583,8 +583,8 @@  void __init acpi_table_upgrade(void)
 	}
 
 	acpi_tables_addr =
-		memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS,
-				       all_tables_size, PAGE_SIZE);
+		memblock_phys_alloc_range(all_tables_size, PAGE_SIZE,
+					  0, ACPI_TABLE_UPGRADE_MAX_PHYS);
 	if (!acpi_tables_addr) {
 		WARN_ON(1);
 		return;
@@ -599,7 +599,6 @@  void __init acpi_table_upgrade(void)
 	 * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area)
 	 * works fine.
 	 */
-	memblock_reserve(acpi_tables_addr, all_tables_size);
 	arch_reserve_mem_area(acpi_tables_addr, all_tables_size);
 
 	/*
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 4cc4e117727d..46c503486e96 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -279,13 +279,10 @@  static int __init numa_alloc_distance(void)
 	int i, j;
 
 	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
-	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),
-				      size, PAGE_SIZE);
+	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn));
 	if (WARN_ON(!phys))
 		return -ENOMEM;
 
-	memblock_reserve(phys, size);
-
 	numa_distance = __va(phys);
 	numa_distance_cnt = nr_node_ids;
 
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index fd3964d24224..59c1390cdf42 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -33,18 +33,22 @@  static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
 	phys_addr_t *res_base)
 {
 	phys_addr_t base;
+	int err = 0;
 
 	end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end;
 	align = !align ? SMP_CACHE_BYTES : align;
-	base = memblock_find_in_range(start, end, size, align);
+	base = memblock_phys_alloc_range(size, align, start, end);
 	if (!base)
 		return -ENOMEM;
 
 	*res_base = base;
-	if (nomap)
-		return memblock_mark_nomap(base, size);
+	if (nomap) {
+		err = memblock_mark_nomap(base, size);
+		if (err)
+			memblock_free(base, size);
+	}
 
-	return memblock_reserve(base, size);
+	return err;
 }
 
 /*
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 4a53c3ca86bd..b066024c62e3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -99,8 +99,6 @@  void memblock_discard(void);
 static inline void memblock_discard(void) {}
 #endif
 
-phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
-				   phys_addr_t size, phys_addr_t align);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
diff --git a/mm/memblock.c b/mm/memblock.c
index de7b553baa50..28a813d9e955 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -315,7 +315,7 @@  static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
  * Return:
  * Found address on success, 0 on failure.
  */
-phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align)
 {