diff mbox series

[v5,3/6] mm: make alloc_contig_range work at pageblock granularity

Message ID 20220211164135.1803616-4-zi.yan@sent.com (mailing list archive)
State New
Headers show
Series Use pageblock_order for cma and alloc_contig_range alignment. | expand

Commit Message

Zi Yan Feb. 11, 2022, 4:41 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
pageblocks with different migratetypes. It might unnecessarily convert
extra pageblocks at the beginning and at the end of the range. Change
alloc_contig_range() to work at pageblock granularity.

Special handling is needed for free pages and in-use pages across the
boundaries of the range specified alloc_contig_range(). Because these
partially isolated pages causes free page accounting issues. The free
pages will be split and freed into separate migratetype lists; the
in-use pages will be migrated then the freed pages will be handled.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 include/linux/page-isolation.h |   2 +-
 mm/internal.h                  |   3 +
 mm/memory_hotplug.c            |   3 +-
 mm/page_alloc.c                | 235 +++++++++++++++++++++++++--------
 mm/page_isolation.c            |  33 ++++-
 5 files changed, 211 insertions(+), 65 deletions(-)

Comments

Christoph Hellwig Feb. 14, 2022, 7:26 a.m. UTC | #1
> +int
> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary);

Please avoid the completely unreadably long line. i.e.

int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
		int isolate_before_boundary);

Same in various other spots.
Christophe Leroy Feb. 14, 2022, 7:59 a.m. UTC | #2
Le 11/02/2022 à 17:41, Zi Yan a écrit :
> From: Zi Yan <ziy@nvidia.com>
> 
> alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
> pageblocks with different migratetypes. It might unnecessarily convert
> extra pageblocks at the beginning and at the end of the range. Change
> alloc_contig_range() to work at pageblock granularity.
> 
> Special handling is needed for free pages and in-use pages across the
> boundaries of the range specified alloc_contig_range(). Because these
> partially isolated pages causes free page accounting issues. The free
> pages will be split and freed into separate migratetype lists; the
> in-use pages will be migrated then the freed pages will be handled.
> 
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>   include/linux/page-isolation.h |   2 +-
>   mm/internal.h                  |   3 +
>   mm/memory_hotplug.c            |   3 +-
>   mm/page_alloc.c                | 235 +++++++++++++++++++++++++--------
>   mm/page_isolation.c            |  33 ++++-
>   5 files changed, 211 insertions(+), 65 deletions(-)
> 
> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
> index 4ef7be6def83..78ff940cc169 100644
> --- a/include/linux/page-isolation.h
> +++ b/include/linux/page-isolation.h
> @@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>    */
>   int
>   start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			 unsigned migratetype, int flags);
> +			 unsigned migratetype, int flags, gfp_t gfp_flags);
>   
>   /*
>    * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
> diff --git a/mm/internal.h b/mm/internal.h
> index 0d240e876831..509cbdc25992 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc,
>   int
>   isolate_migratepages_range(struct compact_control *cc,
>   			   unsigned long low_pfn, unsigned long end_pfn);
> +
> +int
> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary);
>   #endif
>   int find_suitable_fallback(struct free_area *area, unsigned int order,
>   			int migratetype, bool only_stealable, bool *can_steal);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index ce68098832aa..82406d2f3e46 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>   	/* set above range as isolated */
>   	ret = start_isolate_page_range(start_pfn, end_pfn,
>   				       MIGRATE_MOVABLE,
> -				       MEMORY_OFFLINE | REPORT_FAILURE);
> +				       MEMORY_OFFLINE | REPORT_FAILURE,
> +				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>   	if (ret) {
>   		reason = "failure to isolate range";
>   		goto failed_removal_pcplists_disabled;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 62ef78f3d771..7a4fa21aea5c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>   #endif
>   
>   /* [start, end) must belong to a single zone. */
> -static int __alloc_contig_migrate_range(struct compact_control *cc,
> +int __alloc_contig_migrate_range(struct compact_control *cc,
>   					unsigned long start, unsigned long end)
>   {
>   	/* This function is based on compact_zone() from compaction.c. */
> @@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>   	return 0;
>   }
>   
> +/**
> + * split_free_page() -- split a free page at split_pfn_offset
> + * @free_page:		the original free page
> + * @order:		the order of the page
> + * @split_pfn_offset:	split offset within the page
> + *
> + * It is used when the free page crosses two pageblocks with different migratetypes
> + * at split_pfn_offset within the page. The split free page will be put into
> + * separate migratetype lists afterwards. Otherwise, the function achieves
> + * nothing.
> + */
> +static inline void split_free_page(struct page *free_page,
> +				int order, unsigned long split_pfn_offset)
> +{
> +	struct zone *zone = page_zone(free_page);
> +	unsigned long free_page_pfn = page_to_pfn(free_page);
> +	unsigned long pfn;
> +	unsigned long flags;
> +	int free_page_order;
> +
> +	spin_lock_irqsave(&zone->lock, flags);
> +	del_page_from_free_list(free_page, zone, order);
> +	for (pfn = free_page_pfn;
> +	     pfn < free_page_pfn + (1UL << order);) {
> +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
> +
> +		free_page_order = order_base_2(split_pfn_offset);
> +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
> +				mt, FPI_NONE);
> +		pfn += 1UL << free_page_order;
> +		split_pfn_offset -= (1UL << free_page_order);
> +		/* we have done the first part, now switch to second part */
> +		if (split_pfn_offset == 0)
> +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
> +	}
> +	spin_unlock_irqrestore(&zone->lock, flags);
> +}
> +
> +/**
> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
> + * within a free or in-use page.
> + * @boundary_pfn:		pageblock-aligned pfn that a page might cross
> + * @gfp_flags:			GFP flags used for migrating pages
> + * @isolate_before_boundary:	isolate the pageblock before (1) or after (0)
> + *				the boundary_pfn
> + *
> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
> + * pageblock. When not all pageblocks within a page are isolated at the same
> + * time, free page accounting can go wrong. For example, in the case of
> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
> + * [        MAX_ORDER-1          ]
> + * [  pageblock0  |  pageblock1  ]
> + * When either pageblock is isolated, if it is a free page, the page is not
> + * split into separate migratetype lists, which is supposed to; if it is an
> + * in-use page and freed later, __free_one_page() does not split the free page
> + * either. The function handles this by splitting the free page or migrating
> + * the in-use page then splitting the free page.
> + */
> +int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
> +			int isolate_before_boundary)

Do you need such big param name ?

See 
https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#naming

isolate_before_boundary could probably be shorter.

And should be a bool by the way.

> +{
> +	unsigned char saved_mt;
> +	/*
> +	 * scan at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) aligned range to
> +	 * avoid isolate pageblocks belonging to a bigger free or in-use page
> +	 */
> +	unsigned long start_pfn = pfn_max_align_down(boundary_pfn);
> +	unsigned long isolated_pageblock_pfn;

Variable name too long.

> +	unsigned long pfn;
> +
> +	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
> +
> +	if (isolate_before_boundary)
> +		isolated_pageblock_pfn = boundary_pfn - pageblock_nr_pages;
> +	else
> +		isolated_pageblock_pfn = boundary_pfn;
> +
> +	saved_mt = get_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn));
> +	set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn), MIGRATE_ISOLATE);
> +
> +	for (pfn = start_pfn; pfn < boundary_pfn;) {

This loop is a bit long a deep. Isn't there a way to put what's in "if 
(PageHuge(page) || PageTransCompound(page))" into a sub-function ?

See 
https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#functions

> +		struct page *page = pfn_to_page(pfn);
> +
> +		/*
> +		 * start_pfn is max(MAX_ORDER_NR_PAGES, pageblock_nr_pages)
> +		 * aligned, if there is any free pages in [start_pfn, boundary_pfn),
> +		 * its head page will always be in the range.
> +		 */
> +		if (PageBuddy(page)) {
> +			int order = buddy_order(page);
> +
> +			if (pfn + (1UL << order) > boundary_pfn)
> +				split_free_page(page, order, boundary_pfn - pfn);
> +			pfn += (1UL << order);
> +			continue;
> +		}
> +		/*
> +		 * migrate compound pages then let the free page handling code
> +		 * above do the rest
> +		 */
> +		if (PageHuge(page) || PageTransCompound(page)) {
> +			unsigned long nr_pages = compound_nr(page);
> +			int order = compound_order(page);
> +			struct page *head = compound_head(page);
> +			unsigned long head_pfn = page_to_pfn(head);
> +
> +			if (head_pfn + nr_pages >= boundary_pfn) {
> +				int ret;
> +				struct compact_control cc = {
> +					.nr_migratepages = 0,
> +					.order = -1,
> +					.zone = page_zone(pfn_to_page(head_pfn)),
> +					.mode = MIGRATE_SYNC,
> +					.ignore_skip_hint = true,
> +					.no_set_skip_hint = true,
> +					.gfp_mask = current_gfp_context(gfp_flags),
> +					.alloc_contig = true,
> +				};
> +
> +				INIT_LIST_HEAD(&cc.migratepages);
> +
> +				ret = __alloc_contig_migrate_range(&cc, head_pfn,
> +							head_pfn + nr_pages);
> +
> +				if (ret) {
> +					/* restore the original migratetype */
> +					set_pageblock_migratetype(
> +						pfn_to_page(isolated_pageblock_pfn),
> +						saved_mt);
> +					return -EBUSY;
> +				}
> +				/*
> +				 * reset pfn, let the free page handling code
> +				 * above split the free page to the right
> +				 * migratetype list.
> +				 *
> +				 * head_pfn is not used here as a hugetlb page
> +				 * order can be bigger than MAX_ORDER-1, but
> +				 * after it is freed, the free page order is not.
> +				 * Use pfn within the range to find the head of
> +				 * the free page and reset order to 0 if a hugetlb
> +				 * page with >MAX_ORDER-1 order is encountered.
> +				 */
> +				if (order > MAX_ORDER-1)
> +					order = 0;
> +				while (!PageBuddy(pfn_to_page(pfn))) {
> +					order++;
> +					pfn &= ~0UL << order;
> +				}
> +				continue;
> +			}
> +			pfn += nr_pages;
> +			continue;
> +		}
> +
> +		pfn++;
> +	}
> +	return 0;
> +}
> +
> +
>   /**
>    * alloc_contig_range() -- tries to allocate given range of pages
>    * @start:	start PFN to allocate
> @@ -9067,8 +9228,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>   int alloc_contig_range(unsigned long start, unsigned long end,
>   		       unsigned migratetype, gfp_t gfp_mask)
>   {
> -	unsigned long outer_start, outer_end;
> -	unsigned int order;
> +	unsigned long outer_end;
> +	unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
> +	unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
>   	int ret = 0;
>   
>   	struct compact_control cc = {
> @@ -9087,14 +9249,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>   	 * What we do here is we mark all pageblocks in range as
>   	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
>   	 * have different sizes, and due to the way page allocator
> -	 * work, we align the range to biggest of the two pages so
> -	 * that page allocator won't try to merge buddies from
> -	 * different pageblocks and change MIGRATE_ISOLATE to some
> -	 * other migration type.
> +	 * work, start_isolate_page_range() has special handlings for this.
>   	 *
>   	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>   	 * migrate the pages from an unaligned range (ie. pages that
> -	 * we are interested in).  This will put all the pages in
> +	 * we are interested in). This will put all the pages in
>   	 * range back to page allocator as MIGRATE_ISOLATE.
>   	 *
>   	 * When this is done, we take the pages in range from page
> @@ -9107,9 +9266,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>   	 * put back to page allocator so that buddy can use them.
>   	 */
>   
> -	ret = start_isolate_page_range(start, end, migratetype, 0);
> +	ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
>   	if (ret)
> -		return ret;
> +		goto done;
>   
>   	drain_all_pages(cc.zone);
>   
> @@ -9128,68 +9287,28 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>   		goto done;
>   	ret = 0;
>   
> -	/*
> -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
> -	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
> -	 * more, all pages in [start, end) are free in page allocator.
> -	 * What we are going to do is to allocate all pages from
> -	 * [start, end) (that is remove them from page allocator).
> -	 *
> -	 * The only problem is that pages at the beginning and at the
> -	 * end of interesting range may be not aligned with pages that
> -	 * page allocator holds, ie. they can be part of higher order
> -	 * pages.  Because of this, we reserve the bigger range and
> -	 * once this is done free the pages we are not interested in.
> -	 *
> -	 * We don't have to hold zone->lock here because the pages are
> -	 * isolated thus they won't get removed from buddy.
> -	 */
> -
> -	order = 0;
> -	outer_start = start;
> -	while (!PageBuddy(pfn_to_page(outer_start))) {
> -		if (++order >= MAX_ORDER) {
> -			outer_start = start;
> -			break;
> -		}
> -		outer_start &= ~0UL << order;
> -	}
> -
> -	if (outer_start != start) {
> -		order = buddy_order(pfn_to_page(outer_start));
> -
> -		/*
> -		 * outer_start page could be small order buddy page and
> -		 * it doesn't include start page. Adjust outer_start
> -		 * in this case to report failed page properly
> -		 * on tracepoint in test_pages_isolated()
> -		 */
> -		if (outer_start + (1UL << order) <= start)
> -			outer_start = start;
> -	}
> -
>   	/* Make sure the range is really isolated. */
> -	if (test_pages_isolated(outer_start, end, 0)) {
> +	if (test_pages_isolated(alloc_start, alloc_end, 0)) {
>   		ret = -EBUSY;
>   		goto done;
>   	}
>   
>   	/* Grab isolated pages from freelists. */
> -	outer_end = isolate_freepages_range(&cc, outer_start, end);
> +	outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
>   	if (!outer_end) {
>   		ret = -EBUSY;
>   		goto done;
>   	}
>   
>   	/* Free head and tail (if any) */
> -	if (start != outer_start)
> -		free_contig_range(outer_start, start - outer_start);
> -	if (end != outer_end)
> -		free_contig_range(end, outer_end - end);
> +	if (start != alloc_start)
> +		free_contig_range(alloc_start, start - alloc_start);
> +	if (end != alloc_end)
> +		free_contig_range(end, alloc_end - end);
>   
>   done:
> -	undo_isolate_page_range(pfn_max_align_down(start),
> -				pfn_max_align_up(end), migratetype);
> +	undo_isolate_page_range(alloc_start,
> +				alloc_end, migratetype);
>   	return ret;
>   }
>   EXPORT_SYMBOL(alloc_contig_range);
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index 64d093ab83ec..0256d5e1032c 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -285,6 +285,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>    *					 and PageOffline() pages.
>    *			REPORT_FAILURE - report details about the failure to
>    *			isolate the range
> + * @gfp_flags:		GFP flags used for migrating pages that sit across the
> + *			range boundaries.
>    *
>    * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>    * the range will never be allocated. Any free pages and pages freed in the
> @@ -293,6 +295,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>    * pages in the range finally, the caller have to free all pages in the range.
>    * test_page_isolated() can be used for test it.
>    *
> + * The function first tries to isolate the pageblocks at the beginning and end
> + * of the range, since there might be pages across the range boundaries.
> + * Afterwards, it isolates the rest of the range.
> + *
>    * There is no high level synchronization mechanism that prevents two threads
>    * from trying to isolate overlapping ranges. If this happens, one thread
>    * will notice pageblocks in the overlapping range already set to isolate.
> @@ -313,21 +319,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>    * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>    */
>   int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -			     unsigned migratetype, int flags)
> +			     unsigned migratetype, int flags, gfp_t gfp_flags)
>   {
>   	unsigned long pfn;
>   	struct page *page;
> +	/* isolation is done at page block granularity */
> +	unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
> +	unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
> +	int ret;
>   
> -	unsigned long isolate_start = pfn_max_align_down(start_pfn);
> -	unsigned long isolate_end = pfn_max_align_up(end_pfn);
> +	/* isolate [isolate_start, isolate_start + pageblock_nr_pages] pageblock */
> +	ret = isolate_single_pageblock(isolate_start, gfp_flags, 0);
> +	if (ret)
> +		return ret;
> +
> +	/* isolate [isolate_end - pageblock_nr_pages, isolate_end] pageblock */
> +	ret = isolate_single_pageblock(isolate_end, gfp_flags, 1);
> +	if (ret) {
> +		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
> +		return ret;
> +	}
>   
> -	for (pfn = isolate_start;
> -	     pfn < isolate_end;
> +	/* skip isolated pageblocks at the beginning and end */
> +	for (pfn = isolate_start + pageblock_nr_pages;
> +	     pfn < isolate_end - pageblock_nr_pages;
>   	     pfn += pageblock_nr_pages) {
>   		page = __first_valid_page(pfn, pageblock_nr_pages);
>   		if (page && set_migratetype_isolate(page, migratetype, flags,
>   					start_pfn, end_pfn)) {
>   			undo_isolate_page_range(isolate_start, pfn, migratetype);
> +			unset_migratetype_isolate(
> +				pfn_to_page(isolate_end - pageblock_nr_pages),
> +				migratetype);
>   			return -EBUSY;
>   		}
>   	}
Zi Yan Feb. 14, 2022, 3:46 p.m. UTC | #3
On 14 Feb 2022, at 2:26, Christoph Hellwig wrote:

>> +int
>> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary);
>
> Please avoid the completely unreadably long line. i.e.
>
> int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
> 		int isolate_before_boundary);
>
> Same in various other spots.

OK. Thanks for pointing it out. checkpatch.pl did not report any
warning about this. It seems that the column limit has been relaxed
to 100. Anyway, I will make it shorter.

--
Best Regards,
Yan, Zi
Zi Yan Feb. 14, 2022, 4:03 p.m. UTC | #4
On 14 Feb 2022, at 2:59, Christophe Leroy wrote:

> Le 11/02/2022 à 17:41, Zi Yan a écrit :
>> From: Zi Yan <ziy@nvidia.com>
>>
>> alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
>> pageblocks with different migratetypes. It might unnecessarily convert
>> extra pageblocks at the beginning and at the end of the range. Change
>> alloc_contig_range() to work at pageblock granularity.
>>
>> Special handling is needed for free pages and in-use pages across the
>> boundaries of the range specified alloc_contig_range(). Because these
>> partially isolated pages causes free page accounting issues. The free
>> pages will be split and freed into separate migratetype lists; the
>> in-use pages will be migrated then the freed pages will be handled.
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>   include/linux/page-isolation.h |   2 +-
>>   mm/internal.h                  |   3 +
>>   mm/memory_hotplug.c            |   3 +-
>>   mm/page_alloc.c                | 235 +++++++++++++++++++++++++--------
>>   mm/page_isolation.c            |  33 ++++-
>>   5 files changed, 211 insertions(+), 65 deletions(-)
>>
>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>> index 4ef7be6def83..78ff940cc169 100644
>> --- a/include/linux/page-isolation.h
>> +++ b/include/linux/page-isolation.h
>> @@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>>    */
>>   int
>>   start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			 unsigned migratetype, int flags);
>> +			 unsigned migratetype, int flags, gfp_t gfp_flags);
>>
>>   /*
>>    * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 0d240e876831..509cbdc25992 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc,
>>   int
>>   isolate_migratepages_range(struct compact_control *cc,
>>   			   unsigned long low_pfn, unsigned long end_pfn);
>> +
>> +int
>> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary);
>>   #endif
>>   int find_suitable_fallback(struct free_area *area, unsigned int order,
>>   			int migratetype, bool only_stealable, bool *can_steal);
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index ce68098832aa..82406d2f3e46 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>   	/* set above range as isolated */
>>   	ret = start_isolate_page_range(start_pfn, end_pfn,
>>   				       MIGRATE_MOVABLE,
>> -				       MEMORY_OFFLINE | REPORT_FAILURE);
>> +				       MEMORY_OFFLINE | REPORT_FAILURE,
>> +				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>>   	if (ret) {
>>   		reason = "failure to isolate range";
>>   		goto failed_removal_pcplists_disabled;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 62ef78f3d771..7a4fa21aea5c 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>>   #endif
>>
>>   /* [start, end) must belong to a single zone. */
>> -static int __alloc_contig_migrate_range(struct compact_control *cc,
>> +int __alloc_contig_migrate_range(struct compact_control *cc,
>>   					unsigned long start, unsigned long end)
>>   {
>>   	/* This function is based on compact_zone() from compaction.c. */
>> @@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>>   	return 0;
>>   }
>>
>> +/**
>> + * split_free_page() -- split a free page at split_pfn_offset
>> + * @free_page:		the original free page
>> + * @order:		the order of the page
>> + * @split_pfn_offset:	split offset within the page
>> + *
>> + * It is used when the free page crosses two pageblocks with different migratetypes
>> + * at split_pfn_offset within the page. The split free page will be put into
>> + * separate migratetype lists afterwards. Otherwise, the function achieves
>> + * nothing.
>> + */
>> +static inline void split_free_page(struct page *free_page,
>> +				int order, unsigned long split_pfn_offset)
>> +{
>> +	struct zone *zone = page_zone(free_page);
>> +	unsigned long free_page_pfn = page_to_pfn(free_page);
>> +	unsigned long pfn;
>> +	unsigned long flags;
>> +	int free_page_order;
>> +
>> +	spin_lock_irqsave(&zone->lock, flags);
>> +	del_page_from_free_list(free_page, zone, order);
>> +	for (pfn = free_page_pfn;
>> +	     pfn < free_page_pfn + (1UL << order);) {
>> +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
>> +
>> +		free_page_order = order_base_2(split_pfn_offset);
>> +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
>> +				mt, FPI_NONE);
>> +		pfn += 1UL << free_page_order;
>> +		split_pfn_offset -= (1UL << free_page_order);
>> +		/* we have done the first part, now switch to second part */
>> +		if (split_pfn_offset == 0)
>> +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
>> +	}
>> +	spin_unlock_irqrestore(&zone->lock, flags);
>> +}
>> +
>> +/**
>> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
>> + * within a free or in-use page.
>> + * @boundary_pfn:		pageblock-aligned pfn that a page might cross
>> + * @gfp_flags:			GFP flags used for migrating pages
>> + * @isolate_before_boundary:	isolate the pageblock before (1) or after (0)
>> + *				the boundary_pfn
>> + *
>> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
>> + * pageblock. When not all pageblocks within a page are isolated at the same
>> + * time, free page accounting can go wrong. For example, in the case of
>> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
>> + * [        MAX_ORDER-1          ]
>> + * [  pageblock0  |  pageblock1  ]
>> + * When either pageblock is isolated, if it is a free page, the page is not
>> + * split into separate migratetype lists, which is supposed to; if it is an
>> + * in-use page and freed later, __free_one_page() does not split the free page
>> + * either. The function handles this by splitting the free page or migrating
>> + * the in-use page then splitting the free page.
>> + */
>> +int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
>> +			int isolate_before_boundary)
>
> Do you need such big param name ?

I am happy to take any suggestion.

>
> See
> https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#naming
>
> isolate_before_boundary could probably be shorter.

isolate_before instead?

>
> And should be a bool by the way.

Sure.
>
>> +{
>> +	unsigned char saved_mt;
>> +	/*
>> +	 * scan at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) aligned range to
>> +	 * avoid isolate pageblocks belonging to a bigger free or in-use page
>> +	 */
>> +	unsigned long start_pfn = pfn_max_align_down(boundary_pfn);
>> +	unsigned long isolated_pageblock_pfn;
>
> Variable name too long.
>
>> +	unsigned long pfn;
>> +
>> +	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
>> +
>> +	if (isolate_before_boundary)
>> +		isolated_pageblock_pfn = boundary_pfn - pageblock_nr_pages;
>> +	else
>> +		isolated_pageblock_pfn = boundary_pfn;
>> +
>> +	saved_mt = get_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn));
>> +	set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn), MIGRATE_ISOLATE);
>> +
>> +	for (pfn = start_pfn; pfn < boundary_pfn;) {
>
> This loop is a bit long a deep. Isn't there a way to put what's in "if
> (PageHuge(page) || PageTransCompound(page))" into a sub-function ?
>

Let me give it a try.

> See
> https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#functions
>

Thanks for the review.


>> +		struct page *page = pfn_to_page(pfn);
>> +
>> +		/*
>> +		 * start_pfn is max(MAX_ORDER_NR_PAGES, pageblock_nr_pages)
>> +		 * aligned, if there is any free pages in [start_pfn, boundary_pfn),
>> +		 * its head page will always be in the range.
>> +		 */
>> +		if (PageBuddy(page)) {
>> +			int order = buddy_order(page);
>> +
>> +			if (pfn + (1UL << order) > boundary_pfn)
>> +				split_free_page(page, order, boundary_pfn - pfn);
>> +			pfn += (1UL << order);
>> +			continue;
>> +		}
>> +		/*
>> +		 * migrate compound pages then let the free page handling code
>> +		 * above do the rest
>> +		 */
>> +		if (PageHuge(page) || PageTransCompound(page)) {
>> +			unsigned long nr_pages = compound_nr(page);
>> +			int order = compound_order(page);
>> +			struct page *head = compound_head(page);
>> +			unsigned long head_pfn = page_to_pfn(head);
>> +
>> +			if (head_pfn + nr_pages >= boundary_pfn) {
>> +				int ret;
>> +				struct compact_control cc = {
>> +					.nr_migratepages = 0,
>> +					.order = -1,
>> +					.zone = page_zone(pfn_to_page(head_pfn)),
>> +					.mode = MIGRATE_SYNC,
>> +					.ignore_skip_hint = true,
>> +					.no_set_skip_hint = true,
>> +					.gfp_mask = current_gfp_context(gfp_flags),
>> +					.alloc_contig = true,
>> +				};
>> +
>> +				INIT_LIST_HEAD(&cc.migratepages);
>> +
>> +				ret = __alloc_contig_migrate_range(&cc, head_pfn,
>> +							head_pfn + nr_pages);
>> +
>> +				if (ret) {
>> +					/* restore the original migratetype */
>> +					set_pageblock_migratetype(
>> +						pfn_to_page(isolated_pageblock_pfn),
>> +						saved_mt);
>> +					return -EBUSY;
>> +				}
>> +				/*
>> +				 * reset pfn, let the free page handling code
>> +				 * above split the free page to the right
>> +				 * migratetype list.
>> +				 *
>> +				 * head_pfn is not used here as a hugetlb page
>> +				 * order can be bigger than MAX_ORDER-1, but
>> +				 * after it is freed, the free page order is not.
>> +				 * Use pfn within the range to find the head of
>> +				 * the free page and reset order to 0 if a hugetlb
>> +				 * page with >MAX_ORDER-1 order is encountered.
>> +				 */
>> +				if (order > MAX_ORDER-1)
>> +					order = 0;
>> +				while (!PageBuddy(pfn_to_page(pfn))) {
>> +					order++;
>> +					pfn &= ~0UL << order;
>> +				}
>> +				continue;
>> +			}
>> +			pfn += nr_pages;
>> +			continue;
>> +		}
>> +
>> +		pfn++;
>> +	}
>> +	return 0;
>> +}
>> +
>> +
>>   /**
>>    * alloc_contig_range() -- tries to allocate given range of pages
>>    * @start:	start PFN to allocate
>> @@ -9067,8 +9228,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>>   int alloc_contig_range(unsigned long start, unsigned long end,
>>   		       unsigned migratetype, gfp_t gfp_mask)
>>   {
>> -	unsigned long outer_start, outer_end;
>> -	unsigned int order;
>> +	unsigned long outer_end;
>> +	unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
>> +	unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
>>   	int ret = 0;
>>
>>   	struct compact_control cc = {
>> @@ -9087,14 +9249,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   	 * What we do here is we mark all pageblocks in range as
>>   	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
>>   	 * have different sizes, and due to the way page allocator
>> -	 * work, we align the range to biggest of the two pages so
>> -	 * that page allocator won't try to merge buddies from
>> -	 * different pageblocks and change MIGRATE_ISOLATE to some
>> -	 * other migration type.
>> +	 * work, start_isolate_page_range() has special handlings for this.
>>   	 *
>>   	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>>   	 * migrate the pages from an unaligned range (ie. pages that
>> -	 * we are interested in).  This will put all the pages in
>> +	 * we are interested in). This will put all the pages in
>>   	 * range back to page allocator as MIGRATE_ISOLATE.
>>   	 *
>>   	 * When this is done, we take the pages in range from page
>> @@ -9107,9 +9266,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   	 * put back to page allocator so that buddy can use them.
>>   	 */
>>
>> -	ret = start_isolate_page_range(start, end, migratetype, 0);
>> +	ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
>>   	if (ret)
>> -		return ret;
>> +		goto done;
>>
>>   	drain_all_pages(cc.zone);
>>
>> @@ -9128,68 +9287,28 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   		goto done;
>>   	ret = 0;
>>
>> -	/*
>> -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
>> -	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
>> -	 * more, all pages in [start, end) are free in page allocator.
>> -	 * What we are going to do is to allocate all pages from
>> -	 * [start, end) (that is remove them from page allocator).
>> -	 *
>> -	 * The only problem is that pages at the beginning and at the
>> -	 * end of interesting range may be not aligned with pages that
>> -	 * page allocator holds, ie. they can be part of higher order
>> -	 * pages.  Because of this, we reserve the bigger range and
>> -	 * once this is done free the pages we are not interested in.
>> -	 *
>> -	 * We don't have to hold zone->lock here because the pages are
>> -	 * isolated thus they won't get removed from buddy.
>> -	 */
>> -
>> -	order = 0;
>> -	outer_start = start;
>> -	while (!PageBuddy(pfn_to_page(outer_start))) {
>> -		if (++order >= MAX_ORDER) {
>> -			outer_start = start;
>> -			break;
>> -		}
>> -		outer_start &= ~0UL << order;
>> -	}
>> -
>> -	if (outer_start != start) {
>> -		order = buddy_order(pfn_to_page(outer_start));
>> -
>> -		/*
>> -		 * outer_start page could be small order buddy page and
>> -		 * it doesn't include start page. Adjust outer_start
>> -		 * in this case to report failed page properly
>> -		 * on tracepoint in test_pages_isolated()
>> -		 */
>> -		if (outer_start + (1UL << order) <= start)
>> -			outer_start = start;
>> -	}
>> -
>>   	/* Make sure the range is really isolated. */
>> -	if (test_pages_isolated(outer_start, end, 0)) {
>> +	if (test_pages_isolated(alloc_start, alloc_end, 0)) {
>>   		ret = -EBUSY;
>>   		goto done;
>>   	}
>>
>>   	/* Grab isolated pages from freelists. */
>> -	outer_end = isolate_freepages_range(&cc, outer_start, end);
>> +	outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
>>   	if (!outer_end) {
>>   		ret = -EBUSY;
>>   		goto done;
>>   	}
>>
>>   	/* Free head and tail (if any) */
>> -	if (start != outer_start)
>> -		free_contig_range(outer_start, start - outer_start);
>> -	if (end != outer_end)
>> -		free_contig_range(end, outer_end - end);
>> +	if (start != alloc_start)
>> +		free_contig_range(alloc_start, start - alloc_start);
>> +	if (end != alloc_end)
>> +		free_contig_range(end, alloc_end - end);
>>
>>   done:
>> -	undo_isolate_page_range(pfn_max_align_down(start),
>> -				pfn_max_align_up(end), migratetype);
>> +	undo_isolate_page_range(alloc_start,
>> +				alloc_end, migratetype);
>>   	return ret;
>>   }
>>   EXPORT_SYMBOL(alloc_contig_range);
>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>> index 64d093ab83ec..0256d5e1032c 100644
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -285,6 +285,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    *					 and PageOffline() pages.
>>    *			REPORT_FAILURE - report details about the failure to
>>    *			isolate the range
>> + * @gfp_flags:		GFP flags used for migrating pages that sit across the
>> + *			range boundaries.
>>    *
>>    * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>>    * the range will never be allocated. Any free pages and pages freed in the
>> @@ -293,6 +295,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    * pages in the range finally, the caller have to free all pages in the range.
>>    * test_page_isolated() can be used for test it.
>>    *
>> + * The function first tries to isolate the pageblocks at the beginning and end
>> + * of the range, since there might be pages across the range boundaries.
>> + * Afterwards, it isolates the rest of the range.
>> + *
>>    * There is no high level synchronization mechanism that prevents two threads
>>    * from trying to isolate overlapping ranges. If this happens, one thread
>>    * will notice pageblocks in the overlapping range already set to isolate.
>> @@ -313,21 +319,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>>    */
>>   int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			     unsigned migratetype, int flags)
>> +			     unsigned migratetype, int flags, gfp_t gfp_flags)
>>   {
>>   	unsigned long pfn;
>>   	struct page *page;
>> +	/* isolation is done at page block granularity */
>> +	unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
>> +	unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
>> +	int ret;
>>
>> -	unsigned long isolate_start = pfn_max_align_down(start_pfn);
>> -	unsigned long isolate_end = pfn_max_align_up(end_pfn);
>> +	/* isolate [isolate_start, isolate_start + pageblock_nr_pages] pageblock */
>> +	ret = isolate_single_pageblock(isolate_start, gfp_flags, 0);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* isolate [isolate_end - pageblock_nr_pages, isolate_end] pageblock */
>> +	ret = isolate_single_pageblock(isolate_end, gfp_flags, 1);
>> +	if (ret) {
>> +		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
>> +		return ret;
>> +	}
>>
>> -	for (pfn = isolate_start;
>> -	     pfn < isolate_end;
>> +	/* skip isolated pageblocks at the beginning and end */
>> +	for (pfn = isolate_start + pageblock_nr_pages;
>> +	     pfn < isolate_end - pageblock_nr_pages;
>>   	     pfn += pageblock_nr_pages) {
>>   		page = __first_valid_page(pfn, pageblock_nr_pages);
>>   		if (page && set_migratetype_isolate(page, migratetype, flags,
>>   					start_pfn, end_pfn)) {
>>   			undo_isolate_page_range(isolate_start, pfn, migratetype);
>> +			unset_migratetype_isolate(
>> +				pfn_to_page(isolate_end - pageblock_nr_pages),
>> +				migratetype);
>>   			return -EBUSY;
>>   		}
>>   	}

--
Best Regards,
Yan, Zi
diff mbox series

Patch

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 4ef7be6def83..78ff940cc169 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -54,7 +54,7 @@  int move_freepages_block(struct zone *zone, struct page *page,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			 unsigned migratetype, int flags);
+			 unsigned migratetype, int flags, gfp_t gfp_flags);
 
 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
diff --git a/mm/internal.h b/mm/internal.h
index 0d240e876831..509cbdc25992 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -319,6 +319,9 @@  isolate_freepages_range(struct compact_control *cc,
 int
 isolate_migratepages_range(struct compact_control *cc,
 			   unsigned long low_pfn, unsigned long end_pfn);
+
+int
+isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary);
 #endif
 int find_suitable_fallback(struct free_area *area, unsigned int order,
 			int migratetype, bool only_stealable, bool *can_steal);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce68098832aa..82406d2f3e46 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1863,7 +1863,8 @@  int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE,
-				       MEMORY_OFFLINE | REPORT_FAILURE);
+				       MEMORY_OFFLINE | REPORT_FAILURE,
+				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
 	if (ret) {
 		reason = "failure to isolate range";
 		goto failed_removal_pcplists_disabled;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62ef78f3d771..7a4fa21aea5c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8985,7 +8985,7 @@  static inline void alloc_contig_dump_pages(struct list_head *page_list)
 #endif
 
 /* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
 					unsigned long start, unsigned long end)
 {
 	/* This function is based on compact_zone() from compaction.c. */
@@ -9043,6 +9043,167 @@  static int __alloc_contig_migrate_range(struct compact_control *cc,
 	return 0;
 }
 
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page:		the original free page
+ * @order:		the order of the page
+ * @split_pfn_offset:	split offset within the page
+ *
+ * It is used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+static inline void split_free_page(struct page *free_page,
+				int order, unsigned long split_pfn_offset)
+{
+	struct zone *zone = page_zone(free_page);
+	unsigned long free_page_pfn = page_to_pfn(free_page);
+	unsigned long pfn;
+	unsigned long flags;
+	int free_page_order;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	del_page_from_free_list(free_page, zone, order);
+	for (pfn = free_page_pfn;
+	     pfn < free_page_pfn + (1UL << order);) {
+		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+		free_page_order = order_base_2(split_pfn_offset);
+		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+				mt, FPI_NONE);
+		pfn += 1UL << free_page_order;
+		split_pfn_offset -= (1UL << free_page_order);
+		/* we have done the first part, now switch to second part */
+		if (split_pfn_offset == 0)
+			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn:		pageblock-aligned pfn that a page might cross
+ * @gfp_flags:			GFP flags used for migrating pages
+ * @isolate_before_boundary:	isolate the pageblock before (1) or after (0)
+ *				the boundary_pfn
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
+ * [        MAX_ORDER-1          ]
+ * [  pageblock0  |  pageblock1  ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists, which is supposed to; if it is an
+ * in-use page and freed later, __free_one_page() does not split the free page
+ * either. The function handles this by splitting the free page or migrating
+ * the in-use page then splitting the free page.
+ */
+int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
+			int isolate_before_boundary)
+{
+	unsigned char saved_mt;
+	/*
+	 * scan at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) aligned range to
+	 * avoid isolate pageblocks belonging to a bigger free or in-use page
+	 */
+	unsigned long start_pfn = pfn_max_align_down(boundary_pfn);
+	unsigned long isolated_pageblock_pfn;
+	unsigned long pfn;
+
+	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+
+	if (isolate_before_boundary)
+		isolated_pageblock_pfn = boundary_pfn - pageblock_nr_pages;
+	else
+		isolated_pageblock_pfn = boundary_pfn;
+
+	saved_mt = get_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn));
+	set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn), MIGRATE_ISOLATE);
+
+	for (pfn = start_pfn; pfn < boundary_pfn;) {
+		struct page *page = pfn_to_page(pfn);
+
+		/*
+		 * start_pfn is max(MAX_ORDER_NR_PAGES, pageblock_nr_pages)
+		 * aligned, if there is any free pages in [start_pfn, boundary_pfn),
+		 * its head page will always be in the range.
+		 */
+		if (PageBuddy(page)) {
+			int order = buddy_order(page);
+
+			if (pfn + (1UL << order) > boundary_pfn)
+				split_free_page(page, order, boundary_pfn - pfn);
+			pfn += (1UL << order);
+			continue;
+		}
+		/*
+		 * migrate compound pages then let the free page handling code
+		 * above do the rest
+		 */
+		if (PageHuge(page) || PageTransCompound(page)) {
+			unsigned long nr_pages = compound_nr(page);
+			int order = compound_order(page);
+			struct page *head = compound_head(page);
+			unsigned long head_pfn = page_to_pfn(head);
+
+			if (head_pfn + nr_pages >= boundary_pfn) {
+				int ret;
+				struct compact_control cc = {
+					.nr_migratepages = 0,
+					.order = -1,
+					.zone = page_zone(pfn_to_page(head_pfn)),
+					.mode = MIGRATE_SYNC,
+					.ignore_skip_hint = true,
+					.no_set_skip_hint = true,
+					.gfp_mask = current_gfp_context(gfp_flags),
+					.alloc_contig = true,
+				};
+
+				INIT_LIST_HEAD(&cc.migratepages);
+
+				ret = __alloc_contig_migrate_range(&cc, head_pfn,
+							head_pfn + nr_pages);
+
+				if (ret) {
+					/* restore the original migratetype */
+					set_pageblock_migratetype(
+						pfn_to_page(isolated_pageblock_pfn),
+						saved_mt);
+					return -EBUSY;
+				}
+				/*
+				 * reset pfn, let the free page handling code
+				 * above split the free page to the right
+				 * migratetype list.
+				 *
+				 * head_pfn is not used here as a hugetlb page
+				 * order can be bigger than MAX_ORDER-1, but
+				 * after it is freed, the free page order is not.
+				 * Use pfn within the range to find the head of
+				 * the free page and reset order to 0 if a hugetlb
+				 * page with >MAX_ORDER-1 order is encountered.
+				 */
+				if (order > MAX_ORDER-1)
+					order = 0;
+				while (!PageBuddy(pfn_to_page(pfn))) {
+					order++;
+					pfn &= ~0UL << order;
+				}
+				continue;
+			}
+			pfn += nr_pages;
+			continue;
+		}
+
+		pfn++;
+	}
+	return 0;
+}
+
+
 /**
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:	start PFN to allocate
@@ -9067,8 +9228,9 @@  static int __alloc_contig_migrate_range(struct compact_control *cc,
 int alloc_contig_range(unsigned long start, unsigned long end,
 		       unsigned migratetype, gfp_t gfp_mask)
 {
-	unsigned long outer_start, outer_end;
-	unsigned int order;
+	unsigned long outer_end;
+	unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
+	unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
 	int ret = 0;
 
 	struct compact_control cc = {
@@ -9087,14 +9249,11 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 	 * What we do here is we mark all pageblocks in range as
 	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
 	 * have different sizes, and due to the way page allocator
-	 * work, we align the range to biggest of the two pages so
-	 * that page allocator won't try to merge buddies from
-	 * different pageblocks and change MIGRATE_ISOLATE to some
-	 * other migration type.
+	 * work, start_isolate_page_range() has special handlings for this.
 	 *
 	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
 	 * migrate the pages from an unaligned range (ie. pages that
-	 * we are interested in).  This will put all the pages in
+	 * we are interested in). This will put all the pages in
 	 * range back to page allocator as MIGRATE_ISOLATE.
 	 *
 	 * When this is done, we take the pages in range from page
@@ -9107,9 +9266,9 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 	 * put back to page allocator so that buddy can use them.
 	 */
 
-	ret = start_isolate_page_range(start, end, migratetype, 0);
+	ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
 	if (ret)
-		return ret;
+		goto done;
 
 	drain_all_pages(cc.zone);
 
@@ -9128,68 +9287,28 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 		goto done;
 	ret = 0;
 
-	/*
-	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
-	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
-	 * more, all pages in [start, end) are free in page allocator.
-	 * What we are going to do is to allocate all pages from
-	 * [start, end) (that is remove them from page allocator).
-	 *
-	 * The only problem is that pages at the beginning and at the
-	 * end of interesting range may be not aligned with pages that
-	 * page allocator holds, ie. they can be part of higher order
-	 * pages.  Because of this, we reserve the bigger range and
-	 * once this is done free the pages we are not interested in.
-	 *
-	 * We don't have to hold zone->lock here because the pages are
-	 * isolated thus they won't get removed from buddy.
-	 */
-
-	order = 0;
-	outer_start = start;
-	while (!PageBuddy(pfn_to_page(outer_start))) {
-		if (++order >= MAX_ORDER) {
-			outer_start = start;
-			break;
-		}
-		outer_start &= ~0UL << order;
-	}
-
-	if (outer_start != start) {
-		order = buddy_order(pfn_to_page(outer_start));
-
-		/*
-		 * outer_start page could be small order buddy page and
-		 * it doesn't include start page. Adjust outer_start
-		 * in this case to report failed page properly
-		 * on tracepoint in test_pages_isolated()
-		 */
-		if (outer_start + (1UL << order) <= start)
-			outer_start = start;
-	}
-
 	/* Make sure the range is really isolated. */
-	if (test_pages_isolated(outer_start, end, 0)) {
+	if (test_pages_isolated(alloc_start, alloc_end, 0)) {
 		ret = -EBUSY;
 		goto done;
 	}
 
 	/* Grab isolated pages from freelists. */
-	outer_end = isolate_freepages_range(&cc, outer_start, end);
+	outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
 	if (!outer_end) {
 		ret = -EBUSY;
 		goto done;
 	}
 
 	/* Free head and tail (if any) */
-	if (start != outer_start)
-		free_contig_range(outer_start, start - outer_start);
-	if (end != outer_end)
-		free_contig_range(end, outer_end - end);
+	if (start != alloc_start)
+		free_contig_range(alloc_start, start - alloc_start);
+	if (end != alloc_end)
+		free_contig_range(end, alloc_end - end);
 
 done:
-	undo_isolate_page_range(pfn_max_align_down(start),
-				pfn_max_align_up(end), migratetype);
+	undo_isolate_page_range(alloc_start,
+				alloc_end, migratetype);
 	return ret;
 }
 EXPORT_SYMBOL(alloc_contig_range);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 64d093ab83ec..0256d5e1032c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -285,6 +285,8 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  *					 and PageOffline() pages.
  *			REPORT_FAILURE - report details about the failure to
  *			isolate the range
+ * @gfp_flags:		GFP flags used for migrating pages that sit across the
+ *			range boundaries.
  *
  * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
  * the range will never be allocated. Any free pages and pages freed in the
@@ -293,6 +295,10 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * pages in the range finally, the caller have to free all pages in the range.
  * test_page_isolated() can be used for test it.
  *
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
  * There is no high level synchronization mechanism that prevents two threads
  * from trying to isolate overlapping ranges. If this happens, one thread
  * will notice pageblocks in the overlapping range already set to isolate.
@@ -313,21 +319,38 @@  __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
  */
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			     unsigned migratetype, int flags)
+			     unsigned migratetype, int flags, gfp_t gfp_flags)
 {
 	unsigned long pfn;
 	struct page *page;
+	/* isolation is done at page block granularity */
+	unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
+	unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
+	int ret;
 
-	unsigned long isolate_start = pfn_max_align_down(start_pfn);
-	unsigned long isolate_end = pfn_max_align_up(end_pfn);
+	/* isolate [isolate_start, isolate_start + pageblock_nr_pages] pageblock */
+	ret = isolate_single_pageblock(isolate_start, gfp_flags, 0);
+	if (ret)
+		return ret;
+
+	/* isolate [isolate_end - pageblock_nr_pages, isolate_end] pageblock */
+	ret = isolate_single_pageblock(isolate_end, gfp_flags, 1);
+	if (ret) {
+		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+		return ret;
+	}
 
-	for (pfn = isolate_start;
-	     pfn < isolate_end;
+	/* skip isolated pageblocks at the beginning and end */
+	for (pfn = isolate_start + pageblock_nr_pages;
+	     pfn < isolate_end - pageblock_nr_pages;
 	     pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
 		if (page && set_migratetype_isolate(page, migratetype, flags,
 					start_pfn, end_pfn)) {
 			undo_isolate_page_range(isolate_start, pfn, migratetype);
+			unset_migratetype_isolate(
+				pfn_to_page(isolate_end - pageblock_nr_pages),
+				migratetype);
 			return -EBUSY;
 		}
 	}